xref: /petsc/src/mat/impls/aij/seq/inode.c (revision c4762a1b19cd2af06abeed90e8f9d34fb975dd94)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9 {
10   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
11   PetscErrorCode ierr;
12   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
13 
14   PetscFunctionBegin;
15   n      = A->cmap->n;
16   m      = A->rmap->n;
17   ns_row = a->inode.size;
18 
19   min_mn = (m < n) ? m : n;
20   if (!ns) {
21     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
22     for (; count+1 < n; count++,i++) ;
23     if (count < n)  {
24       i++;
25     }
26     *size = i;
27     PetscFunctionReturn(0);
28   }
29   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
30 
31   /* Use the same row structure wherever feasible. */
32   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
33     ns_col[i] = ns_row[i];
34   }
35 
36   /* if m < n; pad up the remainder with inode_limit */
37   for (; count+1 < n; count++,i++) {
38     ns_col[i] = 1;
39   }
40   /* The last node is the odd ball. padd it up with the remaining rows; */
41   if (count < n) {
42     ns_col[i] = n - count;
43     i++;
44   } else if (count > n) {
45     /* Adjust for the over estimation */
46     ns_col[i-1] += n - count;
47   }
48   *size = i;
49   *ns   = ns_col;
50   PetscFunctionReturn(0);
51 }
52 
53 
54 /*
55       This builds symmetric version of nonzero structure,
56 */
57 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
58 {
59   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
60   PetscErrorCode ierr;
61   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
62   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
63   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
64 
65   PetscFunctionBegin;
66   nslim_row = a->inode.node_count;
67   m         = A->rmap->n;
68   n         = A->cmap->n;
69   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
70 
71   /* Use the row_inode as column_inode */
72   nslim_col = nslim_row;
73   ns_col    = ns_row;
74 
75   /* allocate space for reformated inode structure */
76   ierr = PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);CHKERRQ(ierr);
77   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
78 
79   for (i1=0,col=0; i1<nslim_col; ++i1) {
80     nsz = ns_col[i1];
81     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
82   }
83   /* allocate space for row pointers */
84   ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
85   *iia = ia;
86   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
87 
88   /* determine the number of columns in each row */
89   ia[0] = oshift;
90   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
91 
92     j    = aj + ai[row] + ishift;
93     jmax = aj + ai[row+1] + ishift;
94     if (j==jmax) continue; /* empty row */
95     col  = *j++ + ishift;
96     i2   = tvc[col];
97     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
98       ia[i1+1]++;
99       ia[i2+1]++;
100       i2++;                     /* Start col of next node */
101       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
102       i2 = tvc[col];
103     }
104     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
105   }
106 
107   /* shift ia[i] to point to next row */
108   for (i1=1; i1<nslim_row+1; i1++) {
109     row        = ia[i1-1];
110     ia[i1]    += row;
111     work[i1-1] = row - oshift;
112   }
113 
114   /* allocate space for column pointers */
115   nz   = ia[nslim_row] + (!ishift);
116   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
117   *jja = ja;
118 
119   /* loop over lower triangular part putting into ja */
120   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
121     j    = aj + ai[row] + ishift;
122     jmax = aj + ai[row+1] + ishift;
123     if (j==jmax) continue; /* empty row */
124     col  = *j++ + ishift;
125     i2   = tvc[col];
126     while (i2<i1 && j<jmax) {
127       ja[work[i2]++] = i1 + oshift;
128       ja[work[i1]++] = i2 + oshift;
129       ++i2;
130       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
131       i2 = tvc[col];
132     }
133     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
134 
135   }
136   ierr = PetscFree(work);CHKERRQ(ierr);
137   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
138   PetscFunctionReturn(0);
139 }
140 
141 /*
142       This builds nonsymmetric version of nonzero structure,
143 */
144 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
145 {
146   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
147   PetscErrorCode ierr;
148   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
149   PetscInt       *tns,*tvc,nsz,i1,i2;
150   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
151 
152   PetscFunctionBegin;
153   nslim_row = a->inode.node_count;
154   n         = A->cmap->n;
155 
156   /* Create The column_inode for this matrix */
157   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
158 
159   /* allocate space for reformated column_inode structure */
160   ierr = PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);CHKERRQ(ierr);
161   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
162 
163   for (i1=0,col=0; i1<nslim_col; ++i1) {
164     nsz = ns_col[i1];
165     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
166   }
167   /* allocate space for row pointers */
168   ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
169   *iia = ia;
170   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
171 
172   /* determine the number of columns in each row */
173   ia[0] = oshift;
174   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
175     j   = aj + ai[row] + ishift;
176     nz  = ai[row+1] - ai[row];
177     if (!nz) continue; /* empty row */
178     col = *j++ + ishift;
179     i2  = tvc[col];
180     while (nz-- > 0) {           /* off-diagonal elemets */
181       ia[i1+1]++;
182       i2++;                     /* Start col of next node */
183       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
184       if (nz > 0) i2 = tvc[col];
185     }
186   }
187 
188   /* shift ia[i] to point to next row */
189   for (i1=1; i1<nslim_row+1; i1++) {
190     row        = ia[i1-1];
191     ia[i1]    += row;
192     work[i1-1] = row - oshift;
193   }
194 
195   /* allocate space for column pointers */
196   nz   = ia[nslim_row] + (!ishift);
197   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
198   *jja = ja;
199 
200   /* loop over matrix putting into ja */
201   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
202     j   = aj + ai[row] + ishift;
203     nz  = ai[row+1] - ai[row];
204     if (!nz) continue; /* empty row */
205     col = *j++ + ishift;
206     i2  = tvc[col];
207     while (nz-- > 0) {
208       ja[work[i1]++] = i2 + oshift;
209       ++i2;
210       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
211       if (nz > 0) i2 = tvc[col];
212     }
213   }
214   ierr = PetscFree(ns_col);CHKERRQ(ierr);
215   ierr = PetscFree(work);CHKERRQ(ierr);
216   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
217   PetscFunctionReturn(0);
218 }
219 
220 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
221 {
222   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
223   PetscErrorCode ierr;
224 
225   PetscFunctionBegin;
226   *n = a->inode.node_count;
227   if (!ia) PetscFunctionReturn(0);
228   if (!blockcompressed) {
229     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
230   } else if (symmetric) {
231     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
232   } else {
233     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
234   }
235   PetscFunctionReturn(0);
236 }
237 
238 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
239 {
240   PetscErrorCode ierr;
241 
242   PetscFunctionBegin;
243   if (!ia) PetscFunctionReturn(0);
244 
245   if (!blockcompressed) {
246     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
247   } else {
248     ierr = PetscFree(*ia);CHKERRQ(ierr);
249     ierr = PetscFree(*ja);CHKERRQ(ierr);
250   }
251   PetscFunctionReturn(0);
252 }
253 
254 /* ----------------------------------------------------------- */
255 
256 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
257 {
258   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
259   PetscErrorCode ierr;
260   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
261   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
262 
263   PetscFunctionBegin;
264   nslim_row = a->inode.node_count;
265   n         = A->cmap->n;
266 
267   /* Create The column_inode for this matrix */
268   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
269 
270   /* allocate space for reformated column_inode structure */
271   ierr = PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);CHKERRQ(ierr);
272   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
273 
274   for (i1=0,col=0; i1<nslim_col; ++i1) {
275     nsz = ns_col[i1];
276     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
277   }
278   /* allocate space for column pointers */
279   ierr = PetscCalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
280   *iia = ia;
281   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
282 
283   /* determine the number of columns in each row */
284   ia[0] = oshift;
285   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
286     j   = aj + ai[row] + ishift;
287     col = *j++ + ishift;
288     i2  = tvc[col];
289     nz  = ai[row+1] - ai[row];
290     while (nz-- > 0) {           /* off-diagonal elemets */
291       /* ia[i1+1]++; */
292       ia[i2+1]++;
293       i2++;
294       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
295       if (nz > 0) i2 = tvc[col];
296     }
297   }
298 
299   /* shift ia[i] to point to next col */
300   for (i1=1; i1<nslim_col+1; i1++) {
301     col        = ia[i1-1];
302     ia[i1]    += col;
303     work[i1-1] = col - oshift;
304   }
305 
306   /* allocate space for column pointers */
307   nz   = ia[nslim_col] + (!ishift);
308   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
309   *jja = ja;
310 
311   /* loop over matrix putting into ja */
312   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
313     j   = aj + ai[row] + ishift;
314     col = *j++ + ishift;
315     i2  = tvc[col];
316     nz  = ai[row+1] - ai[row];
317     while (nz-- > 0) {
318       /* ja[work[i1]++] = i2 + oshift; */
319       ja[work[i2]++] = i1 + oshift;
320       i2++;
321       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
322       if (nz > 0) i2 = tvc[col];
323     }
324   }
325   ierr = PetscFree(ns_col);CHKERRQ(ierr);
326   ierr = PetscFree(work);CHKERRQ(ierr);
327   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
328   PetscFunctionReturn(0);
329 }
330 
331 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
332 {
333   PetscErrorCode ierr;
334 
335   PetscFunctionBegin;
336   ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr);
337   if (!ia) PetscFunctionReturn(0);
338 
339   if (!blockcompressed) {
340     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
341   } else if (symmetric) {
342     /* Since the indices are symmetric it does'nt matter */
343     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
344   } else {
345     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
346   }
347   PetscFunctionReturn(0);
348 }
349 
350 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   if (!ia) PetscFunctionReturn(0);
356   if (!blockcompressed) {
357     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
358   } else {
359     ierr = PetscFree(*ia);CHKERRQ(ierr);
360     ierr = PetscFree(*ja);CHKERRQ(ierr);
361   }
362   PetscFunctionReturn(0);
363 }
364 
365 /* ----------------------------------------------------------- */
366 
367 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
368 {
369   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
370   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
371   PetscScalar       *y;
372   const PetscScalar *x;
373   const MatScalar   *v1,*v2,*v3,*v4,*v5;
374   PetscErrorCode    ierr;
375   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
376   const PetscInt    *idx,*ns,*ii;
377 
378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
380 #endif
381 
382   PetscFunctionBegin;
383   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
384   node_max = a->inode.node_count;
385   ns       = a->inode.size;     /* Node Size array */
386   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
387   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
388   idx      = a->j;
389   v1       = a->a;
390   ii       = a->i;
391 
392   for (i = 0,row = 0; i< node_max; ++i) {
393     nsz         = ns[i];
394     n           = ii[1] - ii[0];
395     nonzerorow += (n>0)*nsz;
396     ii         += nsz;
397     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
398     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
399     sz = n;                     /* No of non zeros in this row */
400                                 /* Switch on the size of Node */
401     switch (nsz) {               /* Each loop in 'case' is unrolled */
402     case 1:
403       sum1 = 0.;
404 
405       for (n = 0; n< sz-1; n+=2) {
406         i1    = idx[0];         /* The instructions are ordered to */
407         i2    = idx[1];         /* make the compiler's job easy */
408         idx  += 2;
409         tmp0  = x[i1];
410         tmp1  = x[i2];
411         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
412       }
413 
414       if (n == sz-1) {          /* Take care of the last nonzero  */
415         tmp0  = x[*idx++];
416         sum1 += *v1++ *tmp0;
417       }
418       y[row++]=sum1;
419       break;
420     case 2:
421       sum1 = 0.;
422       sum2 = 0.;
423       v2   = v1 + n;
424 
425       for (n = 0; n< sz-1; n+=2) {
426         i1    = idx[0];
427         i2    = idx[1];
428         idx  += 2;
429         tmp0  = x[i1];
430         tmp1  = x[i2];
431         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
432         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
433       }
434       if (n == sz-1) {
435         tmp0  = x[*idx++];
436         sum1 += *v1++ * tmp0;
437         sum2 += *v2++ * tmp0;
438       }
439       y[row++]=sum1;
440       y[row++]=sum2;
441       v1      =v2;              /* Since the next block to be processed starts there*/
442       idx    +=sz;
443       break;
444     case 3:
445       sum1 = 0.;
446       sum2 = 0.;
447       sum3 = 0.;
448       v2   = v1 + n;
449       v3   = v2 + n;
450 
451       for (n = 0; n< sz-1; n+=2) {
452         i1    = idx[0];
453         i2    = idx[1];
454         idx  += 2;
455         tmp0  = x[i1];
456         tmp1  = x[i2];
457         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
458         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
459         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
460       }
461       if (n == sz-1) {
462         tmp0  = x[*idx++];
463         sum1 += *v1++ * tmp0;
464         sum2 += *v2++ * tmp0;
465         sum3 += *v3++ * tmp0;
466       }
467       y[row++]=sum1;
468       y[row++]=sum2;
469       y[row++]=sum3;
470       v1      =v3;              /* Since the next block to be processed starts there*/
471       idx    +=2*sz;
472       break;
473     case 4:
474       sum1 = 0.;
475       sum2 = 0.;
476       sum3 = 0.;
477       sum4 = 0.;
478       v2   = v1 + n;
479       v3   = v2 + n;
480       v4   = v3 + n;
481 
482       for (n = 0; n< sz-1; n+=2) {
483         i1    = idx[0];
484         i2    = idx[1];
485         idx  += 2;
486         tmp0  = x[i1];
487         tmp1  = x[i2];
488         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
489         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
490         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
491         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
492       }
493       if (n == sz-1) {
494         tmp0  = x[*idx++];
495         sum1 += *v1++ * tmp0;
496         sum2 += *v2++ * tmp0;
497         sum3 += *v3++ * tmp0;
498         sum4 += *v4++ * tmp0;
499       }
500       y[row++]=sum1;
501       y[row++]=sum2;
502       y[row++]=sum3;
503       y[row++]=sum4;
504       v1      =v4;              /* Since the next block to be processed starts there*/
505       idx    +=3*sz;
506       break;
507     case 5:
508       sum1 = 0.;
509       sum2 = 0.;
510       sum3 = 0.;
511       sum4 = 0.;
512       sum5 = 0.;
513       v2   = v1 + n;
514       v3   = v2 + n;
515       v4   = v3 + n;
516       v5   = v4 + n;
517 
518       for (n = 0; n<sz-1; n+=2) {
519         i1    = idx[0];
520         i2    = idx[1];
521         idx  += 2;
522         tmp0  = x[i1];
523         tmp1  = x[i2];
524         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
525         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
526         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
527         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
528         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
529       }
530       if (n == sz-1) {
531         tmp0  = x[*idx++];
532         sum1 += *v1++ * tmp0;
533         sum2 += *v2++ * tmp0;
534         sum3 += *v3++ * tmp0;
535         sum4 += *v4++ * tmp0;
536         sum5 += *v5++ * tmp0;
537       }
538       y[row++]=sum1;
539       y[row++]=sum2;
540       y[row++]=sum3;
541       y[row++]=sum4;
542       y[row++]=sum5;
543       v1      =v5;       /* Since the next block to be processed starts there */
544       idx    +=4*sz;
545       break;
546     default:
547       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
548     }
549   }
550   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
551   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
552   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
553   PetscFunctionReturn(0);
554 }
555 /* ----------------------------------------------------------- */
556 /* Almost same code as the MatMult_SeqAIJ_Inode() */
557 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
558 {
559   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
560   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
561   const MatScalar   *v1,*v2,*v3,*v4,*v5;
562   const PetscScalar *x;
563   PetscScalar       *y,*z,*zt;
564   PetscErrorCode    ierr;
565   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
566   const PetscInt    *idx,*ns,*ii;
567 
568   PetscFunctionBegin;
569   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
570   node_max = a->inode.node_count;
571   ns       = a->inode.size;     /* Node Size array */
572 
573   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
575   zt = z;
576 
577   idx = a->j;
578   v1  = a->a;
579   ii  = a->i;
580 
581   for (i = 0,row = 0; i< node_max; ++i) {
582     nsz = ns[i];
583     n   = ii[1] - ii[0];
584     ii += nsz;
585     sz  = n;                    /* No of non zeros in this row */
586                                 /* Switch on the size of Node */
587     switch (nsz) {               /* Each loop in 'case' is unrolled */
588     case 1:
589       sum1 = *zt++;
590 
591       for (n = 0; n< sz-1; n+=2) {
592         i1    = idx[0];         /* The instructions are ordered to */
593         i2    = idx[1];         /* make the compiler's job easy */
594         idx  += 2;
595         tmp0  = x[i1];
596         tmp1  = x[i2];
597         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
598       }
599 
600       if (n   == sz-1) {          /* Take care of the last nonzero  */
601         tmp0  = x[*idx++];
602         sum1 += *v1++ * tmp0;
603       }
604       y[row++]=sum1;
605       break;
606     case 2:
607       sum1 = *zt++;
608       sum2 = *zt++;
609       v2   = v1 + n;
610 
611       for (n = 0; n< sz-1; n+=2) {
612         i1    = idx[0];
613         i2    = idx[1];
614         idx  += 2;
615         tmp0  = x[i1];
616         tmp1  = x[i2];
617         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
618         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
619       }
620       if (n   == sz-1) {
621         tmp0  = x[*idx++];
622         sum1 += *v1++ * tmp0;
623         sum2 += *v2++ * tmp0;
624       }
625       y[row++]=sum1;
626       y[row++]=sum2;
627       v1      =v2;              /* Since the next block to be processed starts there*/
628       idx    +=sz;
629       break;
630     case 3:
631       sum1 = *zt++;
632       sum2 = *zt++;
633       sum3 = *zt++;
634       v2   = v1 + n;
635       v3   = v2 + n;
636 
637       for (n = 0; n< sz-1; n+=2) {
638         i1    = idx[0];
639         i2    = idx[1];
640         idx  += 2;
641         tmp0  = x[i1];
642         tmp1  = x[i2];
643         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
644         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
645         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
646       }
647       if (n == sz-1) {
648         tmp0  = x[*idx++];
649         sum1 += *v1++ * tmp0;
650         sum2 += *v2++ * tmp0;
651         sum3 += *v3++ * tmp0;
652       }
653       y[row++]=sum1;
654       y[row++]=sum2;
655       y[row++]=sum3;
656       v1      =v3;              /* Since the next block to be processed starts there*/
657       idx    +=2*sz;
658       break;
659     case 4:
660       sum1 = *zt++;
661       sum2 = *zt++;
662       sum3 = *zt++;
663       sum4 = *zt++;
664       v2   = v1 + n;
665       v3   = v2 + n;
666       v4   = v3 + n;
667 
668       for (n = 0; n< sz-1; n+=2) {
669         i1    = idx[0];
670         i2    = idx[1];
671         idx  += 2;
672         tmp0  = x[i1];
673         tmp1  = x[i2];
674         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
675         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
676         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
677         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
678       }
679       if (n == sz-1) {
680         tmp0  = x[*idx++];
681         sum1 += *v1++ * tmp0;
682         sum2 += *v2++ * tmp0;
683         sum3 += *v3++ * tmp0;
684         sum4 += *v4++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       y[row++]=sum4;
690       v1      =v4;              /* Since the next block to be processed starts there*/
691       idx    +=3*sz;
692       break;
693     case 5:
694       sum1 = *zt++;
695       sum2 = *zt++;
696       sum3 = *zt++;
697       sum4 = *zt++;
698       sum5 = *zt++;
699       v2   = v1 + n;
700       v3   = v2 + n;
701       v4   = v3 + n;
702       v5   = v4 + n;
703 
704       for (n = 0; n<sz-1; n+=2) {
705         i1    = idx[0];
706         i2    = idx[1];
707         idx  += 2;
708         tmp0  = x[i1];
709         tmp1  = x[i2];
710         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
711         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
712         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
713         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
714         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
715       }
716       if (n   == sz-1) {
717         tmp0  = x[*idx++];
718         sum1 += *v1++ * tmp0;
719         sum2 += *v2++ * tmp0;
720         sum3 += *v3++ * tmp0;
721         sum4 += *v4++ * tmp0;
722         sum5 += *v5++ * tmp0;
723       }
724       y[row++]=sum1;
725       y[row++]=sum2;
726       y[row++]=sum3;
727       y[row++]=sum4;
728       y[row++]=sum5;
729       v1      =v5;       /* Since the next block to be processed starts there */
730       idx    +=4*sz;
731       break;
732     default:
733       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
734     }
735   }
736   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
737   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
738   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
739   PetscFunctionReturn(0);
740 }
741 
742 /* ----------------------------------------------------------- */
743 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
744 {
745   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
746   IS                iscol = a->col,isrow = a->row;
747   PetscErrorCode    ierr;
748   const PetscInt    *r,*c,*rout,*cout;
749   PetscInt          i,j,n = A->rmap->n,nz;
750   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
751   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
752   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
753   PetscScalar       sum1,sum2,sum3,sum4,sum5;
754   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
755   const PetscScalar *b;
756 
757   PetscFunctionBegin;
758   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
759   node_max = a->inode.node_count;
760   ns       = a->inode.size;     /* Node Size array */
761 
762   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
763   ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr);
764   tmp  = a->solve_work;
765 
766   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
767   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
768 
769   /* forward solve the lower triangular */
770   tmps = tmp;
771   aa   = a_a;
772   aj   = a_j;
773   ad   = a->diag;
774 
775   for (i = 0,row = 0; i< node_max; ++i) {
776     nsz = ns[i];
777     aii = ai[row];
778     v1  = aa + aii;
779     vi  = aj + aii;
780     nz  = ad[row]- aii;
781     if (i < node_max-1) {
782       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
783       * but our indexing to determine it's size could. */
784       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
785       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
786       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
787       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
788     }
789 
790     switch (nsz) {               /* Each loop in 'case' is unrolled */
791     case 1:
792       sum1 = b[*r++];
793       for (j=0; j<nz-1; j+=2) {
794         i0    = vi[0];
795         i1    = vi[1];
796         vi   +=2;
797         tmp0  = tmps[i0];
798         tmp1  = tmps[i1];
799         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
800       }
801       if (j == nz-1) {
802         tmp0  = tmps[*vi++];
803         sum1 -= *v1++ *tmp0;
804       }
805       tmp[row++]=sum1;
806       break;
807     case 2:
808       sum1 = b[*r++];
809       sum2 = b[*r++];
810       v2   = aa + ai[row+1];
811 
812       for (j=0; j<nz-1; j+=2) {
813         i0    = vi[0];
814         i1    = vi[1];
815         vi   +=2;
816         tmp0  = tmps[i0];
817         tmp1  = tmps[i1];
818         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
819         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
820       }
821       if (j == nz-1) {
822         tmp0  = tmps[*vi++];
823         sum1 -= *v1++ *tmp0;
824         sum2 -= *v2++ *tmp0;
825       }
826       sum2     -= *v2++ *sum1;
827       tmp[row++]=sum1;
828       tmp[row++]=sum2;
829       break;
830     case 3:
831       sum1 = b[*r++];
832       sum2 = b[*r++];
833       sum3 = b[*r++];
834       v2   = aa + ai[row+1];
835       v3   = aa + ai[row+2];
836 
837       for (j=0; j<nz-1; j+=2) {
838         i0    = vi[0];
839         i1    = vi[1];
840         vi   +=2;
841         tmp0  = tmps[i0];
842         tmp1  = tmps[i1];
843         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
844         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
845         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
846       }
847       if (j == nz-1) {
848         tmp0  = tmps[*vi++];
849         sum1 -= *v1++ *tmp0;
850         sum2 -= *v2++ *tmp0;
851         sum3 -= *v3++ *tmp0;
852       }
853       sum2 -= *v2++ * sum1;
854       sum3 -= *v3++ * sum1;
855       sum3 -= *v3++ * sum2;
856 
857       tmp[row++]=sum1;
858       tmp[row++]=sum2;
859       tmp[row++]=sum3;
860       break;
861 
862     case 4:
863       sum1 = b[*r++];
864       sum2 = b[*r++];
865       sum3 = b[*r++];
866       sum4 = b[*r++];
867       v2   = aa + ai[row+1];
868       v3   = aa + ai[row+2];
869       v4   = aa + ai[row+3];
870 
871       for (j=0; j<nz-1; j+=2) {
872         i0    = vi[0];
873         i1    = vi[1];
874         vi   +=2;
875         tmp0  = tmps[i0];
876         tmp1  = tmps[i1];
877         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
878         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
879         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
880         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
881       }
882       if (j == nz-1) {
883         tmp0  = tmps[*vi++];
884         sum1 -= *v1++ *tmp0;
885         sum2 -= *v2++ *tmp0;
886         sum3 -= *v3++ *tmp0;
887         sum4 -= *v4++ *tmp0;
888       }
889       sum2 -= *v2++ * sum1;
890       sum3 -= *v3++ * sum1;
891       sum4 -= *v4++ * sum1;
892       sum3 -= *v3++ * sum2;
893       sum4 -= *v4++ * sum2;
894       sum4 -= *v4++ * sum3;
895 
896       tmp[row++]=sum1;
897       tmp[row++]=sum2;
898       tmp[row++]=sum3;
899       tmp[row++]=sum4;
900       break;
901     case 5:
902       sum1 = b[*r++];
903       sum2 = b[*r++];
904       sum3 = b[*r++];
905       sum4 = b[*r++];
906       sum5 = b[*r++];
907       v2   = aa + ai[row+1];
908       v3   = aa + ai[row+2];
909       v4   = aa + ai[row+3];
910       v5   = aa + ai[row+4];
911 
912       for (j=0; j<nz-1; j+=2) {
913         i0    = vi[0];
914         i1    = vi[1];
915         vi   +=2;
916         tmp0  = tmps[i0];
917         tmp1  = tmps[i1];
918         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
919         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
920         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
921         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
922         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
923       }
924       if (j == nz-1) {
925         tmp0  = tmps[*vi++];
926         sum1 -= *v1++ *tmp0;
927         sum2 -= *v2++ *tmp0;
928         sum3 -= *v3++ *tmp0;
929         sum4 -= *v4++ *tmp0;
930         sum5 -= *v5++ *tmp0;
931       }
932 
933       sum2 -= *v2++ * sum1;
934       sum3 -= *v3++ * sum1;
935       sum4 -= *v4++ * sum1;
936       sum5 -= *v5++ * sum1;
937       sum3 -= *v3++ * sum2;
938       sum4 -= *v4++ * sum2;
939       sum5 -= *v5++ * sum2;
940       sum4 -= *v4++ * sum3;
941       sum5 -= *v5++ * sum3;
942       sum5 -= *v5++ * sum4;
943 
944       tmp[row++]=sum1;
945       tmp[row++]=sum2;
946       tmp[row++]=sum3;
947       tmp[row++]=sum4;
948       tmp[row++]=sum5;
949       break;
950     default:
951       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
952     }
953   }
954   /* backward solve the upper triangular */
955   for (i=node_max -1,row = n-1; i>=0; i--) {
956     nsz = ns[i];
957     aii = ai[row+1] -1;
958     v1  = aa + aii;
959     vi  = aj + aii;
960     nz  = aii- ad[row];
961     switch (nsz) {               /* Each loop in 'case' is unrolled */
962     case 1:
963       sum1 = tmp[row];
964 
965       for (j=nz; j>1; j-=2) {
966         vi   -=2;
967         i0    = vi[2];
968         i1    = vi[1];
969         tmp0  = tmps[i0];
970         tmp1  = tmps[i1];
971         v1   -= 2;
972         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
973       }
974       if (j==1) {
975         tmp0  = tmps[*vi--];
976         sum1 -= *v1-- * tmp0;
977       }
978       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
979       break;
980     case 2:
981       sum1 = tmp[row];
982       sum2 = tmp[row -1];
983       v2   = aa + ai[row]-1;
984       for (j=nz; j>1; j-=2) {
985         vi   -=2;
986         i0    = vi[2];
987         i1    = vi[1];
988         tmp0  = tmps[i0];
989         tmp1  = tmps[i1];
990         v1   -= 2;
991         v2   -= 2;
992         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
993         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
994       }
995       if (j==1) {
996         tmp0  = tmps[*vi--];
997         sum1 -= *v1-- * tmp0;
998         sum2 -= *v2-- * tmp0;
999       }
1000 
1001       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1002       sum2   -= *v2-- * tmp0;
1003       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1004       break;
1005     case 3:
1006       sum1 = tmp[row];
1007       sum2 = tmp[row -1];
1008       sum3 = tmp[row -2];
1009       v2   = aa + ai[row]-1;
1010       v3   = aa + ai[row -1]-1;
1011       for (j=nz; j>1; j-=2) {
1012         vi   -=2;
1013         i0    = vi[2];
1014         i1    = vi[1];
1015         tmp0  = tmps[i0];
1016         tmp1  = tmps[i1];
1017         v1   -= 2;
1018         v2   -= 2;
1019         v3   -= 2;
1020         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1021         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1022         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1023       }
1024       if (j==1) {
1025         tmp0  = tmps[*vi--];
1026         sum1 -= *v1-- * tmp0;
1027         sum2 -= *v2-- * tmp0;
1028         sum3 -= *v3-- * tmp0;
1029       }
1030       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1031       sum2   -= *v2-- * tmp0;
1032       sum3   -= *v3-- * tmp0;
1033       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1034       sum3   -= *v3-- * tmp0;
1035       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1036 
1037       break;
1038     case 4:
1039       sum1 = tmp[row];
1040       sum2 = tmp[row -1];
1041       sum3 = tmp[row -2];
1042       sum4 = tmp[row -3];
1043       v2   = aa + ai[row]-1;
1044       v3   = aa + ai[row -1]-1;
1045       v4   = aa + ai[row -2]-1;
1046 
1047       for (j=nz; j>1; j-=2) {
1048         vi   -=2;
1049         i0    = vi[2];
1050         i1    = vi[1];
1051         tmp0  = tmps[i0];
1052         tmp1  = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         v4   -= 2;
1057         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1058         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1059         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1060         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1061       }
1062       if (j==1) {
1063         tmp0  = tmps[*vi--];
1064         sum1 -= *v1-- * tmp0;
1065         sum2 -= *v2-- * tmp0;
1066         sum3 -= *v3-- * tmp0;
1067         sum4 -= *v4-- * tmp0;
1068       }
1069 
1070       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1071       sum2   -= *v2-- * tmp0;
1072       sum3   -= *v3-- * tmp0;
1073       sum4   -= *v4-- * tmp0;
1074       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1075       sum3   -= *v3-- * tmp0;
1076       sum4   -= *v4-- * tmp0;
1077       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1078       sum4   -= *v4-- * tmp0;
1079       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1080       break;
1081     case 5:
1082       sum1 = tmp[row];
1083       sum2 = tmp[row -1];
1084       sum3 = tmp[row -2];
1085       sum4 = tmp[row -3];
1086       sum5 = tmp[row -4];
1087       v2   = aa + ai[row]-1;
1088       v3   = aa + ai[row -1]-1;
1089       v4   = aa + ai[row -2]-1;
1090       v5   = aa + ai[row -3]-1;
1091       for (j=nz; j>1; j-=2) {
1092         vi   -= 2;
1093         i0    = vi[2];
1094         i1    = vi[1];
1095         tmp0  = tmps[i0];
1096         tmp1  = tmps[i1];
1097         v1   -= 2;
1098         v2   -= 2;
1099         v3   -= 2;
1100         v4   -= 2;
1101         v5   -= 2;
1102         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1103         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1104         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1105         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1106         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1107       }
1108       if (j==1) {
1109         tmp0  = tmps[*vi--];
1110         sum1 -= *v1-- * tmp0;
1111         sum2 -= *v2-- * tmp0;
1112         sum3 -= *v3-- * tmp0;
1113         sum4 -= *v4-- * tmp0;
1114         sum5 -= *v5-- * tmp0;
1115       }
1116 
1117       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1118       sum2   -= *v2-- * tmp0;
1119       sum3   -= *v3-- * tmp0;
1120       sum4   -= *v4-- * tmp0;
1121       sum5   -= *v5-- * tmp0;
1122       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1123       sum3   -= *v3-- * tmp0;
1124       sum4   -= *v4-- * tmp0;
1125       sum5   -= *v5-- * tmp0;
1126       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1127       sum4   -= *v4-- * tmp0;
1128       sum5   -= *v5-- * tmp0;
1129       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1130       sum5   -= *v5-- * tmp0;
1131       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1132       break;
1133     default:
1134       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1135     }
1136   }
1137   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1138   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1139   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1140   ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr);
1141   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1146 {
1147   Mat             C     =B;
1148   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1149   IS              isrow = b->row,isicol = b->icol;
1150   PetscErrorCode  ierr;
1151   const PetscInt  *r,*ic,*ics;
1152   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1153   PetscInt        i,j,k,nz,nzL,row,*pj;
1154   const PetscInt  *ajtmp,*bjtmp;
1155   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1156   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1157   FactorShiftCtx  sctx;
1158   const PetscInt  *ddiag;
1159   PetscReal       rs;
1160   MatScalar       d;
1161   PetscInt        inod,nodesz,node_max,col;
1162   const PetscInt  *ns;
1163   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1164 
1165   PetscFunctionBegin;
1166   /* MatPivotSetUp(): initialize shift context sctx */
1167   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1168 
1169   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1170     ddiag          = a->diag;
1171     sctx.shift_top = info->zeropivot;
1172     for (i=0; i<n; i++) {
1173       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1174       d  = (aa)[ddiag[i]];
1175       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1176       v  = aa+ai[i];
1177       nz = ai[i+1] - ai[i];
1178       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1179       if (rs>sctx.shift_top) sctx.shift_top = rs;
1180     }
1181     sctx.shift_top *= 1.1;
1182     sctx.nshift_max = 5;
1183     sctx.shift_lo   = 0.;
1184     sctx.shift_hi   = 1.;
1185   }
1186 
1187   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1188   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1189 
1190   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1191   ics   = ic;
1192 
1193   node_max = a->inode.node_count;
1194   ns       = a->inode.size;
1195   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1196 
1197   /* If max inode size > 4, split it into two inodes.*/
1198   /* also map the inode sizes according to the ordering */
1199   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1200   for (i=0,j=0; i<node_max; ++i,++j) {
1201     if (ns[i] > 4) {
1202       tmp_vec1[j] = 4;
1203       ++j;
1204       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1205     } else {
1206       tmp_vec1[j] = ns[i];
1207     }
1208   }
1209   /* Use the correct node_max */
1210   node_max = j;
1211 
1212   /* Now reorder the inode info based on mat re-ordering info */
1213   /* First create a row -> inode_size_array_index map */
1214   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1215   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1216   for (i=0,row=0; i<node_max; i++) {
1217     nodesz = tmp_vec1[i];
1218     for (j=0; j<nodesz; j++,row++) {
1219       nsmap[row] = i;
1220     }
1221   }
1222   /* Using nsmap, create a reordered ns structure */
1223   for (i=0,j=0; i< node_max; i++) {
1224     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1225     tmp_vec2[i] = nodesz;
1226     j          += nodesz;
1227   }
1228   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1229   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1230 
1231   /* Now use the correct ns */
1232   ns = tmp_vec2;
1233 
1234   do {
1235     sctx.newshift = PETSC_FALSE;
1236     /* Now loop over each block-row, and do the factorization */
1237     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1238       nodesz = ns[inod];
1239 
1240       switch (nodesz) {
1241       case 1:
1242         /*----------*/
1243         /* zero rtmp1 */
1244         /* L part */
1245         nz    = bi[i+1] - bi[i];
1246         bjtmp = bj + bi[i];
1247         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1248 
1249         /* U part */
1250         nz    = bdiag[i]-bdiag[i+1];
1251         bjtmp = bj + bdiag[i+1]+1;
1252         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1253 
1254         /* load in initial (unfactored row) */
1255         nz    = ai[r[i]+1] - ai[r[i]];
1256         ajtmp = aj + ai[r[i]];
1257         v     = aa + ai[r[i]];
1258         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1259 
1260         /* ZeropivotApply() */
1261         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1262 
1263         /* elimination */
1264         bjtmp = bj + bi[i];
1265         row   = *bjtmp++;
1266         nzL   = bi[i+1] - bi[i];
1267         for (k=0; k < nzL; k++) {
1268           pc = rtmp1 + row;
1269           if (*pc != 0.0) {
1270             pv   = b->a + bdiag[row];
1271             mul1 = *pc * (*pv);
1272             *pc  = mul1;
1273             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1274             pv   = b->a + bdiag[row+1]+1;
1275             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1276             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1277             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1278           }
1279           row = *bjtmp++;
1280         }
1281 
1282         /* finished row so stick it into b->a */
1283         rs = 0.0;
1284         /* L part */
1285         pv = b->a + bi[i];
1286         pj = b->j + bi[i];
1287         nz = bi[i+1] - bi[i];
1288         for (j=0; j<nz; j++) {
1289           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1290         }
1291 
1292         /* U part */
1293         pv = b->a + bdiag[i+1]+1;
1294         pj = b->j + bdiag[i+1]+1;
1295         nz = bdiag[i] - bdiag[i+1]-1;
1296         for (j=0; j<nz; j++) {
1297           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1298         }
1299 
1300         /* Check zero pivot */
1301         sctx.rs = rs;
1302         sctx.pv = rtmp1[i];
1303         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1304         if (sctx.newshift) break;
1305 
1306         /* Mark diagonal and invert diagonal for simplier triangular solves */
1307         pv  = b->a + bdiag[i];
1308         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1309         break;
1310 
1311       case 2:
1312         /*----------*/
1313         /* zero rtmp1 and rtmp2 */
1314         /* L part */
1315         nz    = bi[i+1] - bi[i];
1316         bjtmp = bj + bi[i];
1317         for  (j=0; j<nz; j++) {
1318           col        = bjtmp[j];
1319           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1320         }
1321 
1322         /* U part */
1323         nz    = bdiag[i]-bdiag[i+1];
1324         bjtmp = bj + bdiag[i+1]+1;
1325         for  (j=0; j<nz; j++) {
1326           col        = bjtmp[j];
1327           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1328         }
1329 
1330         /* load in initial (unfactored row) */
1331         nz    = ai[r[i]+1] - ai[r[i]];
1332         ajtmp = aj + ai[r[i]];
1333         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1334         for (j=0; j<nz; j++) {
1335           col        = ics[ajtmp[j]];
1336           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1337         }
1338         /* ZeropivotApply(): shift the diagonal of the matrix  */
1339         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1340 
1341         /* elimination */
1342         bjtmp = bj + bi[i];
1343         row   = *bjtmp++; /* pivot row */
1344         nzL   = bi[i+1] - bi[i];
1345         for (k=0; k < nzL; k++) {
1346           pc1 = rtmp1 + row;
1347           pc2 = rtmp2 + row;
1348           if (*pc1 != 0.0 || *pc2 != 0.0) {
1349             pv   = b->a + bdiag[row];
1350             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1351             *pc1 = mul1;       *pc2 = mul2;
1352 
1353             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1354             pv = b->a + bdiag[row+1]+1;
1355             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1356             for (j=0; j<nz; j++) {
1357               col         = pj[j];
1358               rtmp1[col] -= mul1 * pv[j];
1359               rtmp2[col] -= mul2 * pv[j];
1360             }
1361             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1362           }
1363           row = *bjtmp++;
1364         }
1365 
1366         /* finished row i; check zero pivot, then stick row i into b->a */
1367         rs = 0.0;
1368         /* L part */
1369         pc1 = b->a + bi[i];
1370         pj  = b->j + bi[i];
1371         nz  = bi[i+1] - bi[i];
1372         for (j=0; j<nz; j++) {
1373           col    = pj[j];
1374           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1375         }
1376         /* U part */
1377         pc1 = b->a + bdiag[i+1]+1;
1378         pj  = b->j + bdiag[i+1]+1;
1379         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1380         for (j=0; j<nz; j++) {
1381           col    = pj[j];
1382           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1383         }
1384 
1385         sctx.rs = rs;
1386         sctx.pv = rtmp1[i];
1387         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1388         if (sctx.newshift) break;
1389         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1390         *pc1 = 1.0/sctx.pv;
1391 
1392         /* Now take care of diagonal 2x2 block. */
1393         pc2 = rtmp2 + i;
1394         if (*pc2 != 0.0) {
1395           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1396           *pc2 = mul1;          /* insert L entry */
1397           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1398           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1399           for (j=0; j<nz; j++) {
1400             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1401           }
1402           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1403         }
1404 
1405         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1406         rs = 0.0;
1407         /* L part */
1408         pc2 = b->a + bi[i+1];
1409         pj  = b->j + bi[i+1];
1410         nz  = bi[i+2] - bi[i+1];
1411         for (j=0; j<nz; j++) {
1412           col    = pj[j];
1413           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1414         }
1415         /* U part */
1416         pc2 = b->a + bdiag[i+2]+1;
1417         pj  = b->j + bdiag[i+2]+1;
1418         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1419         for (j=0; j<nz; j++) {
1420           col    = pj[j];
1421           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1422         }
1423 
1424         sctx.rs = rs;
1425         sctx.pv = rtmp2[i+1];
1426         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1427         if (sctx.newshift) break;
1428         pc2  = b->a + bdiag[i+1];
1429         *pc2 = 1.0/sctx.pv;
1430         break;
1431 
1432       case 3:
1433         /*----------*/
1434         /* zero rtmp */
1435         /* L part */
1436         nz    = bi[i+1] - bi[i];
1437         bjtmp = bj + bi[i];
1438         for  (j=0; j<nz; j++) {
1439           col        = bjtmp[j];
1440           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1441         }
1442 
1443         /* U part */
1444         nz    = bdiag[i]-bdiag[i+1];
1445         bjtmp = bj + bdiag[i+1]+1;
1446         for  (j=0; j<nz; j++) {
1447           col        = bjtmp[j];
1448           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1449         }
1450 
1451         /* load in initial (unfactored row) */
1452         nz    = ai[r[i]+1] - ai[r[i]];
1453         ajtmp = aj + ai[r[i]];
1454         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1455         for (j=0; j<nz; j++) {
1456           col        = ics[ajtmp[j]];
1457           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1458         }
1459         /* ZeropivotApply(): shift the diagonal of the matrix  */
1460         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1461 
1462         /* elimination */
1463         bjtmp = bj + bi[i];
1464         row   = *bjtmp++; /* pivot row */
1465         nzL   = bi[i+1] - bi[i];
1466         for (k=0; k < nzL; k++) {
1467           pc1 = rtmp1 + row;
1468           pc2 = rtmp2 + row;
1469           pc3 = rtmp3 + row;
1470           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1471             pv   = b->a + bdiag[row];
1472             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1473             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1474 
1475             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1476             pv = b->a + bdiag[row+1]+1;
1477             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1478             for (j=0; j<nz; j++) {
1479               col         = pj[j];
1480               rtmp1[col] -= mul1 * pv[j];
1481               rtmp2[col] -= mul2 * pv[j];
1482               rtmp3[col] -= mul3 * pv[j];
1483             }
1484             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1485           }
1486           row = *bjtmp++;
1487         }
1488 
1489         /* finished row i; check zero pivot, then stick row i into b->a */
1490         rs = 0.0;
1491         /* L part */
1492         pc1 = b->a + bi[i];
1493         pj  = b->j + bi[i];
1494         nz  = bi[i+1] - bi[i];
1495         for (j=0; j<nz; j++) {
1496           col    = pj[j];
1497           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1498         }
1499         /* U part */
1500         pc1 = b->a + bdiag[i+1]+1;
1501         pj  = b->j + bdiag[i+1]+1;
1502         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1503         for (j=0; j<nz; j++) {
1504           col    = pj[j];
1505           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1506         }
1507 
1508         sctx.rs = rs;
1509         sctx.pv = rtmp1[i];
1510         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1511         if (sctx.newshift) break;
1512         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1513         *pc1 = 1.0/sctx.pv;
1514 
1515         /* Now take care of 1st column of diagonal 3x3 block. */
1516         pc2 = rtmp2 + i;
1517         pc3 = rtmp3 + i;
1518         if (*pc2 != 0.0 || *pc3 != 0.0) {
1519           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1520           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1521           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1522           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1523           for (j=0; j<nz; j++) {
1524             col         = pj[j];
1525             rtmp2[col] -= mul2 * rtmp1[col];
1526             rtmp3[col] -= mul3 * rtmp1[col];
1527           }
1528           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1529         }
1530 
1531         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1532         rs = 0.0;
1533         /* L part */
1534         pc2 = b->a + bi[i+1];
1535         pj  = b->j + bi[i+1];
1536         nz  = bi[i+2] - bi[i+1];
1537         for (j=0; j<nz; j++) {
1538           col    = pj[j];
1539           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1540         }
1541         /* U part */
1542         pc2 = b->a + bdiag[i+2]+1;
1543         pj  = b->j + bdiag[i+2]+1;
1544         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1545         for (j=0; j<nz; j++) {
1546           col    = pj[j];
1547           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1548         }
1549 
1550         sctx.rs = rs;
1551         sctx.pv = rtmp2[i+1];
1552         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1553         if (sctx.newshift) break;
1554         pc2  = b->a + bdiag[i+1];
1555         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1556 
1557         /* Now take care of 2nd column of diagonal 3x3 block. */
1558         pc3 = rtmp3 + i+1;
1559         if (*pc3 != 0.0) {
1560           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1561           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1562           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1563           for (j=0; j<nz; j++) {
1564             col         = pj[j];
1565             rtmp3[col] -= mul3 * rtmp2[col];
1566           }
1567           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1568         }
1569 
1570         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1571         rs = 0.0;
1572         /* L part */
1573         pc3 = b->a + bi[i+2];
1574         pj  = b->j + bi[i+2];
1575         nz  = bi[i+3] - bi[i+2];
1576         for (j=0; j<nz; j++) {
1577           col    = pj[j];
1578           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1579         }
1580         /* U part */
1581         pc3 = b->a + bdiag[i+3]+1;
1582         pj  = b->j + bdiag[i+3]+1;
1583         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1584         for (j=0; j<nz; j++) {
1585           col    = pj[j];
1586           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1587         }
1588 
1589         sctx.rs = rs;
1590         sctx.pv = rtmp3[i+2];
1591         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1592         if (sctx.newshift) break;
1593         pc3  = b->a + bdiag[i+2];
1594         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1595         break;
1596       case 4:
1597         /*----------*/
1598         /* zero rtmp */
1599         /* L part */
1600         nz    = bi[i+1] - bi[i];
1601         bjtmp = bj + bi[i];
1602         for  (j=0; j<nz; j++) {
1603           col        = bjtmp[j];
1604           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1605         }
1606 
1607         /* U part */
1608         nz    = bdiag[i]-bdiag[i+1];
1609         bjtmp = bj + bdiag[i+1]+1;
1610         for  (j=0; j<nz; j++) {
1611           col        = bjtmp[j];
1612           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1613         }
1614 
1615         /* load in initial (unfactored row) */
1616         nz    = ai[r[i]+1] - ai[r[i]];
1617         ajtmp = aj + ai[r[i]];
1618         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1619         for (j=0; j<nz; j++) {
1620           col        = ics[ajtmp[j]];
1621           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1622         }
1623         /* ZeropivotApply(): shift the diagonal of the matrix  */
1624         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1625 
1626         /* elimination */
1627         bjtmp = bj + bi[i];
1628         row   = *bjtmp++; /* pivot row */
1629         nzL   = bi[i+1] - bi[i];
1630         for (k=0; k < nzL; k++) {
1631           pc1 = rtmp1 + row;
1632           pc2 = rtmp2 + row;
1633           pc3 = rtmp3 + row;
1634           pc4 = rtmp4 + row;
1635           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1636             pv   = b->a + bdiag[row];
1637             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1638             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1639 
1640             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1641             pv = b->a + bdiag[row+1]+1;
1642             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1643             for (j=0; j<nz; j++) {
1644               col         = pj[j];
1645               rtmp1[col] -= mul1 * pv[j];
1646               rtmp2[col] -= mul2 * pv[j];
1647               rtmp3[col] -= mul3 * pv[j];
1648               rtmp4[col] -= mul4 * pv[j];
1649             }
1650             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1651           }
1652           row = *bjtmp++;
1653         }
1654 
1655         /* finished row i; check zero pivot, then stick row i into b->a */
1656         rs = 0.0;
1657         /* L part */
1658         pc1 = b->a + bi[i];
1659         pj  = b->j + bi[i];
1660         nz  = bi[i+1] - bi[i];
1661         for (j=0; j<nz; j++) {
1662           col    = pj[j];
1663           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1664         }
1665         /* U part */
1666         pc1 = b->a + bdiag[i+1]+1;
1667         pj  = b->j + bdiag[i+1]+1;
1668         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1669         for (j=0; j<nz; j++) {
1670           col    = pj[j];
1671           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1672         }
1673 
1674         sctx.rs = rs;
1675         sctx.pv = rtmp1[i];
1676         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1677         if (sctx.newshift) break;
1678         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1679         *pc1 = 1.0/sctx.pv;
1680 
1681         /* Now take care of 1st column of diagonal 4x4 block. */
1682         pc2 = rtmp2 + i;
1683         pc3 = rtmp3 + i;
1684         pc4 = rtmp4 + i;
1685         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1686           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1687           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1688           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1689           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1690           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1691           for (j=0; j<nz; j++) {
1692             col         = pj[j];
1693             rtmp2[col] -= mul2 * rtmp1[col];
1694             rtmp3[col] -= mul3 * rtmp1[col];
1695             rtmp4[col] -= mul4 * rtmp1[col];
1696           }
1697           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1698         }
1699 
1700         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1701         rs = 0.0;
1702         /* L part */
1703         pc2 = b->a + bi[i+1];
1704         pj  = b->j + bi[i+1];
1705         nz  = bi[i+2] - bi[i+1];
1706         for (j=0; j<nz; j++) {
1707           col    = pj[j];
1708           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1709         }
1710         /* U part */
1711         pc2 = b->a + bdiag[i+2]+1;
1712         pj  = b->j + bdiag[i+2]+1;
1713         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1714         for (j=0; j<nz; j++) {
1715           col    = pj[j];
1716           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1717         }
1718 
1719         sctx.rs = rs;
1720         sctx.pv = rtmp2[i+1];
1721         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1722         if (sctx.newshift) break;
1723         pc2  = b->a + bdiag[i+1];
1724         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1725 
1726         /* Now take care of 2nd column of diagonal 4x4 block. */
1727         pc3 = rtmp3 + i+1;
1728         pc4 = rtmp4 + i+1;
1729         if (*pc3 != 0.0 || *pc4 != 0.0) {
1730           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1731           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1732           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1733           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1734           for (j=0; j<nz; j++) {
1735             col         = pj[j];
1736             rtmp3[col] -= mul3 * rtmp2[col];
1737             rtmp4[col] -= mul4 * rtmp2[col];
1738           }
1739           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1740         }
1741 
1742         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1743         rs = 0.0;
1744         /* L part */
1745         pc3 = b->a + bi[i+2];
1746         pj  = b->j + bi[i+2];
1747         nz  = bi[i+3] - bi[i+2];
1748         for (j=0; j<nz; j++) {
1749           col    = pj[j];
1750           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1751         }
1752         /* U part */
1753         pc3 = b->a + bdiag[i+3]+1;
1754         pj  = b->j + bdiag[i+3]+1;
1755         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1756         for (j=0; j<nz; j++) {
1757           col    = pj[j];
1758           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1759         }
1760 
1761         sctx.rs = rs;
1762         sctx.pv = rtmp3[i+2];
1763         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1764         if (sctx.newshift) break;
1765         pc3  = b->a + bdiag[i+2];
1766         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1767 
1768         /* Now take care of 3rd column of diagonal 4x4 block. */
1769         pc4 = rtmp4 + i+2;
1770         if (*pc4 != 0.0) {
1771           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1772           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1773           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1774           for (j=0; j<nz; j++) {
1775             col         = pj[j];
1776             rtmp4[col] -= mul4 * rtmp3[col];
1777           }
1778           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1779         }
1780 
1781         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1782         rs = 0.0;
1783         /* L part */
1784         pc4 = b->a + bi[i+3];
1785         pj  = b->j + bi[i+3];
1786         nz  = bi[i+4] - bi[i+3];
1787         for (j=0; j<nz; j++) {
1788           col    = pj[j];
1789           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1790         }
1791         /* U part */
1792         pc4 = b->a + bdiag[i+4]+1;
1793         pj  = b->j + bdiag[i+4]+1;
1794         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1795         for (j=0; j<nz; j++) {
1796           col    = pj[j];
1797           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1798         }
1799 
1800         sctx.rs = rs;
1801         sctx.pv = rtmp4[i+3];
1802         ierr    = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1803         if (sctx.newshift) break;
1804         pc4  = b->a + bdiag[i+3];
1805         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1806         break;
1807 
1808       default:
1809         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1810       }
1811       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1812       i += nodesz;                 /* Update the row */
1813     }
1814 
1815     /* MatPivotRefine() */
1816     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1817       /*
1818        * if no shift in this attempt & shifting & started shifting & can refine,
1819        * then try lower shift
1820        */
1821       sctx.shift_hi       = sctx.shift_fraction;
1822       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1823       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1824       sctx.newshift       = PETSC_TRUE;
1825       sctx.nshift++;
1826     }
1827   } while (sctx.newshift);
1828 
1829   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1830   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1831   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1832   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1833 
1834   if (b->inode.size) {
1835     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1836   } else {
1837     C->ops->solve           = MatSolve_SeqAIJ;
1838   }
1839   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1840   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1841   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1842   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1843   C->assembled              = PETSC_TRUE;
1844   C->preallocated           = PETSC_TRUE;
1845 
1846   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1847 
1848   /* MatShiftView(A,info,&sctx) */
1849   if (sctx.nshift) {
1850     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1851       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1852     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1853       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1854     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1855       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1856     }
1857   }
1858   PetscFunctionReturn(0);
1859 }
1860 
1861 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1862 {
1863   Mat             C     = B;
1864   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1865   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1866   PetscErrorCode  ierr;
1867   const PetscInt  *r,*ic,*c,*ics;
1868   PetscInt        n   = A->rmap->n,*bi = b->i;
1869   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1870   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1871   PetscInt        *ai = a->i,*aj = a->j;
1872   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1873   PetscScalar     mul1,mul2,mul3,tmp;
1874   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1875   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1876   PetscReal       rs=0.0;
1877   FactorShiftCtx  sctx;
1878 
1879   PetscFunctionBegin;
1880   sctx.shift_top      = 0;
1881   sctx.nshift_max     = 0;
1882   sctx.shift_lo       = 0;
1883   sctx.shift_hi       = 0;
1884   sctx.shift_fraction = 0;
1885 
1886   /* if both shift schemes are chosen by user, only use info->shiftpd */
1887   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1888     sctx.shift_top = 0;
1889     for (i=0; i<n; i++) {
1890       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1891       rs    = 0.0;
1892       ajtmp = aj + ai[i];
1893       rtmp1 = aa + ai[i];
1894       nz    = ai[i+1] - ai[i];
1895       for (j=0; j<nz; j++) {
1896         if (*ajtmp != i) {
1897           rs += PetscAbsScalar(*rtmp1++);
1898         } else {
1899           rs -= PetscRealPart(*rtmp1++);
1900         }
1901         ajtmp++;
1902       }
1903       if (rs>sctx.shift_top) sctx.shift_top = rs;
1904     }
1905     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1906     sctx.shift_top *= 1.1;
1907     sctx.nshift_max = 5;
1908     sctx.shift_lo   = 0.;
1909     sctx.shift_hi   = 1.;
1910   }
1911   sctx.shift_amount = 0;
1912   sctx.nshift       = 0;
1913 
1914   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1915   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1916   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1917   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1918   ics    = ic;
1919 
1920   node_max = a->inode.node_count;
1921   ns       = a->inode.size;
1922   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1923 
1924   /* If max inode size > 3, split it into two inodes.*/
1925   /* also map the inode sizes according to the ordering */
1926   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1927   for (i=0,j=0; i<node_max; ++i,++j) {
1928     if (ns[i]>3) {
1929       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1930       ++j;
1931       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1932     } else {
1933       tmp_vec1[j] = ns[i];
1934     }
1935   }
1936   /* Use the correct node_max */
1937   node_max = j;
1938 
1939   /* Now reorder the inode info based on mat re-ordering info */
1940   /* First create a row -> inode_size_array_index map */
1941   ierr = PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);CHKERRQ(ierr);
1942   for (i=0,row=0; i<node_max; i++) {
1943     nodesz = tmp_vec1[i];
1944     for (j=0; j<nodesz; j++,row++) {
1945       nsmap[row] = i;
1946     }
1947   }
1948   /* Using nsmap, create a reordered ns structure */
1949   for (i=0,j=0; i< node_max; i++) {
1950     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1951     tmp_vec2[i] = nodesz;
1952     j          += nodesz;
1953   }
1954   ierr = PetscFree2(nsmap,tmp_vec1);CHKERRQ(ierr);
1955   /* Now use the correct ns */
1956   ns = tmp_vec2;
1957 
1958   do {
1959     sctx.newshift = PETSC_FALSE;
1960     /* Now loop over each block-row, and do the factorization */
1961     for (i=0,row=0; i<node_max; i++) {
1962       nodesz = ns[i];
1963       nz     = bi[row+1] - bi[row];
1964       bjtmp  = bj + bi[row];
1965 
1966       switch (nodesz) {
1967       case 1:
1968         for  (j=0; j<nz; j++) {
1969           idx         = bjtmp[j];
1970           rtmp11[idx] = 0.0;
1971         }
1972 
1973         /* load in initial (unfactored row) */
1974         idx    = r[row];
1975         nz_tmp = ai[idx+1] - ai[idx];
1976         ajtmp  = aj + ai[idx];
1977         v1     = aa + ai[idx];
1978 
1979         for (j=0; j<nz_tmp; j++) {
1980           idx         = ics[ajtmp[j]];
1981           rtmp11[idx] = v1[j];
1982         }
1983         rtmp11[ics[r[row]]] += sctx.shift_amount;
1984 
1985         prow = *bjtmp++;
1986         while (prow < row) {
1987           pc1 = rtmp11 + prow;
1988           if (*pc1 != 0.0) {
1989             pv     = ba + bd[prow];
1990             pj     = nbj + bd[prow];
1991             mul1   = *pc1 * *pv++;
1992             *pc1   = mul1;
1993             nz_tmp = bi[prow+1] - bd[prow] - 1;
1994             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
1995             for (j=0; j<nz_tmp; j++) {
1996               tmp          = pv[j];
1997               idx          = pj[j];
1998               rtmp11[idx] -= mul1 * tmp;
1999             }
2000           }
2001           prow = *bjtmp++;
2002         }
2003         pj  = bj + bi[row];
2004         pc1 = ba + bi[row];
2005 
2006         sctx.pv     = rtmp11[row];
2007         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2008         rs          = 0.0;
2009         for (j=0; j<nz; j++) {
2010           idx    = pj[j];
2011           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2012           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2013         }
2014         sctx.rs = rs;
2015         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2016         if (sctx.newshift) goto endofwhile;
2017         break;
2018 
2019       case 2:
2020         for (j=0; j<nz; j++) {
2021           idx         = bjtmp[j];
2022           rtmp11[idx] = 0.0;
2023           rtmp22[idx] = 0.0;
2024         }
2025 
2026         /* load in initial (unfactored row) */
2027         idx    = r[row];
2028         nz_tmp = ai[idx+1] - ai[idx];
2029         ajtmp  = aj + ai[idx];
2030         v1     = aa + ai[idx];
2031         v2     = aa + ai[idx+1];
2032         for (j=0; j<nz_tmp; j++) {
2033           idx         = ics[ajtmp[j]];
2034           rtmp11[idx] = v1[j];
2035           rtmp22[idx] = v2[j];
2036         }
2037         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2038         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2039 
2040         prow = *bjtmp++;
2041         while (prow < row) {
2042           pc1 = rtmp11 + prow;
2043           pc2 = rtmp22 + prow;
2044           if (*pc1 != 0.0 || *pc2 != 0.0) {
2045             pv   = ba + bd[prow];
2046             pj   = nbj + bd[prow];
2047             mul1 = *pc1 * *pv;
2048             mul2 = *pc2 * *pv;
2049             ++pv;
2050             *pc1 = mul1;
2051             *pc2 = mul2;
2052 
2053             nz_tmp = bi[prow+1] - bd[prow] - 1;
2054             for (j=0; j<nz_tmp; j++) {
2055               tmp          = pv[j];
2056               idx          = pj[j];
2057               rtmp11[idx] -= mul1 * tmp;
2058               rtmp22[idx] -= mul2 * tmp;
2059             }
2060             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2061           }
2062           prow = *bjtmp++;
2063         }
2064 
2065         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2066         pc1 = rtmp11 + prow;
2067         pc2 = rtmp22 + prow;
2068 
2069         sctx.pv = *pc1;
2070         pj      = bj + bi[prow];
2071         rs      = 0.0;
2072         for (j=0; j<nz; j++) {
2073           idx = pj[j];
2074           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2075         }
2076         sctx.rs = rs;
2077         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2078         if (sctx.newshift) goto endofwhile;
2079 
2080         if (*pc2 != 0.0) {
2081           pj     = nbj + bd[prow];
2082           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2083           *pc2   = mul2;
2084           nz_tmp = bi[prow+1] - bd[prow] - 1;
2085           for (j=0; j<nz_tmp; j++) {
2086             idx          = pj[j];
2087             tmp          = rtmp11[idx];
2088             rtmp22[idx] -= mul2 * tmp;
2089           }
2090           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2091         }
2092 
2093         pj  = bj + bi[row];
2094         pc1 = ba + bi[row];
2095         pc2 = ba + bi[row+1];
2096 
2097         sctx.pv       = rtmp22[row+1];
2098         rs            = 0.0;
2099         rtmp11[row]   = 1.0/rtmp11[row];
2100         rtmp22[row+1] = 1.0/rtmp22[row+1];
2101         /* copy row entries from dense representation to sparse */
2102         for (j=0; j<nz; j++) {
2103           idx    = pj[j];
2104           pc1[j] = rtmp11[idx];
2105           pc2[j] = rtmp22[idx];
2106           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2107         }
2108         sctx.rs = rs;
2109         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2110         if (sctx.newshift) goto endofwhile;
2111         break;
2112 
2113       case 3:
2114         for  (j=0; j<nz; j++) {
2115           idx         = bjtmp[j];
2116           rtmp11[idx] = 0.0;
2117           rtmp22[idx] = 0.0;
2118           rtmp33[idx] = 0.0;
2119         }
2120         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2121         idx    = r[row];
2122         nz_tmp = ai[idx+1] - ai[idx];
2123         ajtmp  = aj + ai[idx];
2124         v1     = aa + ai[idx];
2125         v2     = aa + ai[idx+1];
2126         v3     = aa + ai[idx+2];
2127         for (j=0; j<nz_tmp; j++) {
2128           idx         = ics[ajtmp[j]];
2129           rtmp11[idx] = v1[j];
2130           rtmp22[idx] = v2[j];
2131           rtmp33[idx] = v3[j];
2132         }
2133         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2134         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2135         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2136 
2137         /* loop over all pivot row blocks above this row block */
2138         prow = *bjtmp++;
2139         while (prow < row) {
2140           pc1 = rtmp11 + prow;
2141           pc2 = rtmp22 + prow;
2142           pc3 = rtmp33 + prow;
2143           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2144             pv   = ba  + bd[prow];
2145             pj   = nbj + bd[prow];
2146             mul1 = *pc1 * *pv;
2147             mul2 = *pc2 * *pv;
2148             mul3 = *pc3 * *pv;
2149             ++pv;
2150             *pc1 = mul1;
2151             *pc2 = mul2;
2152             *pc3 = mul3;
2153 
2154             nz_tmp = bi[prow+1] - bd[prow] - 1;
2155             /* update this row based on pivot row */
2156             for (j=0; j<nz_tmp; j++) {
2157               tmp          = pv[j];
2158               idx          = pj[j];
2159               rtmp11[idx] -= mul1 * tmp;
2160               rtmp22[idx] -= mul2 * tmp;
2161               rtmp33[idx] -= mul3 * tmp;
2162             }
2163             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2164           }
2165           prow = *bjtmp++;
2166         }
2167 
2168         /* Now take care of diagonal 3x3 block in this set of rows */
2169         /* note: prow = row here */
2170         pc1 = rtmp11 + prow;
2171         pc2 = rtmp22 + prow;
2172         pc3 = rtmp33 + prow;
2173 
2174         sctx.pv = *pc1;
2175         pj      = bj + bi[prow];
2176         rs      = 0.0;
2177         for (j=0; j<nz; j++) {
2178           idx = pj[j];
2179           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2180         }
2181         sctx.rs = rs;
2182         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2183         if (sctx.newshift) goto endofwhile;
2184 
2185         if (*pc2 != 0.0 || *pc3 != 0.0) {
2186           mul2   = (*pc2)/(*pc1);
2187           mul3   = (*pc3)/(*pc1);
2188           *pc2   = mul2;
2189           *pc3   = mul3;
2190           nz_tmp = bi[prow+1] - bd[prow] - 1;
2191           pj     = nbj + bd[prow];
2192           for (j=0; j<nz_tmp; j++) {
2193             idx          = pj[j];
2194             tmp          = rtmp11[idx];
2195             rtmp22[idx] -= mul2 * tmp;
2196             rtmp33[idx] -= mul3 * tmp;
2197           }
2198           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2199         }
2200         ++prow;
2201 
2202         pc2     = rtmp22 + prow;
2203         pc3     = rtmp33 + prow;
2204         sctx.pv = *pc2;
2205         pj      = bj + bi[prow];
2206         rs      = 0.0;
2207         for (j=0; j<nz; j++) {
2208           idx = pj[j];
2209           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2210         }
2211         sctx.rs = rs;
2212         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2213         if (sctx.newshift) goto endofwhile;
2214 
2215         if (*pc3 != 0.0) {
2216           mul3   = (*pc3)/(*pc2);
2217           *pc3   = mul3;
2218           pj     = nbj + bd[prow];
2219           nz_tmp = bi[prow+1] - bd[prow] - 1;
2220           for (j=0; j<nz_tmp; j++) {
2221             idx          = pj[j];
2222             tmp          = rtmp22[idx];
2223             rtmp33[idx] -= mul3 * tmp;
2224           }
2225           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2226         }
2227 
2228         pj  = bj + bi[row];
2229         pc1 = ba + bi[row];
2230         pc2 = ba + bi[row+1];
2231         pc3 = ba + bi[row+2];
2232 
2233         sctx.pv       = rtmp33[row+2];
2234         rs            = 0.0;
2235         rtmp11[row]   = 1.0/rtmp11[row];
2236         rtmp22[row+1] = 1.0/rtmp22[row+1];
2237         rtmp33[row+2] = 1.0/rtmp33[row+2];
2238         /* copy row entries from dense representation to sparse */
2239         for (j=0; j<nz; j++) {
2240           idx    = pj[j];
2241           pc1[j] = rtmp11[idx];
2242           pc2[j] = rtmp22[idx];
2243           pc3[j] = rtmp33[idx];
2244           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2245         }
2246 
2247         sctx.rs = rs;
2248         ierr    = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2249         if (sctx.newshift) goto endofwhile;
2250         break;
2251 
2252       default:
2253         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2254       }
2255       row += nodesz;                 /* Update the row */
2256     }
2257 endofwhile:;
2258   } while (sctx.newshift);
2259   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2260   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2261   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2262   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2263   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2264 
2265   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2266   /* do not set solve add, since MatSolve_Inode + Add is faster */
2267   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2268   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2269   C->assembled              = PETSC_TRUE;
2270   C->preallocated           = PETSC_TRUE;
2271   if (sctx.nshift) {
2272     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2273       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2274     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2275       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2276     }
2277   }
2278   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2279   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2280   PetscFunctionReturn(0);
2281 }
2282 
2283 
2284 /* ----------------------------------------------------------- */
2285 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2286 {
2287   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2288   IS                iscol = a->col,isrow = a->row;
2289   PetscErrorCode    ierr;
2290   const PetscInt    *r,*c,*rout,*cout;
2291   PetscInt          i,j,n = A->rmap->n;
2292   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2293   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2294   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2295   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2296   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2297   const PetscScalar *b;
2298 
2299   PetscFunctionBegin;
2300   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2301   node_max = a->inode.node_count;
2302   ns       = a->inode.size;     /* Node Size array */
2303 
2304   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2305   ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr);
2306   tmp  = a->solve_work;
2307 
2308   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2309   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2310 
2311   /* forward solve the lower triangular */
2312   tmps = tmp;
2313   aa   = a_a;
2314   aj   = a_j;
2315   ad   = a->diag;
2316 
2317   for (i = 0,row = 0; i< node_max; ++i) {
2318     nsz = ns[i];
2319     aii = ai[row];
2320     v1  = aa + aii;
2321     vi  = aj + aii;
2322     nz  = ai[row+1]- ai[row];
2323 
2324     if (i < node_max-1) {
2325       /* Prefetch the indices for the next block */
2326       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2327       /* Prefetch the data for the next block */
2328       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2329     }
2330 
2331     switch (nsz) {               /* Each loop in 'case' is unrolled */
2332     case 1:
2333       sum1 = b[r[row]];
2334       for (j=0; j<nz-1; j+=2) {
2335         i0    = vi[j];
2336         i1    = vi[j+1];
2337         tmp0  = tmps[i0];
2338         tmp1  = tmps[i1];
2339         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2340       }
2341       if (j == nz-1) {
2342         tmp0  = tmps[vi[j]];
2343         sum1 -= v1[j]*tmp0;
2344       }
2345       tmp[row++]=sum1;
2346       break;
2347     case 2:
2348       sum1 = b[r[row]];
2349       sum2 = b[r[row+1]];
2350       v2   = aa + ai[row+1];
2351 
2352       for (j=0; j<nz-1; j+=2) {
2353         i0    = vi[j];
2354         i1    = vi[j+1];
2355         tmp0  = tmps[i0];
2356         tmp1  = tmps[i1];
2357         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2358         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2359       }
2360       if (j == nz-1) {
2361         tmp0  = tmps[vi[j]];
2362         sum1 -= v1[j] *tmp0;
2363         sum2 -= v2[j] *tmp0;
2364       }
2365       sum2     -= v2[nz] * sum1;
2366       tmp[row++]=sum1;
2367       tmp[row++]=sum2;
2368       break;
2369     case 3:
2370       sum1 = b[r[row]];
2371       sum2 = b[r[row+1]];
2372       sum3 = b[r[row+2]];
2373       v2   = aa + ai[row+1];
2374       v3   = aa + ai[row+2];
2375 
2376       for (j=0; j<nz-1; j+=2) {
2377         i0    = vi[j];
2378         i1    = vi[j+1];
2379         tmp0  = tmps[i0];
2380         tmp1  = tmps[i1];
2381         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2382         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2383         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2384       }
2385       if (j == nz-1) {
2386         tmp0  = tmps[vi[j]];
2387         sum1 -= v1[j] *tmp0;
2388         sum2 -= v2[j] *tmp0;
2389         sum3 -= v3[j] *tmp0;
2390       }
2391       sum2     -= v2[nz] * sum1;
2392       sum3     -= v3[nz] * sum1;
2393       sum3     -= v3[nz+1] * sum2;
2394       tmp[row++]=sum1;
2395       tmp[row++]=sum2;
2396       tmp[row++]=sum3;
2397       break;
2398 
2399     case 4:
2400       sum1 = b[r[row]];
2401       sum2 = b[r[row+1]];
2402       sum3 = b[r[row+2]];
2403       sum4 = b[r[row+3]];
2404       v2   = aa + ai[row+1];
2405       v3   = aa + ai[row+2];
2406       v4   = aa + ai[row+3];
2407 
2408       for (j=0; j<nz-1; j+=2) {
2409         i0    = vi[j];
2410         i1    = vi[j+1];
2411         tmp0  = tmps[i0];
2412         tmp1  = tmps[i1];
2413         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2414         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2415         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2416         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2417       }
2418       if (j == nz-1) {
2419         tmp0  = tmps[vi[j]];
2420         sum1 -= v1[j] *tmp0;
2421         sum2 -= v2[j] *tmp0;
2422         sum3 -= v3[j] *tmp0;
2423         sum4 -= v4[j] *tmp0;
2424       }
2425       sum2 -= v2[nz] * sum1;
2426       sum3 -= v3[nz] * sum1;
2427       sum4 -= v4[nz] * sum1;
2428       sum3 -= v3[nz+1] * sum2;
2429       sum4 -= v4[nz+1] * sum2;
2430       sum4 -= v4[nz+2] * sum3;
2431 
2432       tmp[row++]=sum1;
2433       tmp[row++]=sum2;
2434       tmp[row++]=sum3;
2435       tmp[row++]=sum4;
2436       break;
2437     case 5:
2438       sum1 = b[r[row]];
2439       sum2 = b[r[row+1]];
2440       sum3 = b[r[row+2]];
2441       sum4 = b[r[row+3]];
2442       sum5 = b[r[row+4]];
2443       v2   = aa + ai[row+1];
2444       v3   = aa + ai[row+2];
2445       v4   = aa + ai[row+3];
2446       v5   = aa + ai[row+4];
2447 
2448       for (j=0; j<nz-1; j+=2) {
2449         i0    = vi[j];
2450         i1    = vi[j+1];
2451         tmp0  = tmps[i0];
2452         tmp1  = tmps[i1];
2453         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2454         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2455         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2456         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2457         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2458       }
2459       if (j == nz-1) {
2460         tmp0  = tmps[vi[j]];
2461         sum1 -= v1[j] *tmp0;
2462         sum2 -= v2[j] *tmp0;
2463         sum3 -= v3[j] *tmp0;
2464         sum4 -= v4[j] *tmp0;
2465         sum5 -= v5[j] *tmp0;
2466       }
2467 
2468       sum2 -= v2[nz] * sum1;
2469       sum3 -= v3[nz] * sum1;
2470       sum4 -= v4[nz] * sum1;
2471       sum5 -= v5[nz] * sum1;
2472       sum3 -= v3[nz+1] * sum2;
2473       sum4 -= v4[nz+1] * sum2;
2474       sum5 -= v5[nz+1] * sum2;
2475       sum4 -= v4[nz+2] * sum3;
2476       sum5 -= v5[nz+2] * sum3;
2477       sum5 -= v5[nz+3] * sum4;
2478 
2479       tmp[row++]=sum1;
2480       tmp[row++]=sum2;
2481       tmp[row++]=sum3;
2482       tmp[row++]=sum4;
2483       tmp[row++]=sum5;
2484       break;
2485     default:
2486       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2487     }
2488   }
2489   /* backward solve the upper triangular */
2490   for (i=node_max -1,row = n-1; i>=0; i--) {
2491     nsz = ns[i];
2492     aii = ad[row+1] + 1;
2493     v1  = aa + aii;
2494     vi  = aj + aii;
2495     nz  = ad[row]- ad[row+1] - 1;
2496 
2497     if (i > 0) {
2498       /* Prefetch the indices for the next block */
2499       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2500       /* Prefetch the data for the next block */
2501       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2502     }
2503 
2504     switch (nsz) {               /* Each loop in 'case' is unrolled */
2505     case 1:
2506       sum1 = tmp[row];
2507 
2508       for (j=0; j<nz-1; j+=2) {
2509         i0    = vi[j];
2510         i1    = vi[j+1];
2511         tmp0  = tmps[i0];
2512         tmp1  = tmps[i1];
2513         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2514       }
2515       if (j == nz-1) {
2516         tmp0  = tmps[vi[j]];
2517         sum1 -= v1[j]*tmp0;
2518       }
2519       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2520       break;
2521     case 2:
2522       sum1 = tmp[row];
2523       sum2 = tmp[row-1];
2524       v2   = aa + ad[row] + 1;
2525       for (j=0; j<nz-1; j+=2) {
2526         i0    = vi[j];
2527         i1    = vi[j+1];
2528         tmp0  = tmps[i0];
2529         tmp1  = tmps[i1];
2530         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2531         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2532       }
2533       if (j == nz-1) {
2534         tmp0  = tmps[vi[j]];
2535         sum1 -= v1[j]* tmp0;
2536         sum2 -= v2[j+1]* tmp0;
2537       }
2538 
2539       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2540       sum2     -= v2[0] * tmp0;
2541       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2542       break;
2543     case 3:
2544       sum1 = tmp[row];
2545       sum2 = tmp[row -1];
2546       sum3 = tmp[row -2];
2547       v2   = aa + ad[row] + 1;
2548       v3   = aa + ad[row -1] + 1;
2549       for (j=0; j<nz-1; j+=2) {
2550         i0    = vi[j];
2551         i1    = vi[j+1];
2552         tmp0  = tmps[i0];
2553         tmp1  = tmps[i1];
2554         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2555         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2556         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2557       }
2558       if (j== nz-1) {
2559         tmp0  = tmps[vi[j]];
2560         sum1 -= v1[j] * tmp0;
2561         sum2 -= v2[j+1] * tmp0;
2562         sum3 -= v3[j+2] * tmp0;
2563       }
2564       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2565       sum2     -= v2[0]* tmp0;
2566       sum3     -= v3[1] * tmp0;
2567       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2568       sum3     -= v3[0]* tmp0;
2569       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2570 
2571       break;
2572     case 4:
2573       sum1 = tmp[row];
2574       sum2 = tmp[row -1];
2575       sum3 = tmp[row -2];
2576       sum4 = tmp[row -3];
2577       v2   = aa + ad[row]+1;
2578       v3   = aa + ad[row -1]+1;
2579       v4   = aa + ad[row -2]+1;
2580 
2581       for (j=0; j<nz-1; j+=2) {
2582         i0    = vi[j];
2583         i1    = vi[j+1];
2584         tmp0  = tmps[i0];
2585         tmp1  = tmps[i1];
2586         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2587         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2588         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2589         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2590       }
2591       if (j== nz-1) {
2592         tmp0  = tmps[vi[j]];
2593         sum1 -= v1[j] * tmp0;
2594         sum2 -= v2[j+1] * tmp0;
2595         sum3 -= v3[j+2] * tmp0;
2596         sum4 -= v4[j+3] * tmp0;
2597       }
2598 
2599       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2600       sum2     -= v2[0] * tmp0;
2601       sum3     -= v3[1] * tmp0;
2602       sum4     -= v4[2] * tmp0;
2603       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2604       sum3     -= v3[0] * tmp0;
2605       sum4     -= v4[1] * tmp0;
2606       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2607       sum4     -= v4[0] * tmp0;
2608       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2609       break;
2610     case 5:
2611       sum1 = tmp[row];
2612       sum2 = tmp[row -1];
2613       sum3 = tmp[row -2];
2614       sum4 = tmp[row -3];
2615       sum5 = tmp[row -4];
2616       v2   = aa + ad[row]+1;
2617       v3   = aa + ad[row -1]+1;
2618       v4   = aa + ad[row -2]+1;
2619       v5   = aa + ad[row -3]+1;
2620       for (j=0; j<nz-1; j+=2) {
2621         i0    = vi[j];
2622         i1    = vi[j+1];
2623         tmp0  = tmps[i0];
2624         tmp1  = tmps[i1];
2625         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2626         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2630       }
2631       if (j==nz-1) {
2632         tmp0  = tmps[vi[j]];
2633         sum1 -= v1[j] * tmp0;
2634         sum2 -= v2[j+1] * tmp0;
2635         sum3 -= v3[j+2] * tmp0;
2636         sum4 -= v4[j+3] * tmp0;
2637         sum5 -= v5[j+4] * tmp0;
2638       }
2639 
2640       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2641       sum2     -= v2[0] * tmp0;
2642       sum3     -= v3[1] * tmp0;
2643       sum4     -= v4[2] * tmp0;
2644       sum5     -= v5[3] * tmp0;
2645       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2646       sum3     -= v3[0] * tmp0;
2647       sum4     -= v4[1] * tmp0;
2648       sum5     -= v5[2] * tmp0;
2649       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2650       sum4     -= v4[0] * tmp0;
2651       sum5     -= v5[1] * tmp0;
2652       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2653       sum5     -= v5[0] * tmp0;
2654       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2655       break;
2656     default:
2657       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2658     }
2659   }
2660   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2661   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2662   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2663   ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr);
2664   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2665   PetscFunctionReturn(0);
2666 }
2667 
2668 
2669 /*
2670      Makes a longer coloring[] array and calls the usual code with that
2671 */
2672 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2673 {
2674   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2675   PetscErrorCode  ierr;
2676   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2677   PetscInt        *colorused,i;
2678   ISColoringValue *newcolor;
2679 
2680   PetscFunctionBegin;
2681   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2682   /* loop over inodes, marking a color for each column*/
2683   row = 0;
2684   for (i=0; i<m; i++) {
2685     for (j=0; j<ns[i]; j++) {
2686       newcolor[row++] = coloring[i] + j*ncolors;
2687     }
2688   }
2689 
2690   /* eliminate unneeded colors */
2691   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2692   for (i=0; i<n; i++) {
2693     colorused[newcolor[i]] = 1;
2694   }
2695 
2696   for (i=1; i<5*ncolors; i++) {
2697     colorused[i] += colorused[i-1];
2698   }
2699   ncolors = colorused[5*ncolors-1];
2700   for (i=0; i<n; i++) {
2701     newcolor[i] = colorused[newcolor[i]]-1;
2702   }
2703   ierr = PetscFree(colorused);CHKERRQ(ierr);
2704   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2705   ierr = PetscFree(coloring);CHKERRQ(ierr);
2706   PetscFunctionReturn(0);
2707 }
2708 
2709 #include <petsc/private/kernels/blockinvert.h>
2710 
2711 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2712 {
2713   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2714   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2715   MatScalar         *ibdiag,*bdiag,work[25],*t;
2716   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2717   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2718   const PetscScalar *xb, *b;
2719   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2720   PetscErrorCode    ierr;
2721   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2722   PetscInt          sz,k,ipvt[5];
2723   PetscBool         allowzeropivot,zeropivotdetected;
2724   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2725 
2726   PetscFunctionBegin;
2727   allowzeropivot = PetscNot(A->erroriffailure);
2728   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2729   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2730 
2731   if (!a->inode.ibdiagvalid) {
2732     if (!a->inode.ibdiag) {
2733       /* calculate space needed for diagonal blocks */
2734       for (i=0; i<m; i++) {
2735         cnt += sizes[i]*sizes[i];
2736       }
2737       a->inode.bdiagsize = cnt;
2738 
2739       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2740     }
2741 
2742     /* copy over the diagonal blocks and invert them */
2743     ibdiag = a->inode.ibdiag;
2744     bdiag  = a->inode.bdiag;
2745     cnt    = 0;
2746     for (i=0, row = 0; i<m; i++) {
2747       for (j=0; j<sizes[i]; j++) {
2748         for (k=0; k<sizes[i]; k++) {
2749           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2750         }
2751       }
2752       ierr = PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);CHKERRQ(ierr);
2753 
2754       switch (sizes[i]) {
2755       case 1:
2756         /* Create matrix data structure */
2757         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2758           if (allowzeropivot) {
2759             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2760             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2761             A->factorerror_zeropivot_row   = row;
2762             ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr);
2763           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2764         }
2765         ibdiag[cnt] = 1.0/ibdiag[cnt];
2766         break;
2767       case 2:
2768         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2769         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2770         break;
2771       case 3:
2772         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2773         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2774         break;
2775       case 4:
2776         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2777         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2778         break;
2779       case 5:
2780         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2781         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2782         break;
2783       default:
2784         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2785       }
2786       cnt += sizes[i]*sizes[i];
2787       row += sizes[i];
2788     }
2789     a->inode.ibdiagvalid = PETSC_TRUE;
2790   }
2791   ibdiag = a->inode.ibdiag;
2792   bdiag  = a->inode.bdiag;
2793   t      = a->inode.ssor_work;
2794 
2795   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2796   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2797   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2798   if (flag & SOR_ZERO_INITIAL_GUESS) {
2799     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2800 
2801       for (i=0, row=0; i<m; i++) {
2802         sz  = diag[row] - ii[row];
2803         v1  = a->a + ii[row];
2804         idx = a->j + ii[row];
2805 
2806         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2807         switch (sizes[i]) {
2808         case 1:
2809 
2810           sum1 = b[row];
2811           for (n = 0; n<sz-1; n+=2) {
2812             i1    = idx[0];
2813             i2    = idx[1];
2814             idx  += 2;
2815             tmp0  = x[i1];
2816             tmp1  = x[i2];
2817             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2818           }
2819 
2820           if (n == sz-1) {
2821             tmp0  = x[*idx];
2822             sum1 -= *v1 * tmp0;
2823           }
2824           t[row]   = sum1;
2825           x[row++] = sum1*(*ibdiag++);
2826           break;
2827         case 2:
2828           v2   = a->a + ii[row+1];
2829           sum1 = b[row];
2830           sum2 = b[row+1];
2831           for (n = 0; n<sz-1; n+=2) {
2832             i1    = idx[0];
2833             i2    = idx[1];
2834             idx  += 2;
2835             tmp0  = x[i1];
2836             tmp1  = x[i2];
2837             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2838             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2839           }
2840 
2841           if (n == sz-1) {
2842             tmp0  = x[*idx];
2843             sum1 -= v1[0] * tmp0;
2844             sum2 -= v2[0] * tmp0;
2845           }
2846           t[row]   = sum1;
2847           t[row+1] = sum2;
2848           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2849           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2850           ibdiag  += 4;
2851           break;
2852         case 3:
2853           v2   = a->a + ii[row+1];
2854           v3   = a->a + ii[row+2];
2855           sum1 = b[row];
2856           sum2 = b[row+1];
2857           sum3 = b[row+2];
2858           for (n = 0; n<sz-1; n+=2) {
2859             i1    = idx[0];
2860             i2    = idx[1];
2861             idx  += 2;
2862             tmp0  = x[i1];
2863             tmp1  = x[i2];
2864             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2865             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2866             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2867           }
2868 
2869           if (n == sz-1) {
2870             tmp0  = x[*idx];
2871             sum1 -= v1[0] * tmp0;
2872             sum2 -= v2[0] * tmp0;
2873             sum3 -= v3[0] * tmp0;
2874           }
2875           t[row]   = sum1;
2876           t[row+1] = sum2;
2877           t[row+2] = sum3;
2878           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2879           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2880           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2881           ibdiag  += 9;
2882           break;
2883         case 4:
2884           v2   = a->a + ii[row+1];
2885           v3   = a->a + ii[row+2];
2886           v4   = a->a + ii[row+3];
2887           sum1 = b[row];
2888           sum2 = b[row+1];
2889           sum3 = b[row+2];
2890           sum4 = b[row+3];
2891           for (n = 0; n<sz-1; n+=2) {
2892             i1    = idx[0];
2893             i2    = idx[1];
2894             idx  += 2;
2895             tmp0  = x[i1];
2896             tmp1  = x[i2];
2897             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2898             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2899             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2900             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2901           }
2902 
2903           if (n == sz-1) {
2904             tmp0  = x[*idx];
2905             sum1 -= v1[0] * tmp0;
2906             sum2 -= v2[0] * tmp0;
2907             sum3 -= v3[0] * tmp0;
2908             sum4 -= v4[0] * tmp0;
2909           }
2910           t[row]   = sum1;
2911           t[row+1] = sum2;
2912           t[row+2] = sum3;
2913           t[row+3] = sum4;
2914           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2915           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2916           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2917           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2918           ibdiag  += 16;
2919           break;
2920         case 5:
2921           v2   = a->a + ii[row+1];
2922           v3   = a->a + ii[row+2];
2923           v4   = a->a + ii[row+3];
2924           v5   = a->a + ii[row+4];
2925           sum1 = b[row];
2926           sum2 = b[row+1];
2927           sum3 = b[row+2];
2928           sum4 = b[row+3];
2929           sum5 = b[row+4];
2930           for (n = 0; n<sz-1; n+=2) {
2931             i1    = idx[0];
2932             i2    = idx[1];
2933             idx  += 2;
2934             tmp0  = x[i1];
2935             tmp1  = x[i2];
2936             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2937             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2938             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2939             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2940             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2941           }
2942 
2943           if (n == sz-1) {
2944             tmp0  = x[*idx];
2945             sum1 -= v1[0] * tmp0;
2946             sum2 -= v2[0] * tmp0;
2947             sum3 -= v3[0] * tmp0;
2948             sum4 -= v4[0] * tmp0;
2949             sum5 -= v5[0] * tmp0;
2950           }
2951           t[row]   = sum1;
2952           t[row+1] = sum2;
2953           t[row+2] = sum3;
2954           t[row+3] = sum4;
2955           t[row+4] = sum5;
2956           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2957           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2958           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2959           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2960           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2961           ibdiag  += 25;
2962           break;
2963         default:
2964           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2965         }
2966       }
2967 
2968       xb   = t;
2969       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2970     } else xb = b;
2971     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2972 
2973       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2974       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2975         ibdiag -= sizes[i]*sizes[i];
2976         sz      = ii[row+1] - diag[row] - 1;
2977         v1      = a->a + diag[row] + 1;
2978         idx     = a->j + diag[row] + 1;
2979 
2980         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2981         switch (sizes[i]) {
2982         case 1:
2983 
2984           sum1 = xb[row];
2985           for (n = 0; n<sz-1; n+=2) {
2986             i1    = idx[0];
2987             i2    = idx[1];
2988             idx  += 2;
2989             tmp0  = x[i1];
2990             tmp1  = x[i2];
2991             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2992           }
2993 
2994           if (n == sz-1) {
2995             tmp0  = x[*idx];
2996             sum1 -= *v1*tmp0;
2997           }
2998           x[row--] = sum1*(*ibdiag);
2999           break;
3000 
3001         case 2:
3002 
3003           sum1 = xb[row];
3004           sum2 = xb[row-1];
3005           /* note that sum1 is associated with the second of the two rows */
3006           v2 = a->a + diag[row-1] + 2;
3007           for (n = 0; n<sz-1; n+=2) {
3008             i1    = idx[0];
3009             i2    = idx[1];
3010             idx  += 2;
3011             tmp0  = x[i1];
3012             tmp1  = x[i2];
3013             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3014             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3015           }
3016 
3017           if (n == sz-1) {
3018             tmp0  = x[*idx];
3019             sum1 -= *v1*tmp0;
3020             sum2 -= *v2*tmp0;
3021           }
3022           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3023           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3024           break;
3025         case 3:
3026 
3027           sum1 = xb[row];
3028           sum2 = xb[row-1];
3029           sum3 = xb[row-2];
3030           v2   = a->a + diag[row-1] + 2;
3031           v3   = a->a + diag[row-2] + 3;
3032           for (n = 0; n<sz-1; n+=2) {
3033             i1    = idx[0];
3034             i2    = idx[1];
3035             idx  += 2;
3036             tmp0  = x[i1];
3037             tmp1  = x[i2];
3038             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3039             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3040             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3041           }
3042 
3043           if (n == sz-1) {
3044             tmp0  = x[*idx];
3045             sum1 -= *v1*tmp0;
3046             sum2 -= *v2*tmp0;
3047             sum3 -= *v3*tmp0;
3048           }
3049           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3050           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3051           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3052           break;
3053         case 4:
3054 
3055           sum1 = xb[row];
3056           sum2 = xb[row-1];
3057           sum3 = xb[row-2];
3058           sum4 = xb[row-3];
3059           v2   = a->a + diag[row-1] + 2;
3060           v3   = a->a + diag[row-2] + 3;
3061           v4   = a->a + diag[row-3] + 4;
3062           for (n = 0; n<sz-1; n+=2) {
3063             i1    = idx[0];
3064             i2    = idx[1];
3065             idx  += 2;
3066             tmp0  = x[i1];
3067             tmp1  = x[i2];
3068             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3069             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3070             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3071             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3072           }
3073 
3074           if (n == sz-1) {
3075             tmp0  = x[*idx];
3076             sum1 -= *v1*tmp0;
3077             sum2 -= *v2*tmp0;
3078             sum3 -= *v3*tmp0;
3079             sum4 -= *v4*tmp0;
3080           }
3081           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3082           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3083           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3084           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3085           break;
3086         case 5:
3087 
3088           sum1 = xb[row];
3089           sum2 = xb[row-1];
3090           sum3 = xb[row-2];
3091           sum4 = xb[row-3];
3092           sum5 = xb[row-4];
3093           v2   = a->a + diag[row-1] + 2;
3094           v3   = a->a + diag[row-2] + 3;
3095           v4   = a->a + diag[row-3] + 4;
3096           v5   = a->a + diag[row-4] + 5;
3097           for (n = 0; n<sz-1; n+=2) {
3098             i1    = idx[0];
3099             i2    = idx[1];
3100             idx  += 2;
3101             tmp0  = x[i1];
3102             tmp1  = x[i2];
3103             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3104             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3105             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3106             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3107             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3108           }
3109 
3110           if (n == sz-1) {
3111             tmp0  = x[*idx];
3112             sum1 -= *v1*tmp0;
3113             sum2 -= *v2*tmp0;
3114             sum3 -= *v3*tmp0;
3115             sum4 -= *v4*tmp0;
3116             sum5 -= *v5*tmp0;
3117           }
3118           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3119           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3120           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3121           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3122           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3123           break;
3124         default:
3125           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3126         }
3127       }
3128 
3129       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3130     }
3131     its--;
3132   }
3133   while (its--) {
3134 
3135     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3136       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3137            i<m;
3138            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3139 
3140         sz  = diag[row] - ii[row];
3141         v1  = a->a + ii[row];
3142         idx = a->j + ii[row];
3143         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3144         switch (sizes[i]) {
3145         case 1:
3146           sum1 = b[row];
3147           for (n = 0; n<sz-1; n+=2) {
3148             i1    = idx[0];
3149             i2    = idx[1];
3150             idx  += 2;
3151             tmp0  = x[i1];
3152             tmp1  = x[i2];
3153             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3154           }
3155           if (n == sz-1) {
3156             tmp0  = x[*idx++];
3157             sum1 -= *v1 * tmp0;
3158             v1++;
3159           }
3160           t[row]   = sum1;
3161           sz      = ii[row+1] - diag[row] - 1;
3162           idx     = a->j + diag[row] + 1;
3163           v1 += 1;
3164           for (n = 0; n<sz-1; n+=2) {
3165             i1    = idx[0];
3166             i2    = idx[1];
3167             idx  += 2;
3168             tmp0  = x[i1];
3169             tmp1  = x[i2];
3170             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3171           }
3172           if (n == sz-1) {
3173             tmp0  = x[*idx++];
3174             sum1 -= *v1 * tmp0;
3175           }
3176           /* in MatSOR_SeqAIJ this line would be
3177            *
3178            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3179            *
3180            * but omega == 1, so this becomes
3181            *
3182            * x[row] = sum1*(*ibdiag++);
3183            *
3184            */
3185           x[row] = sum1*(*ibdiag);
3186           break;
3187         case 2:
3188           v2   = a->a + ii[row+1];
3189           sum1 = b[row];
3190           sum2 = b[row+1];
3191           for (n = 0; n<sz-1; n+=2) {
3192             i1    = idx[0];
3193             i2    = idx[1];
3194             idx  += 2;
3195             tmp0  = x[i1];
3196             tmp1  = x[i2];
3197             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3198             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3199           }
3200           if (n == sz-1) {
3201             tmp0  = x[*idx++];
3202             sum1 -= v1[0] * tmp0;
3203             sum2 -= v2[0] * tmp0;
3204             v1++; v2++;
3205           }
3206           t[row]   = sum1;
3207           t[row+1] = sum2;
3208           sz      = ii[row+1] - diag[row] - 2;
3209           idx     = a->j + diag[row] + 2;
3210           v1 += 2;
3211           v2 += 2;
3212           for (n = 0; n<sz-1; n+=2) {
3213             i1    = idx[0];
3214             i2    = idx[1];
3215             idx  += 2;
3216             tmp0  = x[i1];
3217             tmp1  = x[i2];
3218             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3219             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3220           }
3221           if (n == sz-1) {
3222             tmp0  = x[*idx];
3223             sum1 -= v1[0] * tmp0;
3224             sum2 -= v2[0] * tmp0;
3225           }
3226           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3227           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3228           break;
3229         case 3:
3230           v2   = a->a + ii[row+1];
3231           v3   = a->a + ii[row+2];
3232           sum1 = b[row];
3233           sum2 = b[row+1];
3234           sum3 = b[row+2];
3235           for (n = 0; n<sz-1; n+=2) {
3236             i1    = idx[0];
3237             i2    = idx[1];
3238             idx  += 2;
3239             tmp0  = x[i1];
3240             tmp1  = x[i2];
3241             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3242             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3243             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3244           }
3245           if (n == sz-1) {
3246             tmp0  = x[*idx++];
3247             sum1 -= v1[0] * tmp0;
3248             sum2 -= v2[0] * tmp0;
3249             sum3 -= v3[0] * tmp0;
3250             v1++; v2++; v3++;
3251           }
3252           t[row]   = sum1;
3253           t[row+1] = sum2;
3254           t[row+2] = sum3;
3255           sz      = ii[row+1] - diag[row] - 3;
3256           idx     = a->j + diag[row] + 3;
3257           v1 += 3;
3258           v2 += 3;
3259           v3 += 3;
3260           for (n = 0; n<sz-1; n+=2) {
3261             i1    = idx[0];
3262             i2    = idx[1];
3263             idx  += 2;
3264             tmp0  = x[i1];
3265             tmp1  = x[i2];
3266             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3267             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3268             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3269           }
3270           if (n == sz-1) {
3271             tmp0  = x[*idx];
3272             sum1 -= v1[0] * tmp0;
3273             sum2 -= v2[0] * tmp0;
3274             sum3 -= v3[0] * tmp0;
3275           }
3276           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3277           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3278           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3279           break;
3280         case 4:
3281           v2   = a->a + ii[row+1];
3282           v3   = a->a + ii[row+2];
3283           v4   = a->a + ii[row+3];
3284           sum1 = b[row];
3285           sum2 = b[row+1];
3286           sum3 = b[row+2];
3287           sum4 = b[row+3];
3288           for (n = 0; n<sz-1; n+=2) {
3289             i1    = idx[0];
3290             i2    = idx[1];
3291             idx  += 2;
3292             tmp0  = x[i1];
3293             tmp1  = x[i2];
3294             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3295             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3296             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3297             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3298           }
3299           if (n == sz-1) {
3300             tmp0  = x[*idx++];
3301             sum1 -= v1[0] * tmp0;
3302             sum2 -= v2[0] * tmp0;
3303             sum3 -= v3[0] * tmp0;
3304             sum4 -= v4[0] * tmp0;
3305             v1++; v2++; v3++; v4++;
3306           }
3307           t[row]   = sum1;
3308           t[row+1] = sum2;
3309           t[row+2] = sum3;
3310           t[row+3] = sum4;
3311           sz      = ii[row+1] - diag[row] - 4;
3312           idx     = a->j + diag[row] + 4;
3313           v1 += 4;
3314           v2 += 4;
3315           v3 += 4;
3316           v4 += 4;
3317           for (n = 0; n<sz-1; n+=2) {
3318             i1    = idx[0];
3319             i2    = idx[1];
3320             idx  += 2;
3321             tmp0  = x[i1];
3322             tmp1  = x[i2];
3323             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3324             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3325             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3326             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3327           }
3328           if (n == sz-1) {
3329             tmp0  = x[*idx];
3330             sum1 -= v1[0] * tmp0;
3331             sum2 -= v2[0] * tmp0;
3332             sum3 -= v3[0] * tmp0;
3333             sum4 -= v4[0] * tmp0;
3334           }
3335           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3336           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3337           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3338           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3339           break;
3340         case 5:
3341           v2   = a->a + ii[row+1];
3342           v3   = a->a + ii[row+2];
3343           v4   = a->a + ii[row+3];
3344           v5   = a->a + ii[row+4];
3345           sum1 = b[row];
3346           sum2 = b[row+1];
3347           sum3 = b[row+2];
3348           sum4 = b[row+3];
3349           sum5 = b[row+4];
3350           for (n = 0; n<sz-1; n+=2) {
3351             i1    = idx[0];
3352             i2    = idx[1];
3353             idx  += 2;
3354             tmp0  = x[i1];
3355             tmp1  = x[i2];
3356             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3357             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3358             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3359             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3360             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3361           }
3362           if (n == sz-1) {
3363             tmp0  = x[*idx++];
3364             sum1 -= v1[0] * tmp0;
3365             sum2 -= v2[0] * tmp0;
3366             sum3 -= v3[0] * tmp0;
3367             sum4 -= v4[0] * tmp0;
3368             sum5 -= v5[0] * tmp0;
3369             v1++; v2++; v3++; v4++; v5++;
3370           }
3371           t[row]   = sum1;
3372           t[row+1] = sum2;
3373           t[row+2] = sum3;
3374           t[row+3] = sum4;
3375           t[row+4] = sum5;
3376           sz      = ii[row+1] - diag[row] - 5;
3377           idx     = a->j + diag[row] + 5;
3378           v1 += 5;
3379           v2 += 5;
3380           v3 += 5;
3381           v4 += 5;
3382           v5 += 5;
3383           for (n = 0; n<sz-1; n+=2) {
3384             i1    = idx[0];
3385             i2    = idx[1];
3386             idx  += 2;
3387             tmp0  = x[i1];
3388             tmp1  = x[i2];
3389             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3390             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3391             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3392             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3393             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3394           }
3395           if (n == sz-1) {
3396             tmp0  = x[*idx];
3397             sum1 -= v1[0] * tmp0;
3398             sum2 -= v2[0] * tmp0;
3399             sum3 -= v3[0] * tmp0;
3400             sum4 -= v4[0] * tmp0;
3401             sum5 -= v5[0] * tmp0;
3402           }
3403           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3404           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3405           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3406           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3407           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3408           break;
3409         default:
3410           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3411         }
3412       }
3413       xb   = t;
3414       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3415     } else xb = b;
3416 
3417     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3418 
3419       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3420       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3421         ibdiag -= sizes[i]*sizes[i];
3422 
3423         /* set RHS */
3424         if (xb == b) {
3425           /* whole (old way) */
3426           sz      = ii[row+1] - ii[row];
3427           idx     = a->j + ii[row];
3428           switch (sizes[i]) {
3429           case 5:
3430             v5      = a->a + ii[row-4];
3431           case 4: /* fall through */
3432             v4      = a->a + ii[row-3];
3433           case 3:
3434             v3      = a->a + ii[row-2];
3435           case 2:
3436             v2      = a->a + ii[row-1];
3437           case 1:
3438             v1      = a->a + ii[row];
3439             break;
3440           default:
3441             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3442           }
3443         } else {
3444           /* upper, no diag */
3445           sz      = ii[row+1] - diag[row] - 1;
3446           idx     = a->j + diag[row] + 1;
3447           switch (sizes[i]) {
3448           case 5:
3449             v5      = a->a + diag[row-4] + 5;
3450           case 4: /* fall through */
3451             v4      = a->a + diag[row-3] + 4;
3452           case 3:
3453             v3      = a->a + diag[row-2] + 3;
3454           case 2:
3455             v2      = a->a + diag[row-1] + 2;
3456           case 1:
3457             v1      = a->a + diag[row] + 1;
3458           }
3459         }
3460         /* set sum */
3461         switch (sizes[i]) {
3462         case 5:
3463           sum5 = xb[row-4];
3464         case 4: /* fall through */
3465           sum4 = xb[row-3];
3466         case 3:
3467           sum3 = xb[row-2];
3468         case 2:
3469           sum2 = xb[row-1];
3470         case 1:
3471           /* note that sum1 is associated with the last row */
3472           sum1 = xb[row];
3473         }
3474         /* do sums */
3475         for (n = 0; n<sz-1; n+=2) {
3476             i1    = idx[0];
3477             i2    = idx[1];
3478             idx  += 2;
3479             tmp0  = x[i1];
3480             tmp1  = x[i2];
3481             switch (sizes[i]) {
3482             case 5:
3483               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3484             case 4: /* fall through */
3485               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3486             case 3:
3487               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3488             case 2:
3489               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3490             case 1:
3491               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3492             }
3493         }
3494         /* ragged edge */
3495         if (n == sz-1) {
3496           tmp0  = x[*idx];
3497           switch (sizes[i]) {
3498           case 5:
3499             sum5 -= *v5*tmp0;
3500           case 4: /* fall through */
3501             sum4 -= *v4*tmp0;
3502           case 3:
3503             sum3 -= *v3*tmp0;
3504           case 2:
3505             sum2 -= *v2*tmp0;
3506           case 1:
3507             sum1 -= *v1*tmp0;
3508           }
3509         }
3510         /* update */
3511         if (xb == b) {
3512           /* whole (old way) w/ diag */
3513           switch (sizes[i]) {
3514           case 5:
3515             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3516             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3517             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3518             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3519             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3520             break;
3521           case 4:
3522             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3523             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3524             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3525             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3526             break;
3527           case 3:
3528             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3529             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3530             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3531             break;
3532           case 2:
3533             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3534             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3535             break;
3536           case 1:
3537             x[row--] += sum1*(*ibdiag);
3538             break;
3539           }
3540         } else {
3541           /* no diag so set =  */
3542           switch (sizes[i]) {
3543           case 5:
3544             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3545             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3546             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3547             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3548             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3549             break;
3550           case 4:
3551             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3552             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3553             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3554             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3555             break;
3556           case 3:
3557             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3558             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3559             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3560             break;
3561           case 2:
3562             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3563             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3564             break;
3565           case 1:
3566             x[row--] = sum1*(*ibdiag);
3567             break;
3568           }
3569         }
3570       }
3571       if (xb == b) {
3572         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3573       } else {
3574         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3575       }
3576     }
3577   }
3578   if (flag & SOR_EISENSTAT) {
3579     /*
3580           Apply  (U + D)^-1  where D is now the block diagonal
3581     */
3582     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3583     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3584       ibdiag -= sizes[i]*sizes[i];
3585       sz      = ii[row+1] - diag[row] - 1;
3586       v1      = a->a + diag[row] + 1;
3587       idx     = a->j + diag[row] + 1;
3588       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3589       switch (sizes[i]) {
3590       case 1:
3591 
3592         sum1 = b[row];
3593         for (n = 0; n<sz-1; n+=2) {
3594           i1    = idx[0];
3595           i2    = idx[1];
3596           idx  += 2;
3597           tmp0  = x[i1];
3598           tmp1  = x[i2];
3599           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3600         }
3601 
3602         if (n == sz-1) {
3603           tmp0  = x[*idx];
3604           sum1 -= *v1*tmp0;
3605         }
3606         x[row] = sum1*(*ibdiag);row--;
3607         break;
3608 
3609       case 2:
3610 
3611         sum1 = b[row];
3612         sum2 = b[row-1];
3613         /* note that sum1 is associated with the second of the two rows */
3614         v2 = a->a + diag[row-1] + 2;
3615         for (n = 0; n<sz-1; n+=2) {
3616           i1    = idx[0];
3617           i2    = idx[1];
3618           idx  += 2;
3619           tmp0  = x[i1];
3620           tmp1  = x[i2];
3621           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3622           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3623         }
3624 
3625         if (n == sz-1) {
3626           tmp0  = x[*idx];
3627           sum1 -= *v1*tmp0;
3628           sum2 -= *v2*tmp0;
3629         }
3630         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3631         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3632         row     -= 2;
3633         break;
3634       case 3:
3635 
3636         sum1 = b[row];
3637         sum2 = b[row-1];
3638         sum3 = b[row-2];
3639         v2   = a->a + diag[row-1] + 2;
3640         v3   = a->a + diag[row-2] + 3;
3641         for (n = 0; n<sz-1; n+=2) {
3642           i1    = idx[0];
3643           i2    = idx[1];
3644           idx  += 2;
3645           tmp0  = x[i1];
3646           tmp1  = x[i2];
3647           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3648           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3649           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3650         }
3651 
3652         if (n == sz-1) {
3653           tmp0  = x[*idx];
3654           sum1 -= *v1*tmp0;
3655           sum2 -= *v2*tmp0;
3656           sum3 -= *v3*tmp0;
3657         }
3658         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3659         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3660         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3661         row     -= 3;
3662         break;
3663       case 4:
3664 
3665         sum1 = b[row];
3666         sum2 = b[row-1];
3667         sum3 = b[row-2];
3668         sum4 = b[row-3];
3669         v2   = a->a + diag[row-1] + 2;
3670         v3   = a->a + diag[row-2] + 3;
3671         v4   = a->a + diag[row-3] + 4;
3672         for (n = 0; n<sz-1; n+=2) {
3673           i1    = idx[0];
3674           i2    = idx[1];
3675           idx  += 2;
3676           tmp0  = x[i1];
3677           tmp1  = x[i2];
3678           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3679           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3680           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3681           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3682         }
3683 
3684         if (n == sz-1) {
3685           tmp0  = x[*idx];
3686           sum1 -= *v1*tmp0;
3687           sum2 -= *v2*tmp0;
3688           sum3 -= *v3*tmp0;
3689           sum4 -= *v4*tmp0;
3690         }
3691         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3692         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3693         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3694         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3695         row     -= 4;
3696         break;
3697       case 5:
3698 
3699         sum1 = b[row];
3700         sum2 = b[row-1];
3701         sum3 = b[row-2];
3702         sum4 = b[row-3];
3703         sum5 = b[row-4];
3704         v2   = a->a + diag[row-1] + 2;
3705         v3   = a->a + diag[row-2] + 3;
3706         v4   = a->a + diag[row-3] + 4;
3707         v5   = a->a + diag[row-4] + 5;
3708         for (n = 0; n<sz-1; n+=2) {
3709           i1    = idx[0];
3710           i2    = idx[1];
3711           idx  += 2;
3712           tmp0  = x[i1];
3713           tmp1  = x[i2];
3714           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3715           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3716           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3717           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3718           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3719         }
3720 
3721         if (n == sz-1) {
3722           tmp0  = x[*idx];
3723           sum1 -= *v1*tmp0;
3724           sum2 -= *v2*tmp0;
3725           sum3 -= *v3*tmp0;
3726           sum4 -= *v4*tmp0;
3727           sum5 -= *v5*tmp0;
3728         }
3729         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3730         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3731         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3732         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3733         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3734         row     -= 5;
3735         break;
3736       default:
3737         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3738       }
3739     }
3740     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3741 
3742     /*
3743            t = b - D x    where D is the block diagonal
3744     */
3745     cnt = 0;
3746     for (i=0, row=0; i<m; i++) {
3747       switch (sizes[i]) {
3748       case 1:
3749         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3750         break;
3751       case 2:
3752         x1       = x[row]; x2 = x[row+1];
3753         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3754         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3755         t[row]   = b[row] - tmp1;
3756         t[row+1] = b[row+1] - tmp2; row += 2;
3757         cnt     += 4;
3758         break;
3759       case 3:
3760         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3761         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3762         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3763         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3764         t[row]   = b[row] - tmp1;
3765         t[row+1] = b[row+1] - tmp2;
3766         t[row+2] = b[row+2] - tmp3; row += 3;
3767         cnt     += 9;
3768         break;
3769       case 4:
3770         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3771         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3772         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3773         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3774         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3775         t[row]   = b[row] - tmp1;
3776         t[row+1] = b[row+1] - tmp2;
3777         t[row+2] = b[row+2] - tmp3;
3778         t[row+3] = b[row+3] - tmp4; row += 4;
3779         cnt     += 16;
3780         break;
3781       case 5:
3782         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3783         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3784         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3785         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3786         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3787         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3788         t[row]   = b[row] - tmp1;
3789         t[row+1] = b[row+1] - tmp2;
3790         t[row+2] = b[row+2] - tmp3;
3791         t[row+3] = b[row+3] - tmp4;
3792         t[row+4] = b[row+4] - tmp5;row += 5;
3793         cnt     += 25;
3794         break;
3795       default:
3796         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3797       }
3798     }
3799     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3800 
3801 
3802 
3803     /*
3804           Apply (L + D)^-1 where D is the block diagonal
3805     */
3806     for (i=0, row=0; i<m; i++) {
3807       sz  = diag[row] - ii[row];
3808       v1  = a->a + ii[row];
3809       idx = a->j + ii[row];
3810       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3811       switch (sizes[i]) {
3812       case 1:
3813 
3814         sum1 = t[row];
3815         for (n = 0; n<sz-1; n+=2) {
3816           i1    = idx[0];
3817           i2    = idx[1];
3818           idx  += 2;
3819           tmp0  = t[i1];
3820           tmp1  = t[i2];
3821           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3822         }
3823 
3824         if (n == sz-1) {
3825           tmp0  = t[*idx];
3826           sum1 -= *v1 * tmp0;
3827         }
3828         x[row] += t[row] = sum1*(*ibdiag++); row++;
3829         break;
3830       case 2:
3831         v2   = a->a + ii[row+1];
3832         sum1 = t[row];
3833         sum2 = t[row+1];
3834         for (n = 0; n<sz-1; n+=2) {
3835           i1    = idx[0];
3836           i2    = idx[1];
3837           idx  += 2;
3838           tmp0  = t[i1];
3839           tmp1  = t[i2];
3840           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3841           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3842         }
3843 
3844         if (n == sz-1) {
3845           tmp0  = t[*idx];
3846           sum1 -= v1[0] * tmp0;
3847           sum2 -= v2[0] * tmp0;
3848         }
3849         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3850         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3851         ibdiag   += 4; row += 2;
3852         break;
3853       case 3:
3854         v2   = a->a + ii[row+1];
3855         v3   = a->a + ii[row+2];
3856         sum1 = t[row];
3857         sum2 = t[row+1];
3858         sum3 = t[row+2];
3859         for (n = 0; n<sz-1; n+=2) {
3860           i1    = idx[0];
3861           i2    = idx[1];
3862           idx  += 2;
3863           tmp0  = t[i1];
3864           tmp1  = t[i2];
3865           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3866           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3867           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3868         }
3869 
3870         if (n == sz-1) {
3871           tmp0  = t[*idx];
3872           sum1 -= v1[0] * tmp0;
3873           sum2 -= v2[0] * tmp0;
3874           sum3 -= v3[0] * tmp0;
3875         }
3876         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3877         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3878         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3879         ibdiag   += 9; row += 3;
3880         break;
3881       case 4:
3882         v2   = a->a + ii[row+1];
3883         v3   = a->a + ii[row+2];
3884         v4   = a->a + ii[row+3];
3885         sum1 = t[row];
3886         sum2 = t[row+1];
3887         sum3 = t[row+2];
3888         sum4 = t[row+3];
3889         for (n = 0; n<sz-1; n+=2) {
3890           i1    = idx[0];
3891           i2    = idx[1];
3892           idx  += 2;
3893           tmp0  = t[i1];
3894           tmp1  = t[i2];
3895           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3896           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3897           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3898           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3899         }
3900 
3901         if (n == sz-1) {
3902           tmp0  = t[*idx];
3903           sum1 -= v1[0] * tmp0;
3904           sum2 -= v2[0] * tmp0;
3905           sum3 -= v3[0] * tmp0;
3906           sum4 -= v4[0] * tmp0;
3907         }
3908         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3909         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3910         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3911         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3912         ibdiag   += 16; row += 4;
3913         break;
3914       case 5:
3915         v2   = a->a + ii[row+1];
3916         v3   = a->a + ii[row+2];
3917         v4   = a->a + ii[row+3];
3918         v5   = a->a + ii[row+4];
3919         sum1 = t[row];
3920         sum2 = t[row+1];
3921         sum3 = t[row+2];
3922         sum4 = t[row+3];
3923         sum5 = t[row+4];
3924         for (n = 0; n<sz-1; n+=2) {
3925           i1    = idx[0];
3926           i2    = idx[1];
3927           idx  += 2;
3928           tmp0  = t[i1];
3929           tmp1  = t[i2];
3930           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3931           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3932           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3933           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3934           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3935         }
3936 
3937         if (n == sz-1) {
3938           tmp0  = t[*idx];
3939           sum1 -= v1[0] * tmp0;
3940           sum2 -= v2[0] * tmp0;
3941           sum3 -= v3[0] * tmp0;
3942           sum4 -= v4[0] * tmp0;
3943           sum5 -= v5[0] * tmp0;
3944         }
3945         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3946         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3947         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3948         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3949         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3950         ibdiag   += 25; row += 5;
3951         break;
3952       default:
3953         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3954       }
3955     }
3956     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3957   }
3958   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3959   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3960   PetscFunctionReturn(0);
3961 }
3962 
3963 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3964 {
3965   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3966   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3967   const MatScalar   *bdiag = a->inode.bdiag;
3968   const PetscScalar *b;
3969   PetscErrorCode    ierr;
3970   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3971   const PetscInt    *sizes = a->inode.size;
3972 
3973   PetscFunctionBegin;
3974   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3975   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3976   cnt  = 0;
3977   for (i=0, row=0; i<m; i++) {
3978     switch (sizes[i]) {
3979     case 1:
3980       x[row] = b[row]*bdiag[cnt++];row++;
3981       break;
3982     case 2:
3983       x1       = b[row]; x2 = b[row+1];
3984       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3985       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3986       x[row++] = tmp1;
3987       x[row++] = tmp2;
3988       cnt     += 4;
3989       break;
3990     case 3:
3991       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3992       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3993       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3994       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3995       x[row++] = tmp1;
3996       x[row++] = tmp2;
3997       x[row++] = tmp3;
3998       cnt     += 9;
3999       break;
4000     case 4:
4001       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4002       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4003       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4004       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4005       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4006       x[row++] = tmp1;
4007       x[row++] = tmp2;
4008       x[row++] = tmp3;
4009       x[row++] = tmp4;
4010       cnt     += 16;
4011       break;
4012     case 5:
4013       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4014       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4015       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4016       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4017       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4018       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4019       x[row++] = tmp1;
4020       x[row++] = tmp2;
4021       x[row++] = tmp3;
4022       x[row++] = tmp4;
4023       x[row++] = tmp5;
4024       cnt     += 25;
4025       break;
4026     default:
4027       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4028     }
4029   }
4030   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4031   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4032   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4033   PetscFunctionReturn(0);
4034 }
4035 
4036 /*
4037     samestructure indicates that the matrix has not changed its nonzero structure so we
4038     do not need to recompute the inodes
4039 */
4040 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4041 {
4042   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4043   PetscErrorCode ierr;
4044   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4045   PetscBool      flag;
4046   const PetscInt *idx,*idy,*ii;
4047 
4048   PetscFunctionBegin;
4049   if (!a->inode.use) PetscFunctionReturn(0);
4050   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4051 
4052   m = A->rmap->n;
4053   if (a->inode.size) ns = a->inode.size;
4054   else {
4055     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4056   }
4057 
4058   i          = 0;
4059   node_count = 0;
4060   idx        = a->j;
4061   ii         = a->i;
4062   while (i < m) {                /* For each row */
4063     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4064     /* Limits the number of elements in a node to 'a->inode.limit' */
4065     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4066       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4067       if (nzy != nzx) break;
4068       idy += nzx;              /* Same nonzero pattern */
4069       ierr = PetscArraycmp(idx,idy,nzx,&flag);CHKERRQ(ierr);
4070       if (!flag) break;
4071     }
4072     ns[node_count++] = blk_size;
4073     idx             += blk_size*nzx;
4074     i                = j;
4075   }
4076 
4077   {
4078     PetscBool is_cudatype;
4079     ierr = PetscObjectTypeCompareAny((PetscObject)A,&is_cudatype, MATAIJCUSPARSE, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MATAIJVIENNACL, MATSEQAIJVIENNACL, MATMPIAIJVIENNACL,"");CHKERRQ(ierr);
4080     if (is_cudatype) {
4081       ierr = PetscInfo(A,"Not using Inode routines on GPU matrix\n");CHKERRQ(ierr);
4082       ierr = PetscFree(ns);CHKERRQ(ierr);
4083       a->inode.node_count       = 0;
4084       a->inode.size             = NULL;
4085       a->inode.use              = PETSC_FALSE;
4086       a->inode.checked          = PETSC_TRUE;
4087       a->inode.mat_nonzerostate = A->nonzerostate;
4088       PetscFunctionReturn(0);
4089     }
4090   }
4091 
4092   /* If not enough inodes found,, do not use inode version of the routines */
4093   if (!m || node_count > .8*m) {
4094     ierr = PetscFree(ns);CHKERRQ(ierr);
4095 
4096     a->inode.node_count       = 0;
4097     a->inode.size             = NULL;
4098     a->inode.use              = PETSC_FALSE;
4099     A->ops->mult              = MatMult_SeqAIJ;
4100     A->ops->sor               = MatSOR_SeqAIJ;
4101     A->ops->multadd           = MatMultAdd_SeqAIJ;
4102     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4103     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4104     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4105     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4106     A->ops->coloringpatch     = 0;
4107     A->ops->multdiagonalblock = 0;
4108 
4109     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4110   } else {
4111     if (!A->factortype) {
4112       A->ops->mult              = MatMult_SeqAIJ_Inode;
4113       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4114       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4115       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4116       if (A->rmap->n == A->cmap->n) {
4117         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4118         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4119         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4120         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4121         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4122       }
4123     } else {
4124       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4125     }
4126     a->inode.node_count = node_count;
4127     a->inode.size       = ns;
4128     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4129   }
4130   a->inode.checked          = PETSC_TRUE;
4131   a->inode.mat_nonzerostate = A->nonzerostate;
4132   PetscFunctionReturn(0);
4133 }
4134 
4135 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4136 {
4137   Mat            B =*C;
4138   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4139   PetscErrorCode ierr;
4140   PetscInt       m=A->rmap->n;
4141 
4142   PetscFunctionBegin;
4143   c->inode.use       = a->inode.use;
4144   c->inode.limit     = a->inode.limit;
4145   c->inode.max_limit = a->inode.max_limit;
4146   if (a->inode.size) {
4147     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4148     c->inode.node_count = a->inode.node_count;
4149     ierr                = PetscArraycpy(c->inode.size,a->inode.size,m+1);CHKERRQ(ierr);
4150     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4151     if (!B->factortype) {
4152       B->ops->mult              = MatMult_SeqAIJ_Inode;
4153       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4154       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4155       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4156       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4157       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4158       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4159       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4160       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4161     } else {
4162       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4163     }
4164   } else {
4165     c->inode.size       = 0;
4166     c->inode.node_count = 0;
4167   }
4168   c->inode.ibdiagvalid = PETSC_FALSE;
4169   c->inode.ibdiag      = 0;
4170   c->inode.bdiag       = 0;
4171   PetscFunctionReturn(0);
4172 }
4173 
4174 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4175 {
4176   PetscInt       k;
4177   const PetscInt *vi;
4178 
4179   PetscFunctionBegin;
4180   vi = aj + ai[row];
4181   for (k=0; k<nzl; k++) cols[k] = vi[k];
4182   vi        = aj + adiag[row];
4183   cols[nzl] = vi[0];
4184   vi        = aj + adiag[row+1]+1;
4185   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4186   PetscFunctionReturn(0);
4187 }
4188 /*
4189    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4190    Modified from MatSeqAIJCheckInode().
4191 
4192    Input Parameters:
4193 .  Mat A - ILU or LU matrix factor
4194 
4195 */
4196 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4197 {
4198   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4199   PetscErrorCode ierr;
4200   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4201   PetscInt       *cols1,*cols2,*ns;
4202   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4203   PetscBool      flag;
4204 
4205   PetscFunctionBegin;
4206   if (!a->inode.use)    PetscFunctionReturn(0);
4207   if (a->inode.checked) PetscFunctionReturn(0);
4208 
4209   m = A->rmap->n;
4210   if (a->inode.size) ns = a->inode.size;
4211   else {
4212     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4213   }
4214 
4215   i          = 0;
4216   node_count = 0;
4217   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4218   while (i < m) {                /* For each row */
4219     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4220     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4221     nzx  = nzl1 + nzu1 + 1;
4222     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4223 
4224     /* Limits the number of elements in a node to 'a->inode.limit' */
4225     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4226       nzl2 = ai[j+1] - ai[j];
4227       nzu2 = adiag[j] - adiag[j+1] - 1;
4228       nzy  = nzl2 + nzu2 + 1;
4229       if (nzy != nzx) break;
4230       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4231       ierr = PetscArraycmp(cols1,cols2,nzx,&flag);CHKERRQ(ierr);
4232       if (!flag) break;
4233     }
4234     ns[node_count++] = blk_size;
4235     i                = j;
4236   }
4237   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4238   /* If not enough inodes found,, do not use inode version of the routines */
4239   if (!m || node_count > .8*m) {
4240     ierr = PetscFree(ns);CHKERRQ(ierr);
4241 
4242     a->inode.node_count = 0;
4243     a->inode.size       = NULL;
4244     a->inode.use        = PETSC_FALSE;
4245 
4246     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4247   } else {
4248     A->ops->mult              = 0;
4249     A->ops->sor               = 0;
4250     A->ops->multadd           = 0;
4251     A->ops->getrowij          = 0;
4252     A->ops->restorerowij      = 0;
4253     A->ops->getcolumnij       = 0;
4254     A->ops->restorecolumnij   = 0;
4255     A->ops->coloringpatch     = 0;
4256     A->ops->multdiagonalblock = 0;
4257     a->inode.node_count       = node_count;
4258     a->inode.size             = ns;
4259 
4260     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4261   }
4262   a->inode.checked = PETSC_TRUE;
4263   PetscFunctionReturn(0);
4264 }
4265 
4266 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4267 {
4268   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4269 
4270   PetscFunctionBegin;
4271   a->inode.ibdiagvalid = PETSC_FALSE;
4272   PetscFunctionReturn(0);
4273 }
4274 
4275 /*
4276      This is really ugly. if inodes are used this replaces the
4277   permutations with ones that correspond to rows/cols of the matrix
4278   rather then inode blocks
4279 */
4280 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4281 {
4282   PetscErrorCode ierr;
4283 
4284   PetscFunctionBegin;
4285   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4286   PetscFunctionReturn(0);
4287 }
4288 
4289 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4290 {
4291   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4292   PetscErrorCode ierr;
4293   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4294   const PetscInt *ridx,*cidx;
4295   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4296   PetscInt       nslim_col,*ns_col;
4297   IS             ris = *rperm,cis = *cperm;
4298 
4299   PetscFunctionBegin;
4300   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4301   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4302 
4303   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4304   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4305   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4306 
4307   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4308   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4309 
4310   /* Form the inode structure for the rows of permuted matric using inv perm*/
4311   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4312 
4313   /* Construct the permutations for rows*/
4314   for (i=0,row = 0; i<nslim_row; ++i) {
4315     indx      = ridx[i];
4316     start_val = tns[indx];
4317     end_val   = tns[indx + 1];
4318     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4319   }
4320 
4321   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4322   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4323 
4324   /* Construct permutations for columns */
4325   for (i=0,col=0; i<nslim_col; ++i) {
4326     indx      = cidx[i];
4327     start_val = tns[indx];
4328     end_val   = tns[indx + 1];
4329     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4330   }
4331 
4332   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4333   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4334   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4335   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4336 
4337   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4338   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4339 
4340   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4341   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4342   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4343   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4344   ierr = PetscFree(tns);CHKERRQ(ierr);
4345   PetscFunctionReturn(0);
4346 }
4347 
4348 /*@C
4349    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4350 
4351    Not Collective
4352 
4353    Input Parameter:
4354 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4355 
4356    Output Parameter:
4357 +  node_count - no of inodes present in the matrix.
4358 .  sizes      - an array of size node_count,with sizes of each inode.
4359 -  limit      - the max size used to generate the inodes.
4360 
4361    Level: advanced
4362 
4363    Notes:
4364     This routine returns some internal storage information
4365    of the matrix, it is intended to be used by advanced users.
4366    It should be called after the matrix is assembled.
4367    The contents of the sizes[] array should not be changed.
4368    NULL may be passed for information not requested.
4369 
4370 .seealso: MatGetInfo()
4371 @*/
4372 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4373 {
4374   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4375 
4376   PetscFunctionBegin;
4377   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4378   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4379   if (f) {
4380     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4381   }
4382   PetscFunctionReturn(0);
4383 }
4384 
4385 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4386 {
4387   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4388 
4389   PetscFunctionBegin;
4390   if (node_count) *node_count = a->inode.node_count;
4391   if (sizes)      *sizes      = a->inode.size;
4392   if (limit)      *limit      = a->inode.limit;
4393   PetscFunctionReturn(0);
4394 }
4395