xref: /petsc/src/mat/impls/aij/seq/inode.c (revision e0877f539457ad1ce8178bc0750eac5ffa490a67)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1) {
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     col  = *j++ + ishift;
101     i2   = tvc[col];
102     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103       ia[i1+1]++;
104       ia[i2+1]++;
105       i2++;                     /* Start col of next node */
106       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107       i2 = tvc[col];
108     }
109     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110   }
111 
112   /* shift ia[i] to point to next row */
113   for (i1=1; i1<nslim_row+1; i1++) {
114     row        = ia[i1-1];
115     ia[i1]    += row;
116     work[i1-1] = row - oshift;
117   }
118 
119   /* allocate space for column pointers */
120   nz   = ia[nslim_row] + (!ishift);
121   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
122   *jja = ja;
123 
124   /* loop over lower triangular part putting into ja */
125   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126     j    = aj + ai[row] + ishift;
127     jmax = aj + ai[row+1] + ishift;
128     col  = *j++ + ishift;
129     i2   = tvc[col];
130     while (i2<i1 && j<jmax) {
131       ja[work[i2]++] = i1 + oshift;
132       ja[work[i1]++] = i2 + oshift;
133       ++i2;
134       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135       i2 = tvc[col];
136     }
137     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138 
139   }
140   ierr = PetscFree(work);CHKERRQ(ierr);
141   ierr = PetscFree(tns);CHKERRQ(ierr);
142   ierr = PetscFree(tvc);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*
147       This builds nonsymmetric version of nonzero structure,
148 */
149 #undef __FUNCT__
150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152 {
153   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
154   PetscErrorCode ierr;
155   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156   PetscInt       *tns,*tvc,nsz,i1,i2;
157   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
158 
159   PetscFunctionBegin;
160   nslim_row = a->inode.node_count;
161   n         = A->cmap->n;
162 
163   /* Create The column_inode for this matrix */
164   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
165 
166   /* allocate space for reformated column_inode structure */
167   ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr);
168   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
169   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
170 
171   for (i1=0,col=0; i1<nslim_col; ++i1) {
172     nsz = ns_col[i1];
173     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174   }
175   /* allocate space for row pointers */
176   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
177   *iia = ia;
178   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
179   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
180 
181   /* determine the number of columns in each row */
182   ia[0] = oshift;
183   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184     j   = aj + ai[row] + ishift;
185     col = *j++ + ishift;
186     i2  = tvc[col];
187     nz  = ai[row+1] - ai[row];
188     while (nz-- > 0) {           /* off-diagonal elemets */
189       ia[i1+1]++;
190       i2++;                     /* Start col of next node */
191       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192       if (nz > 0) i2 = tvc[col];
193     }
194   }
195 
196   /* shift ia[i] to point to next row */
197   for (i1=1; i1<nslim_row+1; i1++) {
198     row        = ia[i1-1];
199     ia[i1]    += row;
200     work[i1-1] = row - oshift;
201   }
202 
203   /* allocate space for column pointers */
204   nz   = ia[nslim_row] + (!ishift);
205   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
206   *jja = ja;
207 
208   /* loop over matrix putting into ja */
209   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210     j   = aj + ai[row] + ishift;
211     col = *j++ + ishift;
212     i2  = tvc[col];
213     nz  = ai[row+1] - ai[row];
214     while (nz-- > 0) {
215       ja[work[i1]++] = i2 + oshift;
216       ++i2;
217       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218       if (nz > 0) i2 = tvc[col];
219     }
220   }
221   ierr = PetscFree(ns_col);CHKERRQ(ierr);
222   ierr = PetscFree(work);CHKERRQ(ierr);
223   ierr = PetscFree(tns);CHKERRQ(ierr);
224   ierr = PetscFree(tvc);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231 {
232   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   *n = a->inode.node_count;
237   if (!ia) PetscFunctionReturn(0);
238   if (!blockcompressed) {
239     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
240   } else if (symmetric) {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   } else {
243     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   if (!ia) PetscFunctionReturn(0);
256 
257   if (!blockcompressed) {
258     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
259   } else {
260     ierr = PetscFree(*ia);CHKERRQ(ierr);
261     ierr = PetscFree(*ja);CHKERRQ(ierr);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /* ----------------------------------------------------------- */
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271 {
272   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
273   PetscErrorCode ierr;
274   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276 
277   PetscFunctionBegin;
278   nslim_row = a->inode.node_count;
279   n         = A->cmap->n;
280 
281   /* Create The column_inode for this matrix */
282   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
283 
284   /* allocate space for reformated column_inode structure */
285   ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr);
286   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
287   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
288 
289   for (i1=0,col=0; i1<nslim_col; ++i1) {
290     nsz = ns_col[i1];
291     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292   }
293   /* allocate space for column pointers */
294   ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
295   *iia = ia;
296   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
297   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
298 
299   /* determine the number of columns in each row */
300   ia[0] = oshift;
301   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302     j   = aj + ai[row] + ishift;
303     col = *j++ + ishift;
304     i2  = tvc[col];
305     nz  = ai[row+1] - ai[row];
306     while (nz-- > 0) {           /* off-diagonal elemets */
307       /* ia[i1+1]++; */
308       ia[i2+1]++;
309       i2++;
310       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311       if (nz > 0) i2 = tvc[col];
312     }
313   }
314 
315   /* shift ia[i] to point to next col */
316   for (i1=1; i1<nslim_col+1; i1++) {
317     col        = ia[i1-1];
318     ia[i1]    += col;
319     work[i1-1] = col - oshift;
320   }
321 
322   /* allocate space for column pointers */
323   nz   = ia[nslim_col] + (!ishift);
324   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
325   *jja = ja;
326 
327   /* loop over matrix putting into ja */
328   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329     j   = aj + ai[row] + ishift;
330     col = *j++ + ishift;
331     i2  = tvc[col];
332     nz  = ai[row+1] - ai[row];
333     while (nz-- > 0) {
334       /* ja[work[i1]++] = i2 + oshift; */
335       ja[work[i2]++] = i1 + oshift;
336       i2++;
337       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338       if (nz > 0) i2 = tvc[col];
339     }
340   }
341   ierr = PetscFree(ns_col);CHKERRQ(ierr);
342   ierr = PetscFree(work);CHKERRQ(ierr);
343   ierr = PetscFree(tns);CHKERRQ(ierr);
344   ierr = PetscFree(tvc);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
356   if (!ia) PetscFunctionReturn(0);
357 
358   if (!blockcompressed) {
359     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
360   } else if (symmetric) {
361     /* Since the indices are symmetric it does'nt matter */
362     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   } else {
364     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
365   }
366   PetscFunctionReturn(0);
367 }
368 
369 #undef __FUNCT__
370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   if (!ia) PetscFunctionReturn(0);
377   if (!blockcompressed) {
378     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
379   } else {
380     ierr = PetscFree(*ia);CHKERRQ(ierr);
381     ierr = PetscFree(*ja);CHKERRQ(ierr);
382   }
383   PetscFunctionReturn(0);
384 }
385 
386 /* ----------------------------------------------------------- */
387 
388 #undef __FUNCT__
389 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391 {
392   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394   PetscScalar       *y;
395   const PetscScalar *x;
396   const MatScalar   *v1,*v2,*v3,*v4,*v5;
397   PetscErrorCode    ierr;
398   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399   const PetscInt    *idx,*ns,*ii;
400 
401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403 #endif
404 
405   PetscFunctionBegin;
406   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407   node_max = a->inode.node_count;
408   ns       = a->inode.size;     /* Node Size array */
409   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
411   idx      = a->j;
412   v1       = a->a;
413   ii       = a->i;
414 
415   for (i = 0,row = 0; i< node_max; ++i) {
416     nsz         = ns[i];
417     n           = ii[1] - ii[0];
418     nonzerorow += (n>0)*nsz;
419     ii         += nsz;
420     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422     sz = n;                     /* No of non zeros in this row */
423                                 /* Switch on the size of Node */
424     switch (nsz) {               /* Each loop in 'case' is unrolled */
425     case 1:
426       sum1 = 0.;
427 
428       for (n = 0; n< sz-1; n+=2) {
429         i1    = idx[0];         /* The instructions are ordered to */
430         i2    = idx[1];         /* make the compiler's job easy */
431         idx  += 2;
432         tmp0  = x[i1];
433         tmp1  = x[i2];
434         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435       }
436 
437       if (n == sz-1) {          /* Take care of the last nonzero  */
438         tmp0  = x[*idx++];
439         sum1 += *v1++ *tmp0;
440       }
441       y[row++]=sum1;
442       break;
443     case 2:
444       sum1 = 0.;
445       sum2 = 0.;
446       v2   = v1 + n;
447 
448       for (n = 0; n< sz-1; n+=2) {
449         i1    = idx[0];
450         i2    = idx[1];
451         idx  += 2;
452         tmp0  = x[i1];
453         tmp1  = x[i2];
454         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456       }
457       if (n == sz-1) {
458         tmp0  = x[*idx++];
459         sum1 += *v1++ * tmp0;
460         sum2 += *v2++ * tmp0;
461       }
462       y[row++]=sum1;
463       y[row++]=sum2;
464       v1      =v2;              /* Since the next block to be processed starts there*/
465       idx    +=sz;
466       break;
467     case 3:
468       sum1 = 0.;
469       sum2 = 0.;
470       sum3 = 0.;
471       v2   = v1 + n;
472       v3   = v2 + n;
473 
474       for (n = 0; n< sz-1; n+=2) {
475         i1    = idx[0];
476         i2    = idx[1];
477         idx  += 2;
478         tmp0  = x[i1];
479         tmp1  = x[i2];
480         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483       }
484       if (n == sz-1) {
485         tmp0  = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489       }
490       y[row++]=sum1;
491       y[row++]=sum2;
492       y[row++]=sum3;
493       v1      =v3;              /* Since the next block to be processed starts there*/
494       idx    +=2*sz;
495       break;
496     case 4:
497       sum1 = 0.;
498       sum2 = 0.;
499       sum3 = 0.;
500       sum4 = 0.;
501       v2   = v1 + n;
502       v3   = v2 + n;
503       v4   = v3 + n;
504 
505       for (n = 0; n< sz-1; n+=2) {
506         i1    = idx[0];
507         i2    = idx[1];
508         idx  += 2;
509         tmp0  = x[i1];
510         tmp1  = x[i2];
511         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515       }
516       if (n == sz-1) {
517         tmp0  = x[*idx++];
518         sum1 += *v1++ * tmp0;
519         sum2 += *v2++ * tmp0;
520         sum3 += *v3++ * tmp0;
521         sum4 += *v4++ * tmp0;
522       }
523       y[row++]=sum1;
524       y[row++]=sum2;
525       y[row++]=sum3;
526       y[row++]=sum4;
527       v1      =v4;              /* Since the next block to be processed starts there*/
528       idx    +=3*sz;
529       break;
530     case 5:
531       sum1 = 0.;
532       sum2 = 0.;
533       sum3 = 0.;
534       sum4 = 0.;
535       sum5 = 0.;
536       v2   = v1 + n;
537       v3   = v2 + n;
538       v4   = v3 + n;
539       v5   = v4 + n;
540 
541       for (n = 0; n<sz-1; n+=2) {
542         i1    = idx[0];
543         i2    = idx[1];
544         idx  += 2;
545         tmp0  = x[i1];
546         tmp1  = x[i2];
547         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552       }
553       if (n == sz-1) {
554         tmp0  = x[*idx++];
555         sum1 += *v1++ * tmp0;
556         sum2 += *v2++ * tmp0;
557         sum3 += *v3++ * tmp0;
558         sum4 += *v4++ * tmp0;
559         sum5 += *v5++ * tmp0;
560       }
561       y[row++]=sum1;
562       y[row++]=sum2;
563       y[row++]=sum3;
564       y[row++]=sum4;
565       y[row++]=sum5;
566       v1      =v5;       /* Since the next block to be processed starts there */
567       idx    +=4*sz;
568       break;
569     default:
570       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571     }
572   }
573   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
575   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 /* ----------------------------------------------------------- */
579 /* Almost same code as the MatMult_SeqAIJ_Inode() */
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583 {
584   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586   const MatScalar   *v1,*v2,*v3,*v4,*v5;
587   const PetscScalar *x;
588   PetscScalar       *y,*z,*zt;
589   PetscErrorCode    ierr;
590   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
591   const PetscInt    *idx,*ns,*ii;
592 
593   PetscFunctionBegin;
594   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595   node_max = a->inode.node_count;
596   ns       = a->inode.size;     /* Node Size array */
597 
598   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
600   zt = z;
601 
602   idx = a->j;
603   v1  = a->a;
604   ii  = a->i;
605 
606   for (i = 0,row = 0; i< node_max; ++i) {
607     nsz = ns[i];
608     n   = ii[1] - ii[0];
609     ii += nsz;
610     sz  = n;                    /* No of non zeros in this row */
611                                 /* Switch on the size of Node */
612     switch (nsz) {               /* Each loop in 'case' is unrolled */
613     case 1:
614       sum1 = *zt++;
615 
616       for (n = 0; n< sz-1; n+=2) {
617         i1    = idx[0];         /* The instructions are ordered to */
618         i2    = idx[1];         /* make the compiler's job easy */
619         idx  += 2;
620         tmp0  = x[i1];
621         tmp1  = x[i2];
622         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623       }
624 
625       if (n   == sz-1) {          /* Take care of the last nonzero  */
626         tmp0  = x[*idx++];
627         sum1 += *v1++ * tmp0;
628       }
629       y[row++]=sum1;
630       break;
631     case 2:
632       sum1 = *zt++;
633       sum2 = *zt++;
634       v2   = v1 + n;
635 
636       for (n = 0; n< sz-1; n+=2) {
637         i1    = idx[0];
638         i2    = idx[1];
639         idx  += 2;
640         tmp0  = x[i1];
641         tmp1  = x[i2];
642         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644       }
645       if (n   == sz-1) {
646         tmp0  = x[*idx++];
647         sum1 += *v1++ * tmp0;
648         sum2 += *v2++ * tmp0;
649       }
650       y[row++]=sum1;
651       y[row++]=sum2;
652       v1      =v2;              /* Since the next block to be processed starts there*/
653       idx    +=sz;
654       break;
655     case 3:
656       sum1 = *zt++;
657       sum2 = *zt++;
658       sum3 = *zt++;
659       v2   = v1 + n;
660       v3   = v2 + n;
661 
662       for (n = 0; n< sz-1; n+=2) {
663         i1    = idx[0];
664         i2    = idx[1];
665         idx  += 2;
666         tmp0  = x[i1];
667         tmp1  = x[i2];
668         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671       }
672       if (n == sz-1) {
673         tmp0  = x[*idx++];
674         sum1 += *v1++ * tmp0;
675         sum2 += *v2++ * tmp0;
676         sum3 += *v3++ * tmp0;
677       }
678       y[row++]=sum1;
679       y[row++]=sum2;
680       y[row++]=sum3;
681       v1      =v3;              /* Since the next block to be processed starts there*/
682       idx    +=2*sz;
683       break;
684     case 4:
685       sum1 = *zt++;
686       sum2 = *zt++;
687       sum3 = *zt++;
688       sum4 = *zt++;
689       v2   = v1 + n;
690       v3   = v2 + n;
691       v4   = v3 + n;
692 
693       for (n = 0; n< sz-1; n+=2) {
694         i1    = idx[0];
695         i2    = idx[1];
696         idx  += 2;
697         tmp0  = x[i1];
698         tmp1  = x[i2];
699         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703       }
704       if (n == sz-1) {
705         tmp0  = x[*idx++];
706         sum1 += *v1++ * tmp0;
707         sum2 += *v2++ * tmp0;
708         sum3 += *v3++ * tmp0;
709         sum4 += *v4++ * tmp0;
710       }
711       y[row++]=sum1;
712       y[row++]=sum2;
713       y[row++]=sum3;
714       y[row++]=sum4;
715       v1      =v4;              /* Since the next block to be processed starts there*/
716       idx    +=3*sz;
717       break;
718     case 5:
719       sum1 = *zt++;
720       sum2 = *zt++;
721       sum3 = *zt++;
722       sum4 = *zt++;
723       sum5 = *zt++;
724       v2   = v1 + n;
725       v3   = v2 + n;
726       v4   = v3 + n;
727       v5   = v4 + n;
728 
729       for (n = 0; n<sz-1; n+=2) {
730         i1    = idx[0];
731         i2    = idx[1];
732         idx  += 2;
733         tmp0  = x[i1];
734         tmp1  = x[i2];
735         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740       }
741       if (n   == sz-1) {
742         tmp0  = x[*idx++];
743         sum1 += *v1++ * tmp0;
744         sum2 += *v2++ * tmp0;
745         sum3 += *v3++ * tmp0;
746         sum4 += *v4++ * tmp0;
747         sum5 += *v5++ * tmp0;
748       }
749       y[row++]=sum1;
750       y[row++]=sum2;
751       y[row++]=sum3;
752       y[row++]=sum4;
753       y[row++]=sum5;
754       v1      =v5;       /* Since the next block to be processed starts there */
755       idx    +=4*sz;
756       break;
757     default:
758       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759     }
760   }
761   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
762   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
763   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
764   PetscFunctionReturn(0);
765 }
766 
767 /* ----------------------------------------------------------- */
768 #undef __FUNCT__
769 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
770 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
771 {
772   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
773   IS                iscol = a->col,isrow = a->row;
774   PetscErrorCode    ierr;
775   const PetscInt    *r,*c,*rout,*cout;
776   PetscInt          i,j,n = A->rmap->n,nz;
777   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
778   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
779   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
780   PetscScalar       sum1,sum2,sum3,sum4,sum5;
781   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
782   const PetscScalar *b;
783 
784   PetscFunctionBegin;
785   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
786   node_max = a->inode.node_count;
787   ns       = a->inode.size;     /* Node Size array */
788 
789   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
790   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
791   tmp  = a->solve_work;
792 
793   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
794   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
795 
796   /* forward solve the lower triangular */
797   tmps = tmp;
798   aa   = a_a;
799   aj   = a_j;
800   ad   = a->diag;
801 
802   for (i = 0,row = 0; i< node_max; ++i) {
803     nsz = ns[i];
804     aii = ai[row];
805     v1  = aa + aii;
806     vi  = aj + aii;
807     nz  = ad[row]- aii;
808     if (i < node_max-1) {
809       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
810       * but our indexing to determine it's size could. */
811       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
812       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
813       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
814       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
815     }
816 
817     switch (nsz) {               /* Each loop in 'case' is unrolled */
818     case 1:
819       sum1 = b[*r++];
820       for (j=0; j<nz-1; j+=2) {
821         i0    = vi[0];
822         i1    = vi[1];
823         vi   +=2;
824         tmp0  = tmps[i0];
825         tmp1  = tmps[i1];
826         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
827       }
828       if (j == nz-1) {
829         tmp0  = tmps[*vi++];
830         sum1 -= *v1++ *tmp0;
831       }
832       tmp[row++]=sum1;
833       break;
834     case 2:
835       sum1 = b[*r++];
836       sum2 = b[*r++];
837       v2   = aa + ai[row+1];
838 
839       for (j=0; j<nz-1; j+=2) {
840         i0    = vi[0];
841         i1    = vi[1];
842         vi   +=2;
843         tmp0  = tmps[i0];
844         tmp1  = tmps[i1];
845         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
846         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
847       }
848       if (j == nz-1) {
849         tmp0  = tmps[*vi++];
850         sum1 -= *v1++ *tmp0;
851         sum2 -= *v2++ *tmp0;
852       }
853       sum2     -= *v2++ *sum1;
854       tmp[row++]=sum1;
855       tmp[row++]=sum2;
856       break;
857     case 3:
858       sum1 = b[*r++];
859       sum2 = b[*r++];
860       sum3 = b[*r++];
861       v2   = aa + ai[row+1];
862       v3   = aa + ai[row+2];
863 
864       for (j=0; j<nz-1; j+=2) {
865         i0    = vi[0];
866         i1    = vi[1];
867         vi   +=2;
868         tmp0  = tmps[i0];
869         tmp1  = tmps[i1];
870         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
871         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
872         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
873       }
874       if (j == nz-1) {
875         tmp0  = tmps[*vi++];
876         sum1 -= *v1++ *tmp0;
877         sum2 -= *v2++ *tmp0;
878         sum3 -= *v3++ *tmp0;
879       }
880       sum2 -= *v2++ * sum1;
881       sum3 -= *v3++ * sum1;
882       sum3 -= *v3++ * sum2;
883 
884       tmp[row++]=sum1;
885       tmp[row++]=sum2;
886       tmp[row++]=sum3;
887       break;
888 
889     case 4:
890       sum1 = b[*r++];
891       sum2 = b[*r++];
892       sum3 = b[*r++];
893       sum4 = b[*r++];
894       v2   = aa + ai[row+1];
895       v3   = aa + ai[row+2];
896       v4   = aa + ai[row+3];
897 
898       for (j=0; j<nz-1; j+=2) {
899         i0    = vi[0];
900         i1    = vi[1];
901         vi   +=2;
902         tmp0  = tmps[i0];
903         tmp1  = tmps[i1];
904         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908       }
909       if (j == nz-1) {
910         tmp0  = tmps[*vi++];
911         sum1 -= *v1++ *tmp0;
912         sum2 -= *v2++ *tmp0;
913         sum3 -= *v3++ *tmp0;
914         sum4 -= *v4++ *tmp0;
915       }
916       sum2 -= *v2++ * sum1;
917       sum3 -= *v3++ * sum1;
918       sum4 -= *v4++ * sum1;
919       sum3 -= *v3++ * sum2;
920       sum4 -= *v4++ * sum2;
921       sum4 -= *v4++ * sum3;
922 
923       tmp[row++]=sum1;
924       tmp[row++]=sum2;
925       tmp[row++]=sum3;
926       tmp[row++]=sum4;
927       break;
928     case 5:
929       sum1 = b[*r++];
930       sum2 = b[*r++];
931       sum3 = b[*r++];
932       sum4 = b[*r++];
933       sum5 = b[*r++];
934       v2   = aa + ai[row+1];
935       v3   = aa + ai[row+2];
936       v4   = aa + ai[row+3];
937       v5   = aa + ai[row+4];
938 
939       for (j=0; j<nz-1; j+=2) {
940         i0    = vi[0];
941         i1    = vi[1];
942         vi   +=2;
943         tmp0  = tmps[i0];
944         tmp1  = tmps[i1];
945         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950       }
951       if (j == nz-1) {
952         tmp0  = tmps[*vi++];
953         sum1 -= *v1++ *tmp0;
954         sum2 -= *v2++ *tmp0;
955         sum3 -= *v3++ *tmp0;
956         sum4 -= *v4++ *tmp0;
957         sum5 -= *v5++ *tmp0;
958       }
959 
960       sum2 -= *v2++ * sum1;
961       sum3 -= *v3++ * sum1;
962       sum4 -= *v4++ * sum1;
963       sum5 -= *v5++ * sum1;
964       sum3 -= *v3++ * sum2;
965       sum4 -= *v4++ * sum2;
966       sum5 -= *v5++ * sum2;
967       sum4 -= *v4++ * sum3;
968       sum5 -= *v5++ * sum3;
969       sum5 -= *v5++ * sum4;
970 
971       tmp[row++]=sum1;
972       tmp[row++]=sum2;
973       tmp[row++]=sum3;
974       tmp[row++]=sum4;
975       tmp[row++]=sum5;
976       break;
977     default:
978       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
979     }
980   }
981   /* backward solve the upper triangular */
982   for (i=node_max -1,row = n-1; i>=0; i--) {
983     nsz = ns[i];
984     aii = ai[row+1] -1;
985     v1  = aa + aii;
986     vi  = aj + aii;
987     nz  = aii- ad[row];
988     switch (nsz) {               /* Each loop in 'case' is unrolled */
989     case 1:
990       sum1 = tmp[row];
991 
992       for (j=nz; j>1; j-=2) {
993         vi   -=2;
994         i0    = vi[2];
995         i1    = vi[1];
996         tmp0  = tmps[i0];
997         tmp1  = tmps[i1];
998         v1   -= 2;
999         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000       }
1001       if (j==1) {
1002         tmp0  = tmps[*vi--];
1003         sum1 -= *v1-- * tmp0;
1004       }
1005       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006       break;
1007     case 2:
1008       sum1 = tmp[row];
1009       sum2 = tmp[row -1];
1010       v2   = aa + ai[row]-1;
1011       for (j=nz; j>1; j-=2) {
1012         vi   -=2;
1013         i0    = vi[2];
1014         i1    = vi[1];
1015         tmp0  = tmps[i0];
1016         tmp1  = tmps[i1];
1017         v1   -= 2;
1018         v2   -= 2;
1019         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021       }
1022       if (j==1) {
1023         tmp0  = tmps[*vi--];
1024         sum1 -= *v1-- * tmp0;
1025         sum2 -= *v2-- * tmp0;
1026       }
1027 
1028       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029       sum2   -= *v2-- * tmp0;
1030       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031       break;
1032     case 3:
1033       sum1 = tmp[row];
1034       sum2 = tmp[row -1];
1035       sum3 = tmp[row -2];
1036       v2   = aa + ai[row]-1;
1037       v3   = aa + ai[row -1]-1;
1038       for (j=nz; j>1; j-=2) {
1039         vi   -=2;
1040         i0    = vi[2];
1041         i1    = vi[1];
1042         tmp0  = tmps[i0];
1043         tmp1  = tmps[i1];
1044         v1   -= 2;
1045         v2   -= 2;
1046         v3   -= 2;
1047         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050       }
1051       if (j==1) {
1052         tmp0  = tmps[*vi--];
1053         sum1 -= *v1-- * tmp0;
1054         sum2 -= *v2-- * tmp0;
1055         sum3 -= *v3-- * tmp0;
1056       }
1057       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058       sum2   -= *v2-- * tmp0;
1059       sum3   -= *v3-- * tmp0;
1060       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061       sum3   -= *v3-- * tmp0;
1062       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1063 
1064       break;
1065     case 4:
1066       sum1 = tmp[row];
1067       sum2 = tmp[row -1];
1068       sum3 = tmp[row -2];
1069       sum4 = tmp[row -3];
1070       v2   = aa + ai[row]-1;
1071       v3   = aa + ai[row -1]-1;
1072       v4   = aa + ai[row -2]-1;
1073 
1074       for (j=nz; j>1; j-=2) {
1075         vi   -=2;
1076         i0    = vi[2];
1077         i1    = vi[1];
1078         tmp0  = tmps[i0];
1079         tmp1  = tmps[i1];
1080         v1   -= 2;
1081         v2   -= 2;
1082         v3   -= 2;
1083         v4   -= 2;
1084         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088       }
1089       if (j==1) {
1090         tmp0  = tmps[*vi--];
1091         sum1 -= *v1-- * tmp0;
1092         sum2 -= *v2-- * tmp0;
1093         sum3 -= *v3-- * tmp0;
1094         sum4 -= *v4-- * tmp0;
1095       }
1096 
1097       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098       sum2   -= *v2-- * tmp0;
1099       sum3   -= *v3-- * tmp0;
1100       sum4   -= *v4-- * tmp0;
1101       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102       sum3   -= *v3-- * tmp0;
1103       sum4   -= *v4-- * tmp0;
1104       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105       sum4   -= *v4-- * tmp0;
1106       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107       break;
1108     case 5:
1109       sum1 = tmp[row];
1110       sum2 = tmp[row -1];
1111       sum3 = tmp[row -2];
1112       sum4 = tmp[row -3];
1113       sum5 = tmp[row -4];
1114       v2   = aa + ai[row]-1;
1115       v3   = aa + ai[row -1]-1;
1116       v4   = aa + ai[row -2]-1;
1117       v5   = aa + ai[row -3]-1;
1118       for (j=nz; j>1; j-=2) {
1119         vi   -= 2;
1120         i0    = vi[2];
1121         i1    = vi[1];
1122         tmp0  = tmps[i0];
1123         tmp1  = tmps[i1];
1124         v1   -= 2;
1125         v2   -= 2;
1126         v3   -= 2;
1127         v4   -= 2;
1128         v5   -= 2;
1129         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134       }
1135       if (j==1) {
1136         tmp0  = tmps[*vi--];
1137         sum1 -= *v1-- * tmp0;
1138         sum2 -= *v2-- * tmp0;
1139         sum3 -= *v3-- * tmp0;
1140         sum4 -= *v4-- * tmp0;
1141         sum5 -= *v5-- * tmp0;
1142       }
1143 
1144       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145       sum2   -= *v2-- * tmp0;
1146       sum3   -= *v3-- * tmp0;
1147       sum4   -= *v4-- * tmp0;
1148       sum5   -= *v5-- * tmp0;
1149       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150       sum3   -= *v3-- * tmp0;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154       sum4   -= *v4-- * tmp0;
1155       sum5   -= *v5-- * tmp0;
1156       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157       sum5   -= *v5-- * tmp0;
1158       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159       break;
1160     default:
1161       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1162     }
1163   }
1164   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1165   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1166   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1168   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1169   PetscFunctionReturn(0);
1170 }
1171 
1172 #undef __FUNCT__
1173 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1174 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1175 {
1176   Mat             C     =B;
1177   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1178   IS              isrow = b->row,isicol = b->icol;
1179   PetscErrorCode  ierr;
1180   const PetscInt  *r,*ic,*ics;
1181   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1182   PetscInt        i,j,k,nz,nzL,row,*pj;
1183   const PetscInt  *ajtmp,*bjtmp;
1184   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1185   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1186   FactorShiftCtx  sctx;
1187   const PetscInt  *ddiag;
1188   PetscReal       rs;
1189   MatScalar       d;
1190   PetscInt        inod,nodesz,node_max,col;
1191   const PetscInt  *ns;
1192   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1193 
1194   PetscFunctionBegin;
1195   /* MatPivotSetUp(): initialize shift context sctx */
1196   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1197 
1198   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1199     ddiag          = a->diag;
1200     sctx.shift_top = info->zeropivot;
1201     for (i=0; i<n; i++) {
1202       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1203       d  = (aa)[ddiag[i]];
1204       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1205       v  = aa+ai[i];
1206       nz = ai[i+1] - ai[i];
1207       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1208       if (rs>sctx.shift_top) sctx.shift_top = rs;
1209     }
1210     sctx.shift_top *= 1.1;
1211     sctx.nshift_max = 5;
1212     sctx.shift_lo   = 0.;
1213     sctx.shift_hi   = 1.;
1214   }
1215 
1216   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1217   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1218 
1219   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1220   ics   = ic;
1221 
1222   node_max = a->inode.node_count;
1223   ns       = a->inode.size;
1224   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1225 
1226   /* If max inode size > 4, split it into two inodes.*/
1227   /* also map the inode sizes according to the ordering */
1228   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1229   for (i=0,j=0; i<node_max; ++i,++j) {
1230     if (ns[i] > 4) {
1231       tmp_vec1[j] = 4;
1232       ++j;
1233       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1234     } else {
1235       tmp_vec1[j] = ns[i];
1236     }
1237   }
1238   /* Use the correct node_max */
1239   node_max = j;
1240 
1241   /* Now reorder the inode info based on mat re-ordering info */
1242   /* First create a row -> inode_size_array_index map */
1243   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1244   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1245   for (i=0,row=0; i<node_max; i++) {
1246     nodesz = tmp_vec1[i];
1247     for (j=0; j<nodesz; j++,row++) {
1248       nsmap[row] = i;
1249     }
1250   }
1251   /* Using nsmap, create a reordered ns structure */
1252   for (i=0,j=0; i< node_max; i++) {
1253     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1254     tmp_vec2[i] = nodesz;
1255     j          += nodesz;
1256   }
1257   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1258   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1259 
1260   /* Now use the correct ns */
1261   ns = tmp_vec2;
1262 
1263   do {
1264     sctx.newshift = PETSC_FALSE;
1265     /* Now loop over each block-row, and do the factorization */
1266     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1267       nodesz = ns[inod];
1268 
1269       switch (nodesz) {
1270       case 1:
1271         /*----------*/
1272         /* zero rtmp1 */
1273         /* L part */
1274         nz    = bi[i+1] - bi[i];
1275         bjtmp = bj + bi[i];
1276         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1277 
1278         /* U part */
1279         nz    = bdiag[i]-bdiag[i+1];
1280         bjtmp = bj + bdiag[i+1]+1;
1281         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1282 
1283         /* load in initial (unfactored row) */
1284         nz    = ai[r[i]+1] - ai[r[i]];
1285         ajtmp = aj + ai[r[i]];
1286         v     = aa + ai[r[i]];
1287         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1288 
1289         /* ZeropivotApply() */
1290         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1291 
1292         /* elimination */
1293         bjtmp = bj + bi[i];
1294         row   = *bjtmp++;
1295         nzL   = bi[i+1] - bi[i];
1296         for (k=0; k < nzL; k++) {
1297           pc = rtmp1 + row;
1298           if (*pc != 0.0) {
1299             pv   = b->a + bdiag[row];
1300             mul1 = *pc * (*pv);
1301             *pc  = mul1;
1302             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1303             pv   = b->a + bdiag[row+1]+1;
1304             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1305             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1306             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1307           }
1308           row = *bjtmp++;
1309         }
1310 
1311         /* finished row so stick it into b->a */
1312         rs = 0.0;
1313         /* L part */
1314         pv = b->a + bi[i];
1315         pj = b->j + bi[i];
1316         nz = bi[i+1] - bi[i];
1317         for (j=0; j<nz; j++) {
1318           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1319         }
1320 
1321         /* U part */
1322         pv = b->a + bdiag[i+1]+1;
1323         pj = b->j + bdiag[i+1]+1;
1324         nz = bdiag[i] - bdiag[i+1]-1;
1325         for (j=0; j<nz; j++) {
1326           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327         }
1328 
1329         /* Check zero pivot */
1330         sctx.rs = rs;
1331         sctx.pv = rtmp1[i];
1332         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1333         if (sctx.newshift) break;
1334 
1335         /* Mark diagonal and invert diagonal for simplier triangular solves */
1336         pv  = b->a + bdiag[i];
1337         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1338         break;
1339 
1340       case 2:
1341         /*----------*/
1342         /* zero rtmp1 and rtmp2 */
1343         /* L part */
1344         nz    = bi[i+1] - bi[i];
1345         bjtmp = bj + bi[i];
1346         for  (j=0; j<nz; j++) {
1347           col        = bjtmp[j];
1348           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1349         }
1350 
1351         /* U part */
1352         nz    = bdiag[i]-bdiag[i+1];
1353         bjtmp = bj + bdiag[i+1]+1;
1354         for  (j=0; j<nz; j++) {
1355           col        = bjtmp[j];
1356           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357         }
1358 
1359         /* load in initial (unfactored row) */
1360         nz    = ai[r[i]+1] - ai[r[i]];
1361         ajtmp = aj + ai[r[i]];
1362         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1363         for (j=0; j<nz; j++) {
1364           col        = ics[ajtmp[j]];
1365           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1366         }
1367         /* ZeropivotApply(): shift the diagonal of the matrix  */
1368         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1369 
1370         /* elimination */
1371         bjtmp = bj + bi[i];
1372         row   = *bjtmp++; /* pivot row */
1373         nzL   = bi[i+1] - bi[i];
1374         for (k=0; k < nzL; k++) {
1375           pc1 = rtmp1 + row;
1376           pc2 = rtmp2 + row;
1377           if (*pc1 != 0.0 || *pc2 != 0.0) {
1378             pv   = b->a + bdiag[row];
1379             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1380             *pc1 = mul1;       *pc2 = mul2;
1381 
1382             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1383             pv = b->a + bdiag[row+1]+1;
1384             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1385             for (j=0; j<nz; j++) {
1386               col         = pj[j];
1387               rtmp1[col] -= mul1 * pv[j];
1388               rtmp2[col] -= mul2 * pv[j];
1389             }
1390             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1391           }
1392           row = *bjtmp++;
1393         }
1394 
1395         /* finished row i; check zero pivot, then stick row i into b->a */
1396         rs = 0.0;
1397         /* L part */
1398         pc1 = b->a + bi[i];
1399         pj  = b->j + bi[i];
1400         nz  = bi[i+1] - bi[i];
1401         for (j=0; j<nz; j++) {
1402           col    = pj[j];
1403           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1404         }
1405         /* U part */
1406         pc1 = b->a + bdiag[i+1]+1;
1407         pj  = b->j + bdiag[i+1]+1;
1408         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1409         for (j=0; j<nz; j++) {
1410           col    = pj[j];
1411           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412         }
1413 
1414         sctx.rs = rs;
1415         sctx.pv = rtmp1[i];
1416         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1417         if (sctx.newshift) break;
1418         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1419         *pc1 = 1.0/sctx.pv;
1420 
1421         /* Now take care of diagonal 2x2 block. */
1422         pc2 = rtmp2 + i;
1423         if (*pc2 != 0.0) {
1424           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1425           *pc2 = mul1;          /* insert L entry */
1426           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1427           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1428           for (j=0; j<nz; j++) {
1429             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1430           }
1431           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1432         }
1433 
1434         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1435         rs = 0.0;
1436         /* L part */
1437         pc2 = b->a + bi[i+1];
1438         pj  = b->j + bi[i+1];
1439         nz  = bi[i+2] - bi[i+1];
1440         for (j=0; j<nz; j++) {
1441           col    = pj[j];
1442           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1443         }
1444         /* U part */
1445         pc2 = b->a + bdiag[i+2]+1;
1446         pj  = b->j + bdiag[i+2]+1;
1447         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1448         for (j=0; j<nz; j++) {
1449           col    = pj[j];
1450           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451         }
1452 
1453         sctx.rs = rs;
1454         sctx.pv = rtmp2[i+1];
1455         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1456         if (sctx.newshift) break;
1457         pc2  = b->a + bdiag[i+1];
1458         *pc2 = 1.0/sctx.pv;
1459         break;
1460 
1461       case 3:
1462         /*----------*/
1463         /* zero rtmp */
1464         /* L part */
1465         nz    = bi[i+1] - bi[i];
1466         bjtmp = bj + bi[i];
1467         for  (j=0; j<nz; j++) {
1468           col        = bjtmp[j];
1469           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1470         }
1471 
1472         /* U part */
1473         nz    = bdiag[i]-bdiag[i+1];
1474         bjtmp = bj + bdiag[i+1]+1;
1475         for  (j=0; j<nz; j++) {
1476           col        = bjtmp[j];
1477           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478         }
1479 
1480         /* load in initial (unfactored row) */
1481         nz    = ai[r[i]+1] - ai[r[i]];
1482         ajtmp = aj + ai[r[i]];
1483         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1484         for (j=0; j<nz; j++) {
1485           col        = ics[ajtmp[j]];
1486           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1487         }
1488         /* ZeropivotApply(): shift the diagonal of the matrix  */
1489         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1490 
1491         /* elimination */
1492         bjtmp = bj + bi[i];
1493         row   = *bjtmp++; /* pivot row */
1494         nzL   = bi[i+1] - bi[i];
1495         for (k=0; k < nzL; k++) {
1496           pc1 = rtmp1 + row;
1497           pc2 = rtmp2 + row;
1498           pc3 = rtmp3 + row;
1499           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1500             pv   = b->a + bdiag[row];
1501             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1502             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1503 
1504             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1505             pv = b->a + bdiag[row+1]+1;
1506             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1507             for (j=0; j<nz; j++) {
1508               col         = pj[j];
1509               rtmp1[col] -= mul1 * pv[j];
1510               rtmp2[col] -= mul2 * pv[j];
1511               rtmp3[col] -= mul3 * pv[j];
1512             }
1513             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1514           }
1515           row = *bjtmp++;
1516         }
1517 
1518         /* finished row i; check zero pivot, then stick row i into b->a */
1519         rs = 0.0;
1520         /* L part */
1521         pc1 = b->a + bi[i];
1522         pj  = b->j + bi[i];
1523         nz  = bi[i+1] - bi[i];
1524         for (j=0; j<nz; j++) {
1525           col    = pj[j];
1526           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1527         }
1528         /* U part */
1529         pc1 = b->a + bdiag[i+1]+1;
1530         pj  = b->j + bdiag[i+1]+1;
1531         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1532         for (j=0; j<nz; j++) {
1533           col    = pj[j];
1534           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535         }
1536 
1537         sctx.rs = rs;
1538         sctx.pv = rtmp1[i];
1539         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1540         if (sctx.newshift) break;
1541         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1542         *pc1 = 1.0/sctx.pv;
1543 
1544         /* Now take care of 1st column of diagonal 3x3 block. */
1545         pc2 = rtmp2 + i;
1546         pc3 = rtmp3 + i;
1547         if (*pc2 != 0.0 || *pc3 != 0.0) {
1548           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1549           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1550           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1551           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1552           for (j=0; j<nz; j++) {
1553             col         = pj[j];
1554             rtmp2[col] -= mul2 * rtmp1[col];
1555             rtmp3[col] -= mul3 * rtmp1[col];
1556           }
1557           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1558         }
1559 
1560         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1561         rs = 0.0;
1562         /* L part */
1563         pc2 = b->a + bi[i+1];
1564         pj  = b->j + bi[i+1];
1565         nz  = bi[i+2] - bi[i+1];
1566         for (j=0; j<nz; j++) {
1567           col    = pj[j];
1568           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1569         }
1570         /* U part */
1571         pc2 = b->a + bdiag[i+2]+1;
1572         pj  = b->j + bdiag[i+2]+1;
1573         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1574         for (j=0; j<nz; j++) {
1575           col    = pj[j];
1576           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577         }
1578 
1579         sctx.rs = rs;
1580         sctx.pv = rtmp2[i+1];
1581         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1582         if (sctx.newshift) break;
1583         pc2  = b->a + bdiag[i+1];
1584         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1585 
1586         /* Now take care of 2nd column of diagonal 3x3 block. */
1587         pc3 = rtmp3 + i+1;
1588         if (*pc3 != 0.0) {
1589           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1590           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1591           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1592           for (j=0; j<nz; j++) {
1593             col         = pj[j];
1594             rtmp3[col] -= mul3 * rtmp2[col];
1595           }
1596           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1597         }
1598 
1599         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1600         rs = 0.0;
1601         /* L part */
1602         pc3 = b->a + bi[i+2];
1603         pj  = b->j + bi[i+2];
1604         nz  = bi[i+3] - bi[i+2];
1605         for (j=0; j<nz; j++) {
1606           col    = pj[j];
1607           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1608         }
1609         /* U part */
1610         pc3 = b->a + bdiag[i+3]+1;
1611         pj  = b->j + bdiag[i+3]+1;
1612         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1613         for (j=0; j<nz; j++) {
1614           col    = pj[j];
1615           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616         }
1617 
1618         sctx.rs = rs;
1619         sctx.pv = rtmp3[i+2];
1620         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1621         if (sctx.newshift) break;
1622         pc3  = b->a + bdiag[i+2];
1623         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1624         break;
1625       case 4:
1626         /*----------*/
1627         /* zero rtmp */
1628         /* L part */
1629         nz    = bi[i+1] - bi[i];
1630         bjtmp = bj + bi[i];
1631         for  (j=0; j<nz; j++) {
1632           col        = bjtmp[j];
1633           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1634         }
1635 
1636         /* U part */
1637         nz    = bdiag[i]-bdiag[i+1];
1638         bjtmp = bj + bdiag[i+1]+1;
1639         for  (j=0; j<nz; j++) {
1640           col        = bjtmp[j];
1641           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1642         }
1643 
1644         /* load in initial (unfactored row) */
1645         nz    = ai[r[i]+1] - ai[r[i]];
1646         ajtmp = aj + ai[r[i]];
1647         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1648         for (j=0; j<nz; j++) {
1649           col        = ics[ajtmp[j]];
1650           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1651         }
1652         /* ZeropivotApply(): shift the diagonal of the matrix  */
1653         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1654 
1655         /* elimination */
1656         bjtmp = bj + bi[i];
1657         row   = *bjtmp++; /* pivot row */
1658         nzL   = bi[i+1] - bi[i];
1659         for (k=0; k < nzL; k++) {
1660           pc1 = rtmp1 + row;
1661           pc2 = rtmp2 + row;
1662           pc3 = rtmp3 + row;
1663           pc4 = rtmp4 + row;
1664           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1665             pv   = b->a + bdiag[row];
1666             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1667             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1668 
1669             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1670             pv = b->a + bdiag[row+1]+1;
1671             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1672             for (j=0; j<nz; j++) {
1673               col         = pj[j];
1674               rtmp1[col] -= mul1 * pv[j];
1675               rtmp2[col] -= mul2 * pv[j];
1676               rtmp3[col] -= mul3 * pv[j];
1677               rtmp4[col] -= mul4 * pv[j];
1678             }
1679             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1680           }
1681           row = *bjtmp++;
1682         }
1683 
1684         /* finished row i; check zero pivot, then stick row i into b->a */
1685         rs = 0.0;
1686         /* L part */
1687         pc1 = b->a + bi[i];
1688         pj  = b->j + bi[i];
1689         nz  = bi[i+1] - bi[i];
1690         for (j=0; j<nz; j++) {
1691           col    = pj[j];
1692           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1693         }
1694         /* U part */
1695         pc1 = b->a + bdiag[i+1]+1;
1696         pj  = b->j + bdiag[i+1]+1;
1697         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1698         for (j=0; j<nz; j++) {
1699           col    = pj[j];
1700           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701         }
1702 
1703         sctx.rs = rs;
1704         sctx.pv = rtmp1[i];
1705         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1706         if (sctx.newshift) break;
1707         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1708         *pc1 = 1.0/sctx.pv;
1709 
1710         /* Now take care of 1st column of diagonal 4x4 block. */
1711         pc2 = rtmp2 + i;
1712         pc3 = rtmp3 + i;
1713         pc4 = rtmp4 + i;
1714         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1715           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1716           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1717           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1718           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1719           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1720           for (j=0; j<nz; j++) {
1721             col         = pj[j];
1722             rtmp2[col] -= mul2 * rtmp1[col];
1723             rtmp3[col] -= mul3 * rtmp1[col];
1724             rtmp4[col] -= mul4 * rtmp1[col];
1725           }
1726           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1727         }
1728 
1729         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1730         rs = 0.0;
1731         /* L part */
1732         pc2 = b->a + bi[i+1];
1733         pj  = b->j + bi[i+1];
1734         nz  = bi[i+2] - bi[i+1];
1735         for (j=0; j<nz; j++) {
1736           col    = pj[j];
1737           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1738         }
1739         /* U part */
1740         pc2 = b->a + bdiag[i+2]+1;
1741         pj  = b->j + bdiag[i+2]+1;
1742         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1743         for (j=0; j<nz; j++) {
1744           col    = pj[j];
1745           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746         }
1747 
1748         sctx.rs = rs;
1749         sctx.pv = rtmp2[i+1];
1750         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1751         if (sctx.newshift) break;
1752         pc2  = b->a + bdiag[i+1];
1753         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1754 
1755         /* Now take care of 2nd column of diagonal 4x4 block. */
1756         pc3 = rtmp3 + i+1;
1757         pc4 = rtmp4 + i+1;
1758         if (*pc3 != 0.0 || *pc4 != 0.0) {
1759           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1760           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1761           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1762           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1763           for (j=0; j<nz; j++) {
1764             col         = pj[j];
1765             rtmp3[col] -= mul3 * rtmp2[col];
1766             rtmp4[col] -= mul4 * rtmp2[col];
1767           }
1768           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1769         }
1770 
1771         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1772         rs = 0.0;
1773         /* L part */
1774         pc3 = b->a + bi[i+2];
1775         pj  = b->j + bi[i+2];
1776         nz  = bi[i+3] - bi[i+2];
1777         for (j=0; j<nz; j++) {
1778           col    = pj[j];
1779           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1780         }
1781         /* U part */
1782         pc3 = b->a + bdiag[i+3]+1;
1783         pj  = b->j + bdiag[i+3]+1;
1784         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1785         for (j=0; j<nz; j++) {
1786           col    = pj[j];
1787           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788         }
1789 
1790         sctx.rs = rs;
1791         sctx.pv = rtmp3[i+2];
1792         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1793         if (sctx.newshift) break;
1794         pc3  = b->a + bdiag[i+2];
1795         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1796 
1797         /* Now take care of 3rd column of diagonal 4x4 block. */
1798         pc4 = rtmp4 + i+2;
1799         if (*pc4 != 0.0) {
1800           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1801           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1802           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1803           for (j=0; j<nz; j++) {
1804             col         = pj[j];
1805             rtmp4[col] -= mul4 * rtmp3[col];
1806           }
1807           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1808         }
1809 
1810         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1811         rs = 0.0;
1812         /* L part */
1813         pc4 = b->a + bi[i+3];
1814         pj  = b->j + bi[i+3];
1815         nz  = bi[i+4] - bi[i+3];
1816         for (j=0; j<nz; j++) {
1817           col    = pj[j];
1818           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1819         }
1820         /* U part */
1821         pc4 = b->a + bdiag[i+4]+1;
1822         pj  = b->j + bdiag[i+4]+1;
1823         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1824         for (j=0; j<nz; j++) {
1825           col    = pj[j];
1826           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827         }
1828 
1829         sctx.rs = rs;
1830         sctx.pv = rtmp4[i+3];
1831         ierr    = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1832         if (sctx.newshift) break;
1833         pc4  = b->a + bdiag[i+3];
1834         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1835         break;
1836 
1837       default:
1838         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1839       }
1840       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1841       i += nodesz;                 /* Update the row */
1842     }
1843 
1844     /* MatPivotRefine() */
1845     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1846       /*
1847        * if no shift in this attempt & shifting & started shifting & can refine,
1848        * then try lower shift
1849        */
1850       sctx.shift_hi       = sctx.shift_fraction;
1851       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1852       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1853       sctx.newshift       = PETSC_TRUE;
1854       sctx.nshift++;
1855     }
1856   } while (sctx.newshift);
1857 
1858   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1859   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1860   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1861   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1862 
1863   if (b->inode.size) {
1864     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1865   } else {
1866     C->ops->solve           = MatSolve_SeqAIJ;
1867   }
1868   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1869   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1870   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1871   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1872   C->assembled              = PETSC_TRUE;
1873   C->preallocated           = PETSC_TRUE;
1874 
1875   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1876 
1877   /* MatShiftView(A,info,&sctx) */
1878   if (sctx.nshift) {
1879     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1880       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1881     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1882       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1883     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1884       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1885     }
1886   }
1887   PetscFunctionReturn(0);
1888 }
1889 
1890 #undef __FUNCT__
1891 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1892 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1893 {
1894   Mat             C     = B;
1895   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1896   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1897   PetscErrorCode  ierr;
1898   const PetscInt  *r,*ic,*c,*ics;
1899   PetscInt        n   = A->rmap->n,*bi = b->i;
1900   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1901   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1902   PetscInt        *ai = a->i,*aj = a->j;
1903   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1904   PetscScalar     mul1,mul2,mul3,tmp;
1905   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1906   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1907   PetscReal       rs=0.0;
1908   FactorShiftCtx  sctx;
1909 
1910   PetscFunctionBegin;
1911   sctx.shift_top      = 0;
1912   sctx.nshift_max     = 0;
1913   sctx.shift_lo       = 0;
1914   sctx.shift_hi       = 0;
1915   sctx.shift_fraction = 0;
1916 
1917   /* if both shift schemes are chosen by user, only use info->shiftpd */
1918   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1919     sctx.shift_top = 0;
1920     for (i=0; i<n; i++) {
1921       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1922       rs    = 0.0;
1923       ajtmp = aj + ai[i];
1924       rtmp1 = aa + ai[i];
1925       nz    = ai[i+1] - ai[i];
1926       for (j=0; j<nz; j++) {
1927         if (*ajtmp != i) {
1928           rs += PetscAbsScalar(*rtmp1++);
1929         } else {
1930           rs -= PetscRealPart(*rtmp1++);
1931         }
1932         ajtmp++;
1933       }
1934       if (rs>sctx.shift_top) sctx.shift_top = rs;
1935     }
1936     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1937     sctx.shift_top *= 1.1;
1938     sctx.nshift_max = 5;
1939     sctx.shift_lo   = 0.;
1940     sctx.shift_hi   = 1.;
1941   }
1942   sctx.shift_amount = 0;
1943   sctx.nshift       = 0;
1944 
1945   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1946   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1947   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1948   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1949   ics    = ic;
1950 
1951   node_max = a->inode.node_count;
1952   ns       = a->inode.size;
1953   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1954 
1955   /* If max inode size > 3, split it into two inodes.*/
1956   /* also map the inode sizes according to the ordering */
1957   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1958   for (i=0,j=0; i<node_max; ++i,++j) {
1959     if (ns[i]>3) {
1960       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1961       ++j;
1962       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1963     } else {
1964       tmp_vec1[j] = ns[i];
1965     }
1966   }
1967   /* Use the correct node_max */
1968   node_max = j;
1969 
1970   /* Now reorder the inode info based on mat re-ordering info */
1971   /* First create a row -> inode_size_array_index map */
1972   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1973   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1974   for (i=0,row=0; i<node_max; i++) {
1975     nodesz = tmp_vec1[i];
1976     for (j=0; j<nodesz; j++,row++) {
1977       nsmap[row] = i;
1978     }
1979   }
1980   /* Using nsmap, create a reordered ns structure */
1981   for (i=0,j=0; i< node_max; i++) {
1982     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1983     tmp_vec2[i] = nodesz;
1984     j          += nodesz;
1985   }
1986   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1987   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1988   /* Now use the correct ns */
1989   ns = tmp_vec2;
1990 
1991   do {
1992     sctx.newshift = PETSC_FALSE;
1993     /* Now loop over each block-row, and do the factorization */
1994     for (i=0,row=0; i<node_max; i++) {
1995       nodesz = ns[i];
1996       nz     = bi[row+1] - bi[row];
1997       bjtmp  = bj + bi[row];
1998 
1999       switch (nodesz) {
2000       case 1:
2001         for  (j=0; j<nz; j++) {
2002           idx         = bjtmp[j];
2003           rtmp11[idx] = 0.0;
2004         }
2005 
2006         /* load in initial (unfactored row) */
2007         idx    = r[row];
2008         nz_tmp = ai[idx+1] - ai[idx];
2009         ajtmp  = aj + ai[idx];
2010         v1     = aa + ai[idx];
2011 
2012         for (j=0; j<nz_tmp; j++) {
2013           idx         = ics[ajtmp[j]];
2014           rtmp11[idx] = v1[j];
2015         }
2016         rtmp11[ics[r[row]]] += sctx.shift_amount;
2017 
2018         prow = *bjtmp++;
2019         while (prow < row) {
2020           pc1 = rtmp11 + prow;
2021           if (*pc1 != 0.0) {
2022             pv     = ba + bd[prow];
2023             pj     = nbj + bd[prow];
2024             mul1   = *pc1 * *pv++;
2025             *pc1   = mul1;
2026             nz_tmp = bi[prow+1] - bd[prow] - 1;
2027             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2028             for (j=0; j<nz_tmp; j++) {
2029               tmp          = pv[j];
2030               idx          = pj[j];
2031               rtmp11[idx] -= mul1 * tmp;
2032             }
2033           }
2034           prow = *bjtmp++;
2035         }
2036         pj  = bj + bi[row];
2037         pc1 = ba + bi[row];
2038 
2039         sctx.pv     = rtmp11[row];
2040         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2041         rs          = 0.0;
2042         for (j=0; j<nz; j++) {
2043           idx    = pj[j];
2044           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2045           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2046         }
2047         sctx.rs = rs;
2048         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2049         if (sctx.newshift) goto endofwhile;
2050         break;
2051 
2052       case 2:
2053         for (j=0; j<nz; j++) {
2054           idx         = bjtmp[j];
2055           rtmp11[idx] = 0.0;
2056           rtmp22[idx] = 0.0;
2057         }
2058 
2059         /* load in initial (unfactored row) */
2060         idx    = r[row];
2061         nz_tmp = ai[idx+1] - ai[idx];
2062         ajtmp  = aj + ai[idx];
2063         v1     = aa + ai[idx];
2064         v2     = aa + ai[idx+1];
2065         for (j=0; j<nz_tmp; j++) {
2066           idx         = ics[ajtmp[j]];
2067           rtmp11[idx] = v1[j];
2068           rtmp22[idx] = v2[j];
2069         }
2070         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2071         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2072 
2073         prow = *bjtmp++;
2074         while (prow < row) {
2075           pc1 = rtmp11 + prow;
2076           pc2 = rtmp22 + prow;
2077           if (*pc1 != 0.0 || *pc2 != 0.0) {
2078             pv   = ba + bd[prow];
2079             pj   = nbj + bd[prow];
2080             mul1 = *pc1 * *pv;
2081             mul2 = *pc2 * *pv;
2082             ++pv;
2083             *pc1 = mul1;
2084             *pc2 = mul2;
2085 
2086             nz_tmp = bi[prow+1] - bd[prow] - 1;
2087             for (j=0; j<nz_tmp; j++) {
2088               tmp          = pv[j];
2089               idx          = pj[j];
2090               rtmp11[idx] -= mul1 * tmp;
2091               rtmp22[idx] -= mul2 * tmp;
2092             }
2093             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2094           }
2095           prow = *bjtmp++;
2096         }
2097 
2098         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2099         pc1 = rtmp11 + prow;
2100         pc2 = rtmp22 + prow;
2101 
2102         sctx.pv = *pc1;
2103         pj      = bj + bi[prow];
2104         rs      = 0.0;
2105         for (j=0; j<nz; j++) {
2106           idx = pj[j];
2107           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2108         }
2109         sctx.rs = rs;
2110         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2111         if (sctx.newshift) goto endofwhile;
2112 
2113         if (*pc2 != 0.0) {
2114           pj     = nbj + bd[prow];
2115           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2116           *pc2   = mul2;
2117           nz_tmp = bi[prow+1] - bd[prow] - 1;
2118           for (j=0; j<nz_tmp; j++) {
2119             idx          = pj[j];
2120             tmp          = rtmp11[idx];
2121             rtmp22[idx] -= mul2 * tmp;
2122           }
2123           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2124         }
2125 
2126         pj  = bj + bi[row];
2127         pc1 = ba + bi[row];
2128         pc2 = ba + bi[row+1];
2129 
2130         sctx.pv       = rtmp22[row+1];
2131         rs            = 0.0;
2132         rtmp11[row]   = 1.0/rtmp11[row];
2133         rtmp22[row+1] = 1.0/rtmp22[row+1];
2134         /* copy row entries from dense representation to sparse */
2135         for (j=0; j<nz; j++) {
2136           idx    = pj[j];
2137           pc1[j] = rtmp11[idx];
2138           pc2[j] = rtmp22[idx];
2139           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2140         }
2141         sctx.rs = rs;
2142         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2143         if (sctx.newshift) goto endofwhile;
2144         break;
2145 
2146       case 3:
2147         for  (j=0; j<nz; j++) {
2148           idx         = bjtmp[j];
2149           rtmp11[idx] = 0.0;
2150           rtmp22[idx] = 0.0;
2151           rtmp33[idx] = 0.0;
2152         }
2153         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2154         idx    = r[row];
2155         nz_tmp = ai[idx+1] - ai[idx];
2156         ajtmp  = aj + ai[idx];
2157         v1     = aa + ai[idx];
2158         v2     = aa + ai[idx+1];
2159         v3     = aa + ai[idx+2];
2160         for (j=0; j<nz_tmp; j++) {
2161           idx         = ics[ajtmp[j]];
2162           rtmp11[idx] = v1[j];
2163           rtmp22[idx] = v2[j];
2164           rtmp33[idx] = v3[j];
2165         }
2166         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2167         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2168         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2169 
2170         /* loop over all pivot row blocks above this row block */
2171         prow = *bjtmp++;
2172         while (prow < row) {
2173           pc1 = rtmp11 + prow;
2174           pc2 = rtmp22 + prow;
2175           pc3 = rtmp33 + prow;
2176           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2177             pv   = ba  + bd[prow];
2178             pj   = nbj + bd[prow];
2179             mul1 = *pc1 * *pv;
2180             mul2 = *pc2 * *pv;
2181             mul3 = *pc3 * *pv;
2182             ++pv;
2183             *pc1 = mul1;
2184             *pc2 = mul2;
2185             *pc3 = mul3;
2186 
2187             nz_tmp = bi[prow+1] - bd[prow] - 1;
2188             /* update this row based on pivot row */
2189             for (j=0; j<nz_tmp; j++) {
2190               tmp          = pv[j];
2191               idx          = pj[j];
2192               rtmp11[idx] -= mul1 * tmp;
2193               rtmp22[idx] -= mul2 * tmp;
2194               rtmp33[idx] -= mul3 * tmp;
2195             }
2196             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2197           }
2198           prow = *bjtmp++;
2199         }
2200 
2201         /* Now take care of diagonal 3x3 block in this set of rows */
2202         /* note: prow = row here */
2203         pc1 = rtmp11 + prow;
2204         pc2 = rtmp22 + prow;
2205         pc3 = rtmp33 + prow;
2206 
2207         sctx.pv = *pc1;
2208         pj      = bj + bi[prow];
2209         rs      = 0.0;
2210         for (j=0; j<nz; j++) {
2211           idx = pj[j];
2212           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2213         }
2214         sctx.rs = rs;
2215         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2216         if (sctx.newshift) goto endofwhile;
2217 
2218         if (*pc2 != 0.0 || *pc3 != 0.0) {
2219           mul2   = (*pc2)/(*pc1);
2220           mul3   = (*pc3)/(*pc1);
2221           *pc2   = mul2;
2222           *pc3   = mul3;
2223           nz_tmp = bi[prow+1] - bd[prow] - 1;
2224           pj     = nbj + bd[prow];
2225           for (j=0; j<nz_tmp; j++) {
2226             idx          = pj[j];
2227             tmp          = rtmp11[idx];
2228             rtmp22[idx] -= mul2 * tmp;
2229             rtmp33[idx] -= mul3 * tmp;
2230           }
2231           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2232         }
2233         ++prow;
2234 
2235         pc2     = rtmp22 + prow;
2236         pc3     = rtmp33 + prow;
2237         sctx.pv = *pc2;
2238         pj      = bj + bi[prow];
2239         rs      = 0.0;
2240         for (j=0; j<nz; j++) {
2241           idx = pj[j];
2242           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2243         }
2244         sctx.rs = rs;
2245         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2246         if (sctx.newshift) goto endofwhile;
2247 
2248         if (*pc3 != 0.0) {
2249           mul3   = (*pc3)/(*pc2);
2250           *pc3   = mul3;
2251           pj     = nbj + bd[prow];
2252           nz_tmp = bi[prow+1] - bd[prow] - 1;
2253           for (j=0; j<nz_tmp; j++) {
2254             idx          = pj[j];
2255             tmp          = rtmp22[idx];
2256             rtmp33[idx] -= mul3 * tmp;
2257           }
2258           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2259         }
2260 
2261         pj  = bj + bi[row];
2262         pc1 = ba + bi[row];
2263         pc2 = ba + bi[row+1];
2264         pc3 = ba + bi[row+2];
2265 
2266         sctx.pv       = rtmp33[row+2];
2267         rs            = 0.0;
2268         rtmp11[row]   = 1.0/rtmp11[row];
2269         rtmp22[row+1] = 1.0/rtmp22[row+1];
2270         rtmp33[row+2] = 1.0/rtmp33[row+2];
2271         /* copy row entries from dense representation to sparse */
2272         for (j=0; j<nz; j++) {
2273           idx    = pj[j];
2274           pc1[j] = rtmp11[idx];
2275           pc2[j] = rtmp22[idx];
2276           pc3[j] = rtmp33[idx];
2277           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2278         }
2279 
2280         sctx.rs = rs;
2281         ierr    = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2282         if (sctx.newshift) goto endofwhile;
2283         break;
2284 
2285       default:
2286         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2287       }
2288       row += nodesz;                 /* Update the row */
2289     }
2290 endofwhile:;
2291   } while (sctx.newshift);
2292   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2293   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2294   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2295   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2296   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2297 
2298   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2299   /* do not set solve add, since MatSolve_Inode + Add is faster */
2300   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2301   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2302   C->assembled              = PETSC_TRUE;
2303   C->preallocated           = PETSC_TRUE;
2304   if (sctx.nshift) {
2305     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2306       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2307     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2308       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2309     }
2310   }
2311   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2312   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2313   PetscFunctionReturn(0);
2314 }
2315 
2316 
2317 /* ----------------------------------------------------------- */
2318 #undef __FUNCT__
2319 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2320 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2321 {
2322   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2323   IS                iscol = a->col,isrow = a->row;
2324   PetscErrorCode    ierr;
2325   const PetscInt    *r,*c,*rout,*cout;
2326   PetscInt          i,j,n = A->rmap->n;
2327   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2328   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2329   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2330   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2331   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2332   const PetscScalar *b;
2333 
2334   PetscFunctionBegin;
2335   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2336   node_max = a->inode.node_count;
2337   ns       = a->inode.size;     /* Node Size array */
2338 
2339   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2340   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2341   tmp  = a->solve_work;
2342 
2343   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2344   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2345 
2346   /* forward solve the lower triangular */
2347   tmps = tmp;
2348   aa   = a_a;
2349   aj   = a_j;
2350   ad   = a->diag;
2351 
2352   for (i = 0,row = 0; i< node_max; ++i) {
2353     nsz = ns[i];
2354     aii = ai[row];
2355     v1  = aa + aii;
2356     vi  = aj + aii;
2357     nz  = ai[row+1]- ai[row];
2358 
2359     if (i < node_max-1) {
2360       /* Prefetch the indices for the next block */
2361       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2362       /* Prefetch the data for the next block */
2363       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2364     }
2365 
2366     switch (nsz) {               /* Each loop in 'case' is unrolled */
2367     case 1:
2368       sum1 = b[r[row]];
2369       for (j=0; j<nz-1; j+=2) {
2370         i0    = vi[j];
2371         i1    = vi[j+1];
2372         tmp0  = tmps[i0];
2373         tmp1  = tmps[i1];
2374         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2375       }
2376       if (j == nz-1) {
2377         tmp0  = tmps[vi[j]];
2378         sum1 -= v1[j]*tmp0;
2379       }
2380       tmp[row++]=sum1;
2381       break;
2382     case 2:
2383       sum1 = b[r[row]];
2384       sum2 = b[r[row+1]];
2385       v2   = aa + ai[row+1];
2386 
2387       for (j=0; j<nz-1; j+=2) {
2388         i0    = vi[j];
2389         i1    = vi[j+1];
2390         tmp0  = tmps[i0];
2391         tmp1  = tmps[i1];
2392         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394       }
2395       if (j == nz-1) {
2396         tmp0  = tmps[vi[j]];
2397         sum1 -= v1[j] *tmp0;
2398         sum2 -= v2[j] *tmp0;
2399       }
2400       sum2     -= v2[nz] * sum1;
2401       tmp[row++]=sum1;
2402       tmp[row++]=sum2;
2403       break;
2404     case 3:
2405       sum1 = b[r[row]];
2406       sum2 = b[r[row+1]];
2407       sum3 = b[r[row+2]];
2408       v2   = aa + ai[row+1];
2409       v3   = aa + ai[row+2];
2410 
2411       for (j=0; j<nz-1; j+=2) {
2412         i0    = vi[j];
2413         i1    = vi[j+1];
2414         tmp0  = tmps[i0];
2415         tmp1  = tmps[i1];
2416         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2417         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2418         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2419       }
2420       if (j == nz-1) {
2421         tmp0  = tmps[vi[j]];
2422         sum1 -= v1[j] *tmp0;
2423         sum2 -= v2[j] *tmp0;
2424         sum3 -= v3[j] *tmp0;
2425       }
2426       sum2     -= v2[nz] * sum1;
2427       sum3     -= v3[nz] * sum1;
2428       sum3     -= v3[nz+1] * sum2;
2429       tmp[row++]=sum1;
2430       tmp[row++]=sum2;
2431       tmp[row++]=sum3;
2432       break;
2433 
2434     case 4:
2435       sum1 = b[r[row]];
2436       sum2 = b[r[row+1]];
2437       sum3 = b[r[row+2]];
2438       sum4 = b[r[row+3]];
2439       v2   = aa + ai[row+1];
2440       v3   = aa + ai[row+2];
2441       v4   = aa + ai[row+3];
2442 
2443       for (j=0; j<nz-1; j+=2) {
2444         i0    = vi[j];
2445         i1    = vi[j+1];
2446         tmp0  = tmps[i0];
2447         tmp1  = tmps[i1];
2448         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2449         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2450         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2451         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2452       }
2453       if (j == nz-1) {
2454         tmp0  = tmps[vi[j]];
2455         sum1 -= v1[j] *tmp0;
2456         sum2 -= v2[j] *tmp0;
2457         sum3 -= v3[j] *tmp0;
2458         sum4 -= v4[j] *tmp0;
2459       }
2460       sum2 -= v2[nz] * sum1;
2461       sum3 -= v3[nz] * sum1;
2462       sum4 -= v4[nz] * sum1;
2463       sum3 -= v3[nz+1] * sum2;
2464       sum4 -= v4[nz+1] * sum2;
2465       sum4 -= v4[nz+2] * sum3;
2466 
2467       tmp[row++]=sum1;
2468       tmp[row++]=sum2;
2469       tmp[row++]=sum3;
2470       tmp[row++]=sum4;
2471       break;
2472     case 5:
2473       sum1 = b[r[row]];
2474       sum2 = b[r[row+1]];
2475       sum3 = b[r[row+2]];
2476       sum4 = b[r[row+3]];
2477       sum5 = b[r[row+4]];
2478       v2   = aa + ai[row+1];
2479       v3   = aa + ai[row+2];
2480       v4   = aa + ai[row+3];
2481       v5   = aa + ai[row+4];
2482 
2483       for (j=0; j<nz-1; j+=2) {
2484         i0    = vi[j];
2485         i1    = vi[j+1];
2486         tmp0  = tmps[i0];
2487         tmp1  = tmps[i1];
2488         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2489         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2490         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2491         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2492         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2493       }
2494       if (j == nz-1) {
2495         tmp0  = tmps[vi[j]];
2496         sum1 -= v1[j] *tmp0;
2497         sum2 -= v2[j] *tmp0;
2498         sum3 -= v3[j] *tmp0;
2499         sum4 -= v4[j] *tmp0;
2500         sum5 -= v5[j] *tmp0;
2501       }
2502 
2503       sum2 -= v2[nz] * sum1;
2504       sum3 -= v3[nz] * sum1;
2505       sum4 -= v4[nz] * sum1;
2506       sum5 -= v5[nz] * sum1;
2507       sum3 -= v3[nz+1] * sum2;
2508       sum4 -= v4[nz+1] * sum2;
2509       sum5 -= v5[nz+1] * sum2;
2510       sum4 -= v4[nz+2] * sum3;
2511       sum5 -= v5[nz+2] * sum3;
2512       sum5 -= v5[nz+3] * sum4;
2513 
2514       tmp[row++]=sum1;
2515       tmp[row++]=sum2;
2516       tmp[row++]=sum3;
2517       tmp[row++]=sum4;
2518       tmp[row++]=sum5;
2519       break;
2520     default:
2521       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2522     }
2523   }
2524   /* backward solve the upper triangular */
2525   for (i=node_max -1,row = n-1; i>=0; i--) {
2526     nsz = ns[i];
2527     aii = ad[row+1] + 1;
2528     v1  = aa + aii;
2529     vi  = aj + aii;
2530     nz  = ad[row]- ad[row+1] - 1;
2531 
2532     if (i > 0) {
2533       /* Prefetch the indices for the next block */
2534       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2535       /* Prefetch the data for the next block */
2536       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2537     }
2538 
2539     switch (nsz) {               /* Each loop in 'case' is unrolled */
2540     case 1:
2541       sum1 = tmp[row];
2542 
2543       for (j=0; j<nz-1; j+=2) {
2544         i0    = vi[j];
2545         i1    = vi[j+1];
2546         tmp0  = tmps[i0];
2547         tmp1  = tmps[i1];
2548         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2549       }
2550       if (j == nz-1) {
2551         tmp0  = tmps[vi[j]];
2552         sum1 -= v1[j]*tmp0;
2553       }
2554       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2555       break;
2556     case 2:
2557       sum1 = tmp[row];
2558       sum2 = tmp[row-1];
2559       v2   = aa + ad[row] + 1;
2560       for (j=0; j<nz-1; j+=2) {
2561         i0    = vi[j];
2562         i1    = vi[j+1];
2563         tmp0  = tmps[i0];
2564         tmp1  = tmps[i1];
2565         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567       }
2568       if (j == nz-1) {
2569         tmp0  = tmps[vi[j]];
2570         sum1 -= v1[j]* tmp0;
2571         sum2 -= v2[j+1]* tmp0;
2572       }
2573 
2574       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2575       sum2     -= v2[0] * tmp0;
2576       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2577       break;
2578     case 3:
2579       sum1 = tmp[row];
2580       sum2 = tmp[row -1];
2581       sum3 = tmp[row -2];
2582       v2   = aa + ad[row] + 1;
2583       v3   = aa + ad[row -1] + 1;
2584       for (j=0; j<nz-1; j+=2) {
2585         i0    = vi[j];
2586         i1    = vi[j+1];
2587         tmp0  = tmps[i0];
2588         tmp1  = tmps[i1];
2589         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2590         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2591         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2592       }
2593       if (j== nz-1) {
2594         tmp0  = tmps[vi[j]];
2595         sum1 -= v1[j] * tmp0;
2596         sum2 -= v2[j+1] * tmp0;
2597         sum3 -= v3[j+2] * tmp0;
2598       }
2599       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2600       sum2     -= v2[0]* tmp0;
2601       sum3     -= v3[1] * tmp0;
2602       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2603       sum3     -= v3[0]* tmp0;
2604       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2605 
2606       break;
2607     case 4:
2608       sum1 = tmp[row];
2609       sum2 = tmp[row -1];
2610       sum3 = tmp[row -2];
2611       sum4 = tmp[row -3];
2612       v2   = aa + ad[row]+1;
2613       v3   = aa + ad[row -1]+1;
2614       v4   = aa + ad[row -2]+1;
2615 
2616       for (j=0; j<nz-1; j+=2) {
2617         i0    = vi[j];
2618         i1    = vi[j+1];
2619         tmp0  = tmps[i0];
2620         tmp1  = tmps[i1];
2621         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2622         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2623         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2624         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2625       }
2626       if (j== nz-1) {
2627         tmp0  = tmps[vi[j]];
2628         sum1 -= v1[j] * tmp0;
2629         sum2 -= v2[j+1] * tmp0;
2630         sum3 -= v3[j+2] * tmp0;
2631         sum4 -= v4[j+3] * tmp0;
2632       }
2633 
2634       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2635       sum2     -= v2[0] * tmp0;
2636       sum3     -= v3[1] * tmp0;
2637       sum4     -= v4[2] * tmp0;
2638       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2639       sum3     -= v3[0] * tmp0;
2640       sum4     -= v4[1] * tmp0;
2641       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2642       sum4     -= v4[0] * tmp0;
2643       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2644       break;
2645     case 5:
2646       sum1 = tmp[row];
2647       sum2 = tmp[row -1];
2648       sum3 = tmp[row -2];
2649       sum4 = tmp[row -3];
2650       sum5 = tmp[row -4];
2651       v2   = aa + ad[row]+1;
2652       v3   = aa + ad[row -1]+1;
2653       v4   = aa + ad[row -2]+1;
2654       v5   = aa + ad[row -3]+1;
2655       for (j=0; j<nz-1; j+=2) {
2656         i0    = vi[j];
2657         i1    = vi[j+1];
2658         tmp0  = tmps[i0];
2659         tmp1  = tmps[i1];
2660         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2661         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2662         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2663         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2664         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2665       }
2666       if (j==nz-1) {
2667         tmp0  = tmps[vi[j]];
2668         sum1 -= v1[j] * tmp0;
2669         sum2 -= v2[j+1] * tmp0;
2670         sum3 -= v3[j+2] * tmp0;
2671         sum4 -= v4[j+3] * tmp0;
2672         sum5 -= v5[j+4] * tmp0;
2673       }
2674 
2675       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2676       sum2     -= v2[0] * tmp0;
2677       sum3     -= v3[1] * tmp0;
2678       sum4     -= v4[2] * tmp0;
2679       sum5     -= v5[3] * tmp0;
2680       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2681       sum3     -= v3[0] * tmp0;
2682       sum4     -= v4[1] * tmp0;
2683       sum5     -= v5[2] * tmp0;
2684       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2685       sum4     -= v4[0] * tmp0;
2686       sum5     -= v5[1] * tmp0;
2687       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2688       sum5     -= v5[0] * tmp0;
2689       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2690       break;
2691     default:
2692       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2693     }
2694   }
2695   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2696   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2697   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2698   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2699   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2700   PetscFunctionReturn(0);
2701 }
2702 
2703 
2704 /*
2705      Makes a longer coloring[] array and calls the usual code with that
2706 */
2707 #undef __FUNCT__
2708 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2709 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2710 {
2711   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2712   PetscErrorCode  ierr;
2713   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2714   PetscInt        *colorused,i;
2715   ISColoringValue *newcolor;
2716 
2717   PetscFunctionBegin;
2718   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2719   /* loop over inodes, marking a color for each column*/
2720   row = 0;
2721   for (i=0; i<m; i++) {
2722     for (j=0; j<ns[i]; j++) {
2723       newcolor[row++] = coloring[i] + j*ncolors;
2724     }
2725   }
2726 
2727   /* eliminate unneeded colors */
2728   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2729   for (i=0; i<n; i++) {
2730     colorused[newcolor[i]] = 1;
2731   }
2732 
2733   for (i=1; i<5*ncolors; i++) {
2734     colorused[i] += colorused[i-1];
2735   }
2736   ncolors = colorused[5*ncolors-1];
2737   for (i=0; i<n; i++) {
2738     newcolor[i] = colorused[newcolor[i]]-1;
2739   }
2740   ierr = PetscFree(colorused);CHKERRQ(ierr);
2741   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2742   ierr = PetscFree(coloring);CHKERRQ(ierr);
2743   PetscFunctionReturn(0);
2744 }
2745 
2746 #include <petsc/private/kernels/blockinvert.h>
2747 
2748 #undef __FUNCT__
2749 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2750 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2751 {
2752   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2753   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2754   MatScalar         *ibdiag,*bdiag,work[25],*t;
2755   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2756   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2757   const PetscScalar *xb, *b;
2758   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2759   PetscErrorCode    ierr;
2760   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2761   PetscInt          sz,k,ipvt[5];
2762   PetscBool         allowzeropivot,zeropivotdetected=PETSC_FALSE;
2763   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2764 
2765   PetscFunctionBegin;
2766   allowzeropivot = PetscNot(A->erroriffailure);
2767   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2768   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2769   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2770 
2771   if (!a->inode.ibdiagvalid) {
2772     if (!a->inode.ibdiag) {
2773       /* calculate space needed for diagonal blocks */
2774       for (i=0; i<m; i++) {
2775         cnt += sizes[i]*sizes[i];
2776       }
2777       a->inode.bdiagsize = cnt;
2778 
2779       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2780     }
2781 
2782     /* copy over the diagonal blocks and invert them */
2783     ibdiag = a->inode.ibdiag;
2784     bdiag  = a->inode.bdiag;
2785     cnt    = 0;
2786     for (i=0, row = 0; i<m; i++) {
2787       for (j=0; j<sizes[i]; j++) {
2788         for (k=0; k<sizes[i]; k++) {
2789           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2790         }
2791       }
2792       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2793 
2794       switch (sizes[i]) {
2795       case 1:
2796         /* Create matrix data structure */
2797         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2798           if (allowzeropivot) {
2799             zeropivotdetected = PETSC_TRUE;
2800             ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr);
2801           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2802         }
2803         ibdiag[cnt] = 1.0/ibdiag[cnt];
2804         break;
2805       case 2:
2806         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2807         break;
2808       case 3:
2809         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2810         break;
2811       case 4:
2812         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2813         break;
2814       case 5:
2815         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2816         break;
2817       default:
2818         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2819       }
2820       if (zeropivotdetected) {
2821         A->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2822       }
2823 
2824       cnt += sizes[i]*sizes[i];
2825       row += sizes[i];
2826     }
2827     a->inode.ibdiagvalid = PETSC_TRUE;
2828   }
2829   ibdiag = a->inode.ibdiag;
2830   bdiag  = a->inode.bdiag;
2831   t      = a->inode.ssor_work;
2832 
2833   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2834   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2835   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2836   if (flag & SOR_ZERO_INITIAL_GUESS) {
2837     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2838 
2839       for (i=0, row=0; i<m; i++) {
2840         sz  = diag[row] - ii[row];
2841         v1  = a->a + ii[row];
2842         idx = a->j + ii[row];
2843 
2844         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2845         switch (sizes[i]) {
2846         case 1:
2847 
2848           sum1 = b[row];
2849           for (n = 0; n<sz-1; n+=2) {
2850             i1    = idx[0];
2851             i2    = idx[1];
2852             idx  += 2;
2853             tmp0  = x[i1];
2854             tmp1  = x[i2];
2855             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2856           }
2857 
2858           if (n == sz-1) {
2859             tmp0  = x[*idx];
2860             sum1 -= *v1 * tmp0;
2861           }
2862           t[row]   = sum1;
2863           x[row++] = sum1*(*ibdiag++);
2864           break;
2865         case 2:
2866           v2   = a->a + ii[row+1];
2867           sum1 = b[row];
2868           sum2 = b[row+1];
2869           for (n = 0; n<sz-1; n+=2) {
2870             i1    = idx[0];
2871             i2    = idx[1];
2872             idx  += 2;
2873             tmp0  = x[i1];
2874             tmp1  = x[i2];
2875             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2876             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2877           }
2878 
2879           if (n == sz-1) {
2880             tmp0  = x[*idx];
2881             sum1 -= v1[0] * tmp0;
2882             sum2 -= v2[0] * tmp0;
2883           }
2884           t[row]   = sum1;
2885           t[row+1] = sum2;
2886           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2887           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2888           ibdiag  += 4;
2889           break;
2890         case 3:
2891           v2   = a->a + ii[row+1];
2892           v3   = a->a + ii[row+2];
2893           sum1 = b[row];
2894           sum2 = b[row+1];
2895           sum3 = b[row+2];
2896           for (n = 0; n<sz-1; n+=2) {
2897             i1    = idx[0];
2898             i2    = idx[1];
2899             idx  += 2;
2900             tmp0  = x[i1];
2901             tmp1  = x[i2];
2902             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2903             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2904             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2905           }
2906 
2907           if (n == sz-1) {
2908             tmp0  = x[*idx];
2909             sum1 -= v1[0] * tmp0;
2910             sum2 -= v2[0] * tmp0;
2911             sum3 -= v3[0] * tmp0;
2912           }
2913           t[row]   = sum1;
2914           t[row+1] = sum2;
2915           t[row+2] = sum3;
2916           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2917           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2918           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2919           ibdiag  += 9;
2920           break;
2921         case 4:
2922           v2   = a->a + ii[row+1];
2923           v3   = a->a + ii[row+2];
2924           v4   = a->a + ii[row+3];
2925           sum1 = b[row];
2926           sum2 = b[row+1];
2927           sum3 = b[row+2];
2928           sum4 = b[row+3];
2929           for (n = 0; n<sz-1; n+=2) {
2930             i1    = idx[0];
2931             i2    = idx[1];
2932             idx  += 2;
2933             tmp0  = x[i1];
2934             tmp1  = x[i2];
2935             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2936             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2937             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2938             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2939           }
2940 
2941           if (n == sz-1) {
2942             tmp0  = x[*idx];
2943             sum1 -= v1[0] * tmp0;
2944             sum2 -= v2[0] * tmp0;
2945             sum3 -= v3[0] * tmp0;
2946             sum4 -= v4[0] * tmp0;
2947           }
2948           t[row]   = sum1;
2949           t[row+1] = sum2;
2950           t[row+2] = sum3;
2951           t[row+3] = sum4;
2952           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2953           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2954           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2955           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2956           ibdiag  += 16;
2957           break;
2958         case 5:
2959           v2   = a->a + ii[row+1];
2960           v3   = a->a + ii[row+2];
2961           v4   = a->a + ii[row+3];
2962           v5   = a->a + ii[row+4];
2963           sum1 = b[row];
2964           sum2 = b[row+1];
2965           sum3 = b[row+2];
2966           sum4 = b[row+3];
2967           sum5 = b[row+4];
2968           for (n = 0; n<sz-1; n+=2) {
2969             i1    = idx[0];
2970             i2    = idx[1];
2971             idx  += 2;
2972             tmp0  = x[i1];
2973             tmp1  = x[i2];
2974             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2975             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2976             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2977             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2978             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2979           }
2980 
2981           if (n == sz-1) {
2982             tmp0  = x[*idx];
2983             sum1 -= v1[0] * tmp0;
2984             sum2 -= v2[0] * tmp0;
2985             sum3 -= v3[0] * tmp0;
2986             sum4 -= v4[0] * tmp0;
2987             sum5 -= v5[0] * tmp0;
2988           }
2989           t[row]   = sum1;
2990           t[row+1] = sum2;
2991           t[row+2] = sum3;
2992           t[row+3] = sum4;
2993           t[row+4] = sum5;
2994           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2995           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2996           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2997           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2998           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2999           ibdiag  += 25;
3000           break;
3001         default:
3002           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3003         }
3004       }
3005 
3006       xb   = t;
3007       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3008     } else xb = b;
3009     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3010 
3011       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3012       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3013         ibdiag -= sizes[i]*sizes[i];
3014         sz      = ii[row+1] - diag[row] - 1;
3015         v1      = a->a + diag[row] + 1;
3016         idx     = a->j + diag[row] + 1;
3017 
3018         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3019         switch (sizes[i]) {
3020         case 1:
3021 
3022           sum1 = xb[row];
3023           for (n = 0; n<sz-1; n+=2) {
3024             i1    = idx[0];
3025             i2    = idx[1];
3026             idx  += 2;
3027             tmp0  = x[i1];
3028             tmp1  = x[i2];
3029             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3030           }
3031 
3032           if (n == sz-1) {
3033             tmp0  = x[*idx];
3034             sum1 -= *v1*tmp0;
3035           }
3036           x[row--] = sum1*(*ibdiag);
3037           break;
3038 
3039         case 2:
3040 
3041           sum1 = xb[row];
3042           sum2 = xb[row-1];
3043           /* note that sum1 is associated with the second of the two rows */
3044           v2 = a->a + diag[row-1] + 2;
3045           for (n = 0; n<sz-1; n+=2) {
3046             i1    = idx[0];
3047             i2    = idx[1];
3048             idx  += 2;
3049             tmp0  = x[i1];
3050             tmp1  = x[i2];
3051             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3052             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3053           }
3054 
3055           if (n == sz-1) {
3056             tmp0  = x[*idx];
3057             sum1 -= *v1*tmp0;
3058             sum2 -= *v2*tmp0;
3059           }
3060           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3061           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3062           break;
3063         case 3:
3064 
3065           sum1 = xb[row];
3066           sum2 = xb[row-1];
3067           sum3 = xb[row-2];
3068           v2   = a->a + diag[row-1] + 2;
3069           v3   = a->a + diag[row-2] + 3;
3070           for (n = 0; n<sz-1; n+=2) {
3071             i1    = idx[0];
3072             i2    = idx[1];
3073             idx  += 2;
3074             tmp0  = x[i1];
3075             tmp1  = x[i2];
3076             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3077             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3078             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3079           }
3080 
3081           if (n == sz-1) {
3082             tmp0  = x[*idx];
3083             sum1 -= *v1*tmp0;
3084             sum2 -= *v2*tmp0;
3085             sum3 -= *v3*tmp0;
3086           }
3087           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3088           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3089           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3090           break;
3091         case 4:
3092 
3093           sum1 = xb[row];
3094           sum2 = xb[row-1];
3095           sum3 = xb[row-2];
3096           sum4 = xb[row-3];
3097           v2   = a->a + diag[row-1] + 2;
3098           v3   = a->a + diag[row-2] + 3;
3099           v4   = a->a + diag[row-3] + 4;
3100           for (n = 0; n<sz-1; n+=2) {
3101             i1    = idx[0];
3102             i2    = idx[1];
3103             idx  += 2;
3104             tmp0  = x[i1];
3105             tmp1  = x[i2];
3106             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3107             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3108             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3109             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3110           }
3111 
3112           if (n == sz-1) {
3113             tmp0  = x[*idx];
3114             sum1 -= *v1*tmp0;
3115             sum2 -= *v2*tmp0;
3116             sum3 -= *v3*tmp0;
3117             sum4 -= *v4*tmp0;
3118           }
3119           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3120           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3121           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3122           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3123           break;
3124         case 5:
3125 
3126           sum1 = xb[row];
3127           sum2 = xb[row-1];
3128           sum3 = xb[row-2];
3129           sum4 = xb[row-3];
3130           sum5 = xb[row-4];
3131           v2   = a->a + diag[row-1] + 2;
3132           v3   = a->a + diag[row-2] + 3;
3133           v4   = a->a + diag[row-3] + 4;
3134           v5   = a->a + diag[row-4] + 5;
3135           for (n = 0; n<sz-1; n+=2) {
3136             i1    = idx[0];
3137             i2    = idx[1];
3138             idx  += 2;
3139             tmp0  = x[i1];
3140             tmp1  = x[i2];
3141             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3142             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3143             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3144             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3145             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3146           }
3147 
3148           if (n == sz-1) {
3149             tmp0  = x[*idx];
3150             sum1 -= *v1*tmp0;
3151             sum2 -= *v2*tmp0;
3152             sum3 -= *v3*tmp0;
3153             sum4 -= *v4*tmp0;
3154             sum5 -= *v5*tmp0;
3155           }
3156           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3157           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3158           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3159           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3160           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3161           break;
3162         default:
3163           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3164         }
3165       }
3166 
3167       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3168     }
3169     its--;
3170   }
3171   while (its--) {
3172 
3173     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3174       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3175            i<m;
3176            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3177 
3178         sz  = diag[row] - ii[row];
3179         v1  = a->a + ii[row];
3180         idx = a->j + ii[row];
3181         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3182         switch (sizes[i]) {
3183         case 1:
3184           sum1 = b[row];
3185           for (n = 0; n<sz-1; n+=2) {
3186             i1    = idx[0];
3187             i2    = idx[1];
3188             idx  += 2;
3189             tmp0  = x[i1];
3190             tmp1  = x[i2];
3191             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3192           }
3193           if (n == sz-1) {
3194             tmp0  = x[*idx++];
3195             sum1 -= *v1 * tmp0;
3196             v1++;
3197           }
3198           t[row]   = sum1;
3199           sz      = ii[row+1] - diag[row] - 1;
3200           idx     = a->j + diag[row] + 1;
3201           v1 += 1;
3202           for (n = 0; n<sz-1; n+=2) {
3203             i1    = idx[0];
3204             i2    = idx[1];
3205             idx  += 2;
3206             tmp0  = x[i1];
3207             tmp1  = x[i2];
3208             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3209           }
3210           if (n == sz-1) {
3211             tmp0  = x[*idx++];
3212             sum1 -= *v1 * tmp0;
3213           }
3214           /* in MatSOR_SeqAIJ this line would be
3215            *
3216            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3217            *
3218            * but omega == 1, so this becomes
3219            *
3220            * x[row] = sum1*(*ibdiag++);
3221            *
3222            */
3223           x[row] = sum1*(*ibdiag);
3224           break;
3225         case 2:
3226           v2   = a->a + ii[row+1];
3227           sum1 = b[row];
3228           sum2 = b[row+1];
3229           for (n = 0; n<sz-1; n+=2) {
3230             i1    = idx[0];
3231             i2    = idx[1];
3232             idx  += 2;
3233             tmp0  = x[i1];
3234             tmp1  = x[i2];
3235             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3236             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3237           }
3238           if (n == sz-1) {
3239             tmp0  = x[*idx++];
3240             sum1 -= v1[0] * tmp0;
3241             sum2 -= v2[0] * tmp0;
3242             v1++; v2++;
3243           }
3244           t[row]   = sum1;
3245           t[row+1] = sum2;
3246           sz      = ii[row+1] - diag[row] - 2;
3247           idx     = a->j + diag[row] + 2;
3248           v1 += 2;
3249           v2 += 2;
3250           for (n = 0; n<sz-1; n+=2) {
3251             i1    = idx[0];
3252             i2    = idx[1];
3253             idx  += 2;
3254             tmp0  = x[i1];
3255             tmp1  = x[i2];
3256             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3257             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3258           }
3259           if (n == sz-1) {
3260             tmp0  = x[*idx];
3261             sum1 -= v1[0] * tmp0;
3262             sum2 -= v2[0] * tmp0;
3263           }
3264           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3265           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3266           break;
3267         case 3:
3268           v2   = a->a + ii[row+1];
3269           v3   = a->a + ii[row+2];
3270           sum1 = b[row];
3271           sum2 = b[row+1];
3272           sum3 = b[row+2];
3273           for (n = 0; n<sz-1; n+=2) {
3274             i1    = idx[0];
3275             i2    = idx[1];
3276             idx  += 2;
3277             tmp0  = x[i1];
3278             tmp1  = x[i2];
3279             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3280             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3281             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3282           }
3283           if (n == sz-1) {
3284             tmp0  = x[*idx++];
3285             sum1 -= v1[0] * tmp0;
3286             sum2 -= v2[0] * tmp0;
3287             sum3 -= v3[0] * tmp0;
3288             v1++; v2++; v3++;
3289           }
3290           t[row]   = sum1;
3291           t[row+1] = sum2;
3292           t[row+2] = sum3;
3293           sz      = ii[row+1] - diag[row] - 3;
3294           idx     = a->j + diag[row] + 3;
3295           v1 += 3;
3296           v2 += 3;
3297           v3 += 3;
3298           for (n = 0; n<sz-1; n+=2) {
3299             i1    = idx[0];
3300             i2    = idx[1];
3301             idx  += 2;
3302             tmp0  = x[i1];
3303             tmp1  = x[i2];
3304             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3305             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3306             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3307           }
3308           if (n == sz-1) {
3309             tmp0  = x[*idx];
3310             sum1 -= v1[0] * tmp0;
3311             sum2 -= v2[0] * tmp0;
3312             sum3 -= v3[0] * tmp0;
3313           }
3314           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3315           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3316           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3317           break;
3318         case 4:
3319           v2   = a->a + ii[row+1];
3320           v3   = a->a + ii[row+2];
3321           v4   = a->a + ii[row+3];
3322           sum1 = b[row];
3323           sum2 = b[row+1];
3324           sum3 = b[row+2];
3325           sum4 = b[row+3];
3326           for (n = 0; n<sz-1; n+=2) {
3327             i1    = idx[0];
3328             i2    = idx[1];
3329             idx  += 2;
3330             tmp0  = x[i1];
3331             tmp1  = x[i2];
3332             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3333             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3334             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3335             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3336           }
3337           if (n == sz-1) {
3338             tmp0  = x[*idx++];
3339             sum1 -= v1[0] * tmp0;
3340             sum2 -= v2[0] * tmp0;
3341             sum3 -= v3[0] * tmp0;
3342             sum4 -= v4[0] * tmp0;
3343             v1++; v2++; v3++; v4++;
3344           }
3345           t[row]   = sum1;
3346           t[row+1] = sum2;
3347           t[row+2] = sum3;
3348           t[row+3] = sum4;
3349           sz      = ii[row+1] - diag[row] - 4;
3350           idx     = a->j + diag[row] + 4;
3351           v1 += 4;
3352           v2 += 4;
3353           v3 += 4;
3354           v4 += 4;
3355           for (n = 0; n<sz-1; n+=2) {
3356             i1    = idx[0];
3357             i2    = idx[1];
3358             idx  += 2;
3359             tmp0  = x[i1];
3360             tmp1  = x[i2];
3361             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3362             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3363             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3364             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3365           }
3366           if (n == sz-1) {
3367             tmp0  = x[*idx];
3368             sum1 -= v1[0] * tmp0;
3369             sum2 -= v2[0] * tmp0;
3370             sum3 -= v3[0] * tmp0;
3371             sum4 -= v4[0] * tmp0;
3372           }
3373           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3374           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3375           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3376           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3377           break;
3378         case 5:
3379           v2   = a->a + ii[row+1];
3380           v3   = a->a + ii[row+2];
3381           v4   = a->a + ii[row+3];
3382           v5   = a->a + ii[row+4];
3383           sum1 = b[row];
3384           sum2 = b[row+1];
3385           sum3 = b[row+2];
3386           sum4 = b[row+3];
3387           sum5 = b[row+4];
3388           for (n = 0; n<sz-1; n+=2) {
3389             i1    = idx[0];
3390             i2    = idx[1];
3391             idx  += 2;
3392             tmp0  = x[i1];
3393             tmp1  = x[i2];
3394             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3395             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3396             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3397             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3398             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3399           }
3400           if (n == sz-1) {
3401             tmp0  = x[*idx++];
3402             sum1 -= v1[0] * tmp0;
3403             sum2 -= v2[0] * tmp0;
3404             sum3 -= v3[0] * tmp0;
3405             sum4 -= v4[0] * tmp0;
3406             sum5 -= v5[0] * tmp0;
3407             v1++; v2++; v3++; v4++; v5++;
3408           }
3409           t[row]   = sum1;
3410           t[row+1] = sum2;
3411           t[row+2] = sum3;
3412           t[row+3] = sum4;
3413           t[row+4] = sum5;
3414           sz      = ii[row+1] - diag[row] - 5;
3415           idx     = a->j + diag[row] + 5;
3416           v1 += 5;
3417           v2 += 5;
3418           v3 += 5;
3419           v4 += 5;
3420           v5 += 5;
3421           for (n = 0; n<sz-1; n+=2) {
3422             i1    = idx[0];
3423             i2    = idx[1];
3424             idx  += 2;
3425             tmp0  = x[i1];
3426             tmp1  = x[i2];
3427             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3428             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3429             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3430             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3431             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3432           }
3433           if (n == sz-1) {
3434             tmp0  = x[*idx];
3435             sum1 -= v1[0] * tmp0;
3436             sum2 -= v2[0] * tmp0;
3437             sum3 -= v3[0] * tmp0;
3438             sum4 -= v4[0] * tmp0;
3439             sum5 -= v5[0] * tmp0;
3440           }
3441           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3442           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3443           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3444           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3445           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3446           break;
3447         default:
3448           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3449         }
3450       }
3451       xb   = t;
3452       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3453     } else xb = b;
3454 
3455     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3456 
3457       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3458       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3459         ibdiag -= sizes[i]*sizes[i];
3460 
3461         /* set RHS */
3462         if (xb == b) {
3463           /* whole (old way) */
3464           sz      = ii[row+1] - ii[row];
3465           idx     = a->j + ii[row];
3466           switch (sizes[i]) {
3467           case 5:
3468             v5      = a->a + ii[row-4];
3469           case 4: /* fall through */
3470             v4      = a->a + ii[row-3];
3471           case 3:
3472             v3      = a->a + ii[row-2];
3473           case 2:
3474             v2      = a->a + ii[row-1];
3475           case 1:
3476             v1      = a->a + ii[row];
3477             break;
3478           default:
3479             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3480           }
3481         } else {
3482           /* upper, no diag */
3483           sz      = ii[row+1] - diag[row] - 1;
3484           idx     = a->j + diag[row] + 1;
3485           switch (sizes[i]) {
3486           case 5:
3487             v5      = a->a + diag[row-4] + 5;
3488           case 4: /* fall through */
3489             v4      = a->a + diag[row-3] + 4;
3490           case 3:
3491             v3      = a->a + diag[row-2] + 3;
3492           case 2:
3493             v2      = a->a + diag[row-1] + 2;
3494           case 1:
3495             v1      = a->a + diag[row] + 1;
3496           }
3497         }
3498         /* set sum */
3499         switch (sizes[i]) {
3500         case 5:
3501           sum5 = xb[row-4];
3502         case 4: /* fall through */
3503           sum4 = xb[row-3];
3504         case 3:
3505           sum3 = xb[row-2];
3506         case 2:
3507           sum2 = xb[row-1];
3508         case 1:
3509           /* note that sum1 is associated with the last row */
3510           sum1 = xb[row];
3511         }
3512         /* do sums */
3513         for (n = 0; n<sz-1; n+=2) {
3514             i1    = idx[0];
3515             i2    = idx[1];
3516             idx  += 2;
3517             tmp0  = x[i1];
3518             tmp1  = x[i2];
3519             switch (sizes[i]) {
3520             case 5:
3521               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3522             case 4: /* fall through */
3523               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3524             case 3:
3525               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3526             case 2:
3527               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3528             case 1:
3529               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3530             }
3531         }
3532         /* ragged edge */
3533         if (n == sz-1) {
3534           tmp0  = x[*idx];
3535           switch (sizes[i]) {
3536           case 5:
3537             sum5 -= *v5*tmp0;
3538           case 4: /* fall through */
3539             sum4 -= *v4*tmp0;
3540           case 3:
3541             sum3 -= *v3*tmp0;
3542           case 2:
3543             sum2 -= *v2*tmp0;
3544           case 1:
3545             sum1 -= *v1*tmp0;
3546           }
3547         }
3548         /* update */
3549         if (xb == b) {
3550           /* whole (old way) w/ diag */
3551           switch (sizes[i]) {
3552           case 5:
3553             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3554             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3555             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3556             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3557             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3558             break;
3559           case 4:
3560             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3561             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3562             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3563             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3564             break;
3565           case 3:
3566             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3567             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3568             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3569             break;
3570           case 2:
3571             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3572             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3573             break;
3574           case 1:
3575             x[row--] += sum1*(*ibdiag);
3576             break;
3577           }
3578         } else {
3579           /* no diag so set =  */
3580           switch (sizes[i]) {
3581           case 5:
3582             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3583             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3584             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3585             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3586             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3587             break;
3588           case 4:
3589             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3590             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3591             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3592             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3593             break;
3594           case 3:
3595             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3596             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3597             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3598             break;
3599           case 2:
3600             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3601             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3602             break;
3603           case 1:
3604             x[row--] = sum1*(*ibdiag);
3605             break;
3606           }
3607         }
3608       }
3609       if (xb == b) {
3610         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3611       } else {
3612         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3613       }
3614     }
3615   }
3616   if (flag & SOR_EISENSTAT) {
3617     /*
3618           Apply  (U + D)^-1  where D is now the block diagonal
3619     */
3620     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3621     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3622       ibdiag -= sizes[i]*sizes[i];
3623       sz      = ii[row+1] - diag[row] - 1;
3624       v1      = a->a + diag[row] + 1;
3625       idx     = a->j + diag[row] + 1;
3626       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3627       switch (sizes[i]) {
3628       case 1:
3629 
3630         sum1 = b[row];
3631         for (n = 0; n<sz-1; n+=2) {
3632           i1    = idx[0];
3633           i2    = idx[1];
3634           idx  += 2;
3635           tmp0  = x[i1];
3636           tmp1  = x[i2];
3637           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3638         }
3639 
3640         if (n == sz-1) {
3641           tmp0  = x[*idx];
3642           sum1 -= *v1*tmp0;
3643         }
3644         x[row] = sum1*(*ibdiag);row--;
3645         break;
3646 
3647       case 2:
3648 
3649         sum1 = b[row];
3650         sum2 = b[row-1];
3651         /* note that sum1 is associated with the second of the two rows */
3652         v2 = a->a + diag[row-1] + 2;
3653         for (n = 0; n<sz-1; n+=2) {
3654           i1    = idx[0];
3655           i2    = idx[1];
3656           idx  += 2;
3657           tmp0  = x[i1];
3658           tmp1  = x[i2];
3659           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3660           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3661         }
3662 
3663         if (n == sz-1) {
3664           tmp0  = x[*idx];
3665           sum1 -= *v1*tmp0;
3666           sum2 -= *v2*tmp0;
3667         }
3668         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3669         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3670         row     -= 2;
3671         break;
3672       case 3:
3673 
3674         sum1 = b[row];
3675         sum2 = b[row-1];
3676         sum3 = b[row-2];
3677         v2   = a->a + diag[row-1] + 2;
3678         v3   = a->a + diag[row-2] + 3;
3679         for (n = 0; n<sz-1; n+=2) {
3680           i1    = idx[0];
3681           i2    = idx[1];
3682           idx  += 2;
3683           tmp0  = x[i1];
3684           tmp1  = x[i2];
3685           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3686           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3687           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3688         }
3689 
3690         if (n == sz-1) {
3691           tmp0  = x[*idx];
3692           sum1 -= *v1*tmp0;
3693           sum2 -= *v2*tmp0;
3694           sum3 -= *v3*tmp0;
3695         }
3696         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3697         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3698         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3699         row     -= 3;
3700         break;
3701       case 4:
3702 
3703         sum1 = b[row];
3704         sum2 = b[row-1];
3705         sum3 = b[row-2];
3706         sum4 = b[row-3];
3707         v2   = a->a + diag[row-1] + 2;
3708         v3   = a->a + diag[row-2] + 3;
3709         v4   = a->a + diag[row-3] + 4;
3710         for (n = 0; n<sz-1; n+=2) {
3711           i1    = idx[0];
3712           i2    = idx[1];
3713           idx  += 2;
3714           tmp0  = x[i1];
3715           tmp1  = x[i2];
3716           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3717           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3718           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3719           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3720         }
3721 
3722         if (n == sz-1) {
3723           tmp0  = x[*idx];
3724           sum1 -= *v1*tmp0;
3725           sum2 -= *v2*tmp0;
3726           sum3 -= *v3*tmp0;
3727           sum4 -= *v4*tmp0;
3728         }
3729         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3730         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3731         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3732         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3733         row     -= 4;
3734         break;
3735       case 5:
3736 
3737         sum1 = b[row];
3738         sum2 = b[row-1];
3739         sum3 = b[row-2];
3740         sum4 = b[row-3];
3741         sum5 = b[row-4];
3742         v2   = a->a + diag[row-1] + 2;
3743         v3   = a->a + diag[row-2] + 3;
3744         v4   = a->a + diag[row-3] + 4;
3745         v5   = a->a + diag[row-4] + 5;
3746         for (n = 0; n<sz-1; n+=2) {
3747           i1    = idx[0];
3748           i2    = idx[1];
3749           idx  += 2;
3750           tmp0  = x[i1];
3751           tmp1  = x[i2];
3752           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3753           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3754           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3755           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3756           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3757         }
3758 
3759         if (n == sz-1) {
3760           tmp0  = x[*idx];
3761           sum1 -= *v1*tmp0;
3762           sum2 -= *v2*tmp0;
3763           sum3 -= *v3*tmp0;
3764           sum4 -= *v4*tmp0;
3765           sum5 -= *v5*tmp0;
3766         }
3767         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3768         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3769         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3770         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3771         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3772         row     -= 5;
3773         break;
3774       default:
3775         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3776       }
3777     }
3778     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3779 
3780     /*
3781            t = b - D x    where D is the block diagonal
3782     */
3783     cnt = 0;
3784     for (i=0, row=0; i<m; i++) {
3785       switch (sizes[i]) {
3786       case 1:
3787         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3788         break;
3789       case 2:
3790         x1       = x[row]; x2 = x[row+1];
3791         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3792         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3793         t[row]   = b[row] - tmp1;
3794         t[row+1] = b[row+1] - tmp2; row += 2;
3795         cnt     += 4;
3796         break;
3797       case 3:
3798         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3799         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3800         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3801         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3802         t[row]   = b[row] - tmp1;
3803         t[row+1] = b[row+1] - tmp2;
3804         t[row+2] = b[row+2] - tmp3; row += 3;
3805         cnt     += 9;
3806         break;
3807       case 4:
3808         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3809         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3810         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3811         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3812         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3813         t[row]   = b[row] - tmp1;
3814         t[row+1] = b[row+1] - tmp2;
3815         t[row+2] = b[row+2] - tmp3;
3816         t[row+3] = b[row+3] - tmp4; row += 4;
3817         cnt     += 16;
3818         break;
3819       case 5:
3820         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3821         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3822         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3823         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3824         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3825         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3826         t[row]   = b[row] - tmp1;
3827         t[row+1] = b[row+1] - tmp2;
3828         t[row+2] = b[row+2] - tmp3;
3829         t[row+3] = b[row+3] - tmp4;
3830         t[row+4] = b[row+4] - tmp5;row += 5;
3831         cnt     += 25;
3832         break;
3833       default:
3834         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3835       }
3836     }
3837     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3838 
3839 
3840 
3841     /*
3842           Apply (L + D)^-1 where D is the block diagonal
3843     */
3844     for (i=0, row=0; i<m; i++) {
3845       sz  = diag[row] - ii[row];
3846       v1  = a->a + ii[row];
3847       idx = a->j + ii[row];
3848       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3849       switch (sizes[i]) {
3850       case 1:
3851 
3852         sum1 = t[row];
3853         for (n = 0; n<sz-1; n+=2) {
3854           i1    = idx[0];
3855           i2    = idx[1];
3856           idx  += 2;
3857           tmp0  = t[i1];
3858           tmp1  = t[i2];
3859           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3860         }
3861 
3862         if (n == sz-1) {
3863           tmp0  = t[*idx];
3864           sum1 -= *v1 * tmp0;
3865         }
3866         x[row] += t[row] = sum1*(*ibdiag++); row++;
3867         break;
3868       case 2:
3869         v2   = a->a + ii[row+1];
3870         sum1 = t[row];
3871         sum2 = t[row+1];
3872         for (n = 0; n<sz-1; n+=2) {
3873           i1    = idx[0];
3874           i2    = idx[1];
3875           idx  += 2;
3876           tmp0  = t[i1];
3877           tmp1  = t[i2];
3878           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3879           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3880         }
3881 
3882         if (n == sz-1) {
3883           tmp0  = t[*idx];
3884           sum1 -= v1[0] * tmp0;
3885           sum2 -= v2[0] * tmp0;
3886         }
3887         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3888         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3889         ibdiag   += 4; row += 2;
3890         break;
3891       case 3:
3892         v2   = a->a + ii[row+1];
3893         v3   = a->a + ii[row+2];
3894         sum1 = t[row];
3895         sum2 = t[row+1];
3896         sum3 = t[row+2];
3897         for (n = 0; n<sz-1; n+=2) {
3898           i1    = idx[0];
3899           i2    = idx[1];
3900           idx  += 2;
3901           tmp0  = t[i1];
3902           tmp1  = t[i2];
3903           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3904           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3905           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3906         }
3907 
3908         if (n == sz-1) {
3909           tmp0  = t[*idx];
3910           sum1 -= v1[0] * tmp0;
3911           sum2 -= v2[0] * tmp0;
3912           sum3 -= v3[0] * tmp0;
3913         }
3914         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3915         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3916         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3917         ibdiag   += 9; row += 3;
3918         break;
3919       case 4:
3920         v2   = a->a + ii[row+1];
3921         v3   = a->a + ii[row+2];
3922         v4   = a->a + ii[row+3];
3923         sum1 = t[row];
3924         sum2 = t[row+1];
3925         sum3 = t[row+2];
3926         sum4 = t[row+3];
3927         for (n = 0; n<sz-1; n+=2) {
3928           i1    = idx[0];
3929           i2    = idx[1];
3930           idx  += 2;
3931           tmp0  = t[i1];
3932           tmp1  = t[i2];
3933           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3934           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3935           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3936           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3937         }
3938 
3939         if (n == sz-1) {
3940           tmp0  = t[*idx];
3941           sum1 -= v1[0] * tmp0;
3942           sum2 -= v2[0] * tmp0;
3943           sum3 -= v3[0] * tmp0;
3944           sum4 -= v4[0] * tmp0;
3945         }
3946         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3947         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3948         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3949         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3950         ibdiag   += 16; row += 4;
3951         break;
3952       case 5:
3953         v2   = a->a + ii[row+1];
3954         v3   = a->a + ii[row+2];
3955         v4   = a->a + ii[row+3];
3956         v5   = a->a + ii[row+4];
3957         sum1 = t[row];
3958         sum2 = t[row+1];
3959         sum3 = t[row+2];
3960         sum4 = t[row+3];
3961         sum5 = t[row+4];
3962         for (n = 0; n<sz-1; n+=2) {
3963           i1    = idx[0];
3964           i2    = idx[1];
3965           idx  += 2;
3966           tmp0  = t[i1];
3967           tmp1  = t[i2];
3968           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3969           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3970           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3971           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3972           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3973         }
3974 
3975         if (n == sz-1) {
3976           tmp0  = t[*idx];
3977           sum1 -= v1[0] * tmp0;
3978           sum2 -= v2[0] * tmp0;
3979           sum3 -= v3[0] * tmp0;
3980           sum4 -= v4[0] * tmp0;
3981           sum5 -= v5[0] * tmp0;
3982         }
3983         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3984         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3985         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3986         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3987         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3988         ibdiag   += 25; row += 5;
3989         break;
3990       default:
3991         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3992       }
3993     }
3994     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3995   }
3996   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3997   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3998   PetscFunctionReturn(0);
3999 }
4000 
4001 #undef __FUNCT__
4002 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
4003 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4004 {
4005   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4006   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4007   const MatScalar   *bdiag = a->inode.bdiag;
4008   const PetscScalar *b;
4009   PetscErrorCode    ierr;
4010   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4011   const PetscInt    *sizes = a->inode.size;
4012 
4013   PetscFunctionBegin;
4014   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4015   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4016   cnt  = 0;
4017   for (i=0, row=0; i<m; i++) {
4018     switch (sizes[i]) {
4019     case 1:
4020       x[row] = b[row]*bdiag[cnt++];row++;
4021       break;
4022     case 2:
4023       x1       = b[row]; x2 = b[row+1];
4024       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4025       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4026       x[row++] = tmp1;
4027       x[row++] = tmp2;
4028       cnt     += 4;
4029       break;
4030     case 3:
4031       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4032       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4033       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4034       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4035       x[row++] = tmp1;
4036       x[row++] = tmp2;
4037       x[row++] = tmp3;
4038       cnt     += 9;
4039       break;
4040     case 4:
4041       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4042       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4043       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4044       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4045       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4046       x[row++] = tmp1;
4047       x[row++] = tmp2;
4048       x[row++] = tmp3;
4049       x[row++] = tmp4;
4050       cnt     += 16;
4051       break;
4052     case 5:
4053       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4054       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4055       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4056       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4057       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4058       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4059       x[row++] = tmp1;
4060       x[row++] = tmp2;
4061       x[row++] = tmp3;
4062       x[row++] = tmp4;
4063       x[row++] = tmp5;
4064       cnt     += 25;
4065       break;
4066     default:
4067       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4068     }
4069   }
4070   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4071   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4072   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4073   PetscFunctionReturn(0);
4074 }
4075 
4076 /*
4077     samestructure indicates that the matrix has not changed its nonzero structure so we
4078     do not need to recompute the inodes
4079 */
4080 #undef __FUNCT__
4081 #define __FUNCT__ "MatSeqAIJCheckInode"
4082 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4083 {
4084   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4085   PetscErrorCode ierr;
4086   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4087   PetscBool      flag;
4088   const PetscInt *idx,*idy,*ii;
4089 
4090   PetscFunctionBegin;
4091   if (!a->inode.use) PetscFunctionReturn(0);
4092   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4093 
4094   m = A->rmap->n;
4095   if (a->inode.size) ns = a->inode.size;
4096   else {
4097     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4098   }
4099 
4100   i          = 0;
4101   node_count = 0;
4102   idx        = a->j;
4103   ii         = a->i;
4104   while (i < m) {                /* For each row */
4105     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4106     /* Limits the number of elements in a node to 'a->inode.limit' */
4107     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4108       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4109       if (nzy != nzx) break;
4110       idy += nzx;              /* Same nonzero pattern */
4111       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4112       if (!flag) break;
4113     }
4114     ns[node_count++] = blk_size;
4115     idx             += blk_size*nzx;
4116     i                = j;
4117   }
4118   /* If not enough inodes found,, do not use inode version of the routines */
4119   if (!m || node_count > .8*m) {
4120     ierr = PetscFree(ns);CHKERRQ(ierr);
4121 
4122     a->inode.node_count       = 0;
4123     a->inode.size             = NULL;
4124     a->inode.use              = PETSC_FALSE;
4125     A->ops->mult              = MatMult_SeqAIJ;
4126     A->ops->sor               = MatSOR_SeqAIJ;
4127     A->ops->multadd           = MatMultAdd_SeqAIJ;
4128     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4129     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4130     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4131     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4132     A->ops->coloringpatch     = 0;
4133     A->ops->multdiagonalblock = 0;
4134 
4135     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4136   } else {
4137     if (!A->factortype) {
4138       A->ops->mult              = MatMult_SeqAIJ_Inode;
4139       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4140       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4141       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4142       if (A->rmap->n == A->cmap->n) {
4143         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4144         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4145         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4146         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4147         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4148       }
4149     } else {
4150       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4151     }
4152     a->inode.node_count = node_count;
4153     a->inode.size       = ns;
4154     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4155   }
4156   a->inode.checked          = PETSC_TRUE;
4157   a->inode.mat_nonzerostate = A->nonzerostate;
4158   PetscFunctionReturn(0);
4159 }
4160 
4161 #undef __FUNCT__
4162 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4163 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4164 {
4165   Mat            B =*C;
4166   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4167   PetscErrorCode ierr;
4168   PetscInt       m=A->rmap->n;
4169 
4170   PetscFunctionBegin;
4171   c->inode.use       = a->inode.use;
4172   c->inode.limit     = a->inode.limit;
4173   c->inode.max_limit = a->inode.max_limit;
4174   if (a->inode.size) {
4175     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4176     c->inode.node_count = a->inode.node_count;
4177     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4178     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4179     if (!B->factortype) {
4180       B->ops->mult              = MatMult_SeqAIJ_Inode;
4181       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4182       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4183       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4184       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4185       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4186       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4187       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4188       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4189     } else {
4190       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4191     }
4192   } else {
4193     c->inode.size       = 0;
4194     c->inode.node_count = 0;
4195   }
4196   c->inode.ibdiagvalid = PETSC_FALSE;
4197   c->inode.ibdiag      = 0;
4198   c->inode.bdiag       = 0;
4199   PetscFunctionReturn(0);
4200 }
4201 
4202 #undef __FUNCT__
4203 #define __FUNCT__ "MatGetRow_FactoredLU"
4204 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4205 {
4206   PetscInt       k;
4207   const PetscInt *vi;
4208 
4209   PetscFunctionBegin;
4210   vi = aj + ai[row];
4211   for (k=0; k<nzl; k++) cols[k] = vi[k];
4212   vi        = aj + adiag[row];
4213   cols[nzl] = vi[0];
4214   vi        = aj + adiag[row+1]+1;
4215   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4216   PetscFunctionReturn(0);
4217 }
4218 /*
4219    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4220    Modified from MatSeqAIJCheckInode().
4221 
4222    Input Parameters:
4223 .  Mat A - ILU or LU matrix factor
4224 
4225 */
4226 #undef __FUNCT__
4227 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU"
4228 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4229 {
4230   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4231   PetscErrorCode ierr;
4232   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4233   PetscInt       *cols1,*cols2,*ns;
4234   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4235   PetscBool      flag;
4236 
4237   PetscFunctionBegin;
4238   if (!a->inode.use)    PetscFunctionReturn(0);
4239   if (a->inode.checked) PetscFunctionReturn(0);
4240 
4241   m = A->rmap->n;
4242   if (a->inode.size) ns = a->inode.size;
4243   else {
4244     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4245   }
4246 
4247   i          = 0;
4248   node_count = 0;
4249   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4250   while (i < m) {                /* For each row */
4251     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4252     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4253     nzx  = nzl1 + nzu1 + 1;
4254     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4255 
4256     /* Limits the number of elements in a node to 'a->inode.limit' */
4257     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4258       nzl2 = ai[j+1] - ai[j];
4259       nzu2 = adiag[j] - adiag[j+1] - 1;
4260       nzy  = nzl2 + nzu2 + 1;
4261       if (nzy != nzx) break;
4262       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4263       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4264       if (!flag) break;
4265     }
4266     ns[node_count++] = blk_size;
4267     i                = j;
4268   }
4269   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4270   /* If not enough inodes found,, do not use inode version of the routines */
4271   if (!m || node_count > .8*m) {
4272     ierr = PetscFree(ns);CHKERRQ(ierr);
4273 
4274     a->inode.node_count = 0;
4275     a->inode.size       = NULL;
4276     a->inode.use        = PETSC_FALSE;
4277 
4278     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4279   } else {
4280     A->ops->mult              = 0;
4281     A->ops->sor               = 0;
4282     A->ops->multadd           = 0;
4283     A->ops->getrowij          = 0;
4284     A->ops->restorerowij      = 0;
4285     A->ops->getcolumnij       = 0;
4286     A->ops->restorecolumnij   = 0;
4287     A->ops->coloringpatch     = 0;
4288     A->ops->multdiagonalblock = 0;
4289     a->inode.node_count       = node_count;
4290     a->inode.size             = ns;
4291 
4292     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4293   }
4294   a->inode.checked = PETSC_TRUE;
4295   PetscFunctionReturn(0);
4296 }
4297 
4298 #undef __FUNCT__
4299 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4300 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4301 {
4302   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4303 
4304   PetscFunctionBegin;
4305   a->inode.ibdiagvalid = PETSC_FALSE;
4306   PetscFunctionReturn(0);
4307 }
4308 
4309 /*
4310      This is really ugly. if inodes are used this replaces the
4311   permutations with ones that correspond to rows/cols of the matrix
4312   rather then inode blocks
4313 */
4314 #undef __FUNCT__
4315 #define __FUNCT__ "MatInodeAdjustForInodes"
4316 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4317 {
4318   PetscErrorCode ierr;
4319 
4320   PetscFunctionBegin;
4321   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4322   PetscFunctionReturn(0);
4323 }
4324 
4325 #undef __FUNCT__
4326 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4327 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4328 {
4329   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4330   PetscErrorCode ierr;
4331   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4332   const PetscInt *ridx,*cidx;
4333   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4334   PetscInt       nslim_col,*ns_col;
4335   IS             ris = *rperm,cis = *cperm;
4336 
4337   PetscFunctionBegin;
4338   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4339   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4340 
4341   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4342   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4343   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4344 
4345   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4346   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4347 
4348   /* Form the inode structure for the rows of permuted matric using inv perm*/
4349   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4350 
4351   /* Construct the permutations for rows*/
4352   for (i=0,row = 0; i<nslim_row; ++i) {
4353     indx      = ridx[i];
4354     start_val = tns[indx];
4355     end_val   = tns[indx + 1];
4356     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4357   }
4358 
4359   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4360   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4361 
4362   /* Construct permutations for columns */
4363   for (i=0,col=0; i<nslim_col; ++i) {
4364     indx      = cidx[i];
4365     start_val = tns[indx];
4366     end_val   = tns[indx + 1];
4367     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4368   }
4369 
4370   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4371   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4372   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4373   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4374 
4375   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4376   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4377 
4378   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4379   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4380   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4381   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4382   ierr = PetscFree(tns);CHKERRQ(ierr);
4383   PetscFunctionReturn(0);
4384 }
4385 
4386 #undef __FUNCT__
4387 #define __FUNCT__ "MatInodeGetInodeSizes"
4388 /*@C
4389    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4390 
4391    Not Collective
4392 
4393    Input Parameter:
4394 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4395 
4396    Output Parameter:
4397 +  node_count - no of inodes present in the matrix.
4398 .  sizes      - an array of size node_count,with sizes of each inode.
4399 -  limit      - the max size used to generate the inodes.
4400 
4401    Level: advanced
4402 
4403    Notes: This routine returns some internal storage information
4404    of the matrix, it is intended to be used by advanced users.
4405    It should be called after the matrix is assembled.
4406    The contents of the sizes[] array should not be changed.
4407    NULL may be passed for information not requested.
4408 
4409 .keywords: matrix, seqaij, get, inode
4410 
4411 .seealso: MatGetInfo()
4412 @*/
4413 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4414 {
4415   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4416 
4417   PetscFunctionBegin;
4418   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4419   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4420   if (f) {
4421     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4422   }
4423   PetscFunctionReturn(0);
4424 }
4425 
4426 #undef __FUNCT__
4427 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4428 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4429 {
4430   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4431 
4432   PetscFunctionBegin;
4433   if (node_count) *node_count = a->inode.node_count;
4434   if (sizes)      *sizes      = a->inode.size;
4435   if (limit)      *limit      = a->inode.limit;
4436   PetscFunctionReturn(0);
4437 }
4438