xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 7453f77509fc006cbcc7de2dd772f60dc49feac5)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1) {
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     if (j==jmax) continue; /* empty row */
101     col  = *j++ + ishift;
102     i2   = tvc[col];
103     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
104       ia[i1+1]++;
105       ia[i2+1]++;
106       i2++;                     /* Start col of next node */
107       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
108       i2 = tvc[col];
109     }
110     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
111   }
112 
113   /* shift ia[i] to point to next row */
114   for (i1=1; i1<nslim_row+1; i1++) {
115     row        = ia[i1-1];
116     ia[i1]    += row;
117     work[i1-1] = row - oshift;
118   }
119 
120   /* allocate space for column pointers */
121   nz   = ia[nslim_row] + (!ishift);
122   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
123   *jja = ja;
124 
125   /* loop over lower triangular part putting into ja */
126   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
127     j    = aj + ai[row] + ishift;
128     jmax = aj + ai[row+1] + ishift;
129     if (j==jmax) continue; /* empty row */
130     col  = *j++ + ishift;
131     i2   = tvc[col];
132     while (i2<i1 && j<jmax) {
133       ja[work[i2]++] = i1 + oshift;
134       ja[work[i1]++] = i2 + oshift;
135       ++i2;
136       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
137       i2 = tvc[col];
138     }
139     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
140 
141   }
142   ierr = PetscFree(work);CHKERRQ(ierr);
143   ierr = PetscFree(tns);CHKERRQ(ierr);
144   ierr = PetscFree(tvc);CHKERRQ(ierr);
145   PetscFunctionReturn(0);
146 }
147 
148 /*
149       This builds nonsymmetric version of nonzero structure,
150 */
151 #undef __FUNCT__
152 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
153 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
154 {
155   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
156   PetscErrorCode ierr;
157   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
158   PetscInt       *tns,*tvc,nsz,i1,i2;
159   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1) {
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
176   }
177   /* allocate space for row pointers */
178   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
179   *iia = ia;
180   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
181   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
182 
183   /* determine the number of columns in each row */
184   ia[0] = oshift;
185   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
186     j   = aj + ai[row] + ishift;
187     nz  = ai[row+1] - ai[row];
188     if (!nz) continue; /* empty row */
189     col = *j++ + ishift;
190     i2  = tvc[col];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211   /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     nz  = ai[row+1] - ai[row];
215     if (!nz) continue; /* empty row */
216     col = *j++ + ishift;
217     i2  = tvc[col];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
235 {
236   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267   PetscFunctionReturn(0);
268 }
269 
270 /* ----------------------------------------------------------- */
271 
272 #undef __FUNCT__
273 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
274 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
275 {
276   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
277   PetscErrorCode ierr;
278   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
279   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
280 
281   PetscFunctionBegin;
282   nslim_row = a->inode.node_count;
283   n         = A->cmap->n;
284 
285   /* Create The column_inode for this matrix */
286   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
287 
288   /* allocate space for reformated column_inode structure */
289   ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr);
290   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
291   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
292 
293   for (i1=0,col=0; i1<nslim_col; ++i1) {
294     nsz = ns_col[i1];
295     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
296   }
297   /* allocate space for column pointers */
298   ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
299   *iia = ia;
300   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
301   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
302 
303   /* determine the number of columns in each row */
304   ia[0] = oshift;
305   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
306     j   = aj + ai[row] + ishift;
307     col = *j++ + ishift;
308     i2  = tvc[col];
309     nz  = ai[row+1] - ai[row];
310     while (nz-- > 0) {           /* off-diagonal elemets */
311       /* ia[i1+1]++; */
312       ia[i2+1]++;
313       i2++;
314       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
315       if (nz > 0) i2 = tvc[col];
316     }
317   }
318 
319   /* shift ia[i] to point to next col */
320   for (i1=1; i1<nslim_col+1; i1++) {
321     col        = ia[i1-1];
322     ia[i1]    += col;
323     work[i1-1] = col - oshift;
324   }
325 
326   /* allocate space for column pointers */
327   nz   = ia[nslim_col] + (!ishift);
328   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
329   *jja = ja;
330 
331   /* loop over matrix putting into ja */
332   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
333     j   = aj + ai[row] + ishift;
334     col = *j++ + ishift;
335     i2  = tvc[col];
336     nz  = ai[row+1] - ai[row];
337     while (nz-- > 0) {
338       /* ja[work[i1]++] = i2 + oshift; */
339       ja[work[i2]++] = i1 + oshift;
340       i2++;
341       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
342       if (nz > 0) i2 = tvc[col];
343     }
344   }
345   ierr = PetscFree(ns_col);CHKERRQ(ierr);
346   ierr = PetscFree(work);CHKERRQ(ierr);
347   ierr = PetscFree(tns);CHKERRQ(ierr);
348   ierr = PetscFree(tvc);CHKERRQ(ierr);
349   PetscFunctionReturn(0);
350 }
351 
352 #undef __FUNCT__
353 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
354 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
355 {
356   PetscErrorCode ierr;
357 
358   PetscFunctionBegin;
359   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
360   if (!ia) PetscFunctionReturn(0);
361 
362   if (!blockcompressed) {
363     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
364   } else if (symmetric) {
365     /* Since the indices are symmetric it does'nt matter */
366     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
367   } else {
368     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
369   }
370   PetscFunctionReturn(0);
371 }
372 
373 #undef __FUNCT__
374 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
375 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
376 {
377   PetscErrorCode ierr;
378 
379   PetscFunctionBegin;
380   if (!ia) PetscFunctionReturn(0);
381   if (!blockcompressed) {
382     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
383   } else {
384     ierr = PetscFree(*ia);CHKERRQ(ierr);
385     ierr = PetscFree(*ja);CHKERRQ(ierr);
386   }
387   PetscFunctionReturn(0);
388 }
389 
390 /* ----------------------------------------------------------- */
391 
392 #undef __FUNCT__
393 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
394 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
395 {
396   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
397   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
398   PetscScalar       *y;
399   const PetscScalar *x;
400   const MatScalar   *v1,*v2,*v3,*v4,*v5;
401   PetscErrorCode    ierr;
402   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
403   const PetscInt    *idx,*ns,*ii;
404 
405 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
406 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
407 #endif
408 
409   PetscFunctionBegin;
410   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
411   node_max = a->inode.node_count;
412   ns       = a->inode.size;     /* Node Size array */
413   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
414   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
415   idx      = a->j;
416   v1       = a->a;
417   ii       = a->i;
418 
419   for (i = 0,row = 0; i< node_max; ++i) {
420     nsz         = ns[i];
421     n           = ii[1] - ii[0];
422     nonzerorow += (n>0)*nsz;
423     ii         += nsz;
424     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
425     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
426     sz = n;                     /* No of non zeros in this row */
427                                 /* Switch on the size of Node */
428     switch (nsz) {               /* Each loop in 'case' is unrolled */
429     case 1:
430       sum1 = 0.;
431 
432       for (n = 0; n< sz-1; n+=2) {
433         i1    = idx[0];         /* The instructions are ordered to */
434         i2    = idx[1];         /* make the compiler's job easy */
435         idx  += 2;
436         tmp0  = x[i1];
437         tmp1  = x[i2];
438         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439       }
440 
441       if (n == sz-1) {          /* Take care of the last nonzero  */
442         tmp0  = x[*idx++];
443         sum1 += *v1++ *tmp0;
444       }
445       y[row++]=sum1;
446       break;
447     case 2:
448       sum1 = 0.;
449       sum2 = 0.;
450       v2   = v1 + n;
451 
452       for (n = 0; n< sz-1; n+=2) {
453         i1    = idx[0];
454         i2    = idx[1];
455         idx  += 2;
456         tmp0  = x[i1];
457         tmp1  = x[i2];
458         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460       }
461       if (n == sz-1) {
462         tmp0  = x[*idx++];
463         sum1 += *v1++ * tmp0;
464         sum2 += *v2++ * tmp0;
465       }
466       y[row++]=sum1;
467       y[row++]=sum2;
468       v1      =v2;              /* Since the next block to be processed starts there*/
469       idx    +=sz;
470       break;
471     case 3:
472       sum1 = 0.;
473       sum2 = 0.;
474       sum3 = 0.;
475       v2   = v1 + n;
476       v3   = v2 + n;
477 
478       for (n = 0; n< sz-1; n+=2) {
479         i1    = idx[0];
480         i2    = idx[1];
481         idx  += 2;
482         tmp0  = x[i1];
483         tmp1  = x[i2];
484         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487       }
488       if (n == sz-1) {
489         tmp0  = x[*idx++];
490         sum1 += *v1++ * tmp0;
491         sum2 += *v2++ * tmp0;
492         sum3 += *v3++ * tmp0;
493       }
494       y[row++]=sum1;
495       y[row++]=sum2;
496       y[row++]=sum3;
497       v1      =v3;              /* Since the next block to be processed starts there*/
498       idx    +=2*sz;
499       break;
500     case 4:
501       sum1 = 0.;
502       sum2 = 0.;
503       sum3 = 0.;
504       sum4 = 0.;
505       v2   = v1 + n;
506       v3   = v2 + n;
507       v4   = v3 + n;
508 
509       for (n = 0; n< sz-1; n+=2) {
510         i1    = idx[0];
511         i2    = idx[1];
512         idx  += 2;
513         tmp0  = x[i1];
514         tmp1  = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519       }
520       if (n == sz-1) {
521         tmp0  = x[*idx++];
522         sum1 += *v1++ * tmp0;
523         sum2 += *v2++ * tmp0;
524         sum3 += *v3++ * tmp0;
525         sum4 += *v4++ * tmp0;
526       }
527       y[row++]=sum1;
528       y[row++]=sum2;
529       y[row++]=sum3;
530       y[row++]=sum4;
531       v1      =v4;              /* Since the next block to be processed starts there*/
532       idx    +=3*sz;
533       break;
534     case 5:
535       sum1 = 0.;
536       sum2 = 0.;
537       sum3 = 0.;
538       sum4 = 0.;
539       sum5 = 0.;
540       v2   = v1 + n;
541       v3   = v2 + n;
542       v4   = v3 + n;
543       v5   = v4 + n;
544 
545       for (n = 0; n<sz-1; n+=2) {
546         i1    = idx[0];
547         i2    = idx[1];
548         idx  += 2;
549         tmp0  = x[i1];
550         tmp1  = x[i2];
551         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556       }
557       if (n == sz-1) {
558         tmp0  = x[*idx++];
559         sum1 += *v1++ * tmp0;
560         sum2 += *v2++ * tmp0;
561         sum3 += *v3++ * tmp0;
562         sum4 += *v4++ * tmp0;
563         sum5 += *v5++ * tmp0;
564       }
565       y[row++]=sum1;
566       y[row++]=sum2;
567       y[row++]=sum3;
568       y[row++]=sum4;
569       y[row++]=sum5;
570       v1      =v5;       /* Since the next block to be processed starts there */
571       idx    +=4*sz;
572       break;
573     default:
574       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
575     }
576   }
577   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
578   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
579   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
580   PetscFunctionReturn(0);
581 }
582 /* ----------------------------------------------------------- */
583 /* Almost same code as the MatMult_SeqAIJ_Inode() */
584 #undef __FUNCT__
585 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
586 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587 {
588   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
589   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590   const MatScalar   *v1,*v2,*v3,*v4,*v5;
591   const PetscScalar *x;
592   PetscScalar       *y,*z,*zt;
593   PetscErrorCode    ierr;
594   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
595   const PetscInt    *idx,*ns,*ii;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601 
602   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
603   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
604   zt = z;
605 
606   idx = a->j;
607   v1  = a->a;
608   ii  = a->i;
609 
610   for (i = 0,row = 0; i< node_max; ++i) {
611     nsz = ns[i];
612     n   = ii[1] - ii[0];
613     ii += nsz;
614     sz  = n;                    /* No of non zeros in this row */
615                                 /* Switch on the size of Node */
616     switch (nsz) {               /* Each loop in 'case' is unrolled */
617     case 1:
618       sum1 = *zt++;
619 
620       for (n = 0; n< sz-1; n+=2) {
621         i1    = idx[0];         /* The instructions are ordered to */
622         i2    = idx[1];         /* make the compiler's job easy */
623         idx  += 2;
624         tmp0  = x[i1];
625         tmp1  = x[i2];
626         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627       }
628 
629       if (n   == sz-1) {          /* Take care of the last nonzero  */
630         tmp0  = x[*idx++];
631         sum1 += *v1++ * tmp0;
632       }
633       y[row++]=sum1;
634       break;
635     case 2:
636       sum1 = *zt++;
637       sum2 = *zt++;
638       v2   = v1 + n;
639 
640       for (n = 0; n< sz-1; n+=2) {
641         i1    = idx[0];
642         i2    = idx[1];
643         idx  += 2;
644         tmp0  = x[i1];
645         tmp1  = x[i2];
646         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
647         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
648       }
649       if (n   == sz-1) {
650         tmp0  = x[*idx++];
651         sum1 += *v1++ * tmp0;
652         sum2 += *v2++ * tmp0;
653       }
654       y[row++]=sum1;
655       y[row++]=sum2;
656       v1      =v2;              /* Since the next block to be processed starts there*/
657       idx    +=sz;
658       break;
659     case 3:
660       sum1 = *zt++;
661       sum2 = *zt++;
662       sum3 = *zt++;
663       v2   = v1 + n;
664       v3   = v2 + n;
665 
666       for (n = 0; n< sz-1; n+=2) {
667         i1    = idx[0];
668         i2    = idx[1];
669         idx  += 2;
670         tmp0  = x[i1];
671         tmp1  = x[i2];
672         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
673         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
674         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
675       }
676       if (n == sz-1) {
677         tmp0  = x[*idx++];
678         sum1 += *v1++ * tmp0;
679         sum2 += *v2++ * tmp0;
680         sum3 += *v3++ * tmp0;
681       }
682       y[row++]=sum1;
683       y[row++]=sum2;
684       y[row++]=sum3;
685       v1      =v3;              /* Since the next block to be processed starts there*/
686       idx    +=2*sz;
687       break;
688     case 4:
689       sum1 = *zt++;
690       sum2 = *zt++;
691       sum3 = *zt++;
692       sum4 = *zt++;
693       v2   = v1 + n;
694       v3   = v2 + n;
695       v4   = v3 + n;
696 
697       for (n = 0; n< sz-1; n+=2) {
698         i1    = idx[0];
699         i2    = idx[1];
700         idx  += 2;
701         tmp0  = x[i1];
702         tmp1  = x[i2];
703         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
704         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
705         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
706         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
707       }
708       if (n == sz-1) {
709         tmp0  = x[*idx++];
710         sum1 += *v1++ * tmp0;
711         sum2 += *v2++ * tmp0;
712         sum3 += *v3++ * tmp0;
713         sum4 += *v4++ * tmp0;
714       }
715       y[row++]=sum1;
716       y[row++]=sum2;
717       y[row++]=sum3;
718       y[row++]=sum4;
719       v1      =v4;              /* Since the next block to be processed starts there*/
720       idx    +=3*sz;
721       break;
722     case 5:
723       sum1 = *zt++;
724       sum2 = *zt++;
725       sum3 = *zt++;
726       sum4 = *zt++;
727       sum5 = *zt++;
728       v2   = v1 + n;
729       v3   = v2 + n;
730       v4   = v3 + n;
731       v5   = v4 + n;
732 
733       for (n = 0; n<sz-1; n+=2) {
734         i1    = idx[0];
735         i2    = idx[1];
736         idx  += 2;
737         tmp0  = x[i1];
738         tmp1  = x[i2];
739         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
740         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
741         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
742         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
743         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
744       }
745       if (n   == sz-1) {
746         tmp0  = x[*idx++];
747         sum1 += *v1++ * tmp0;
748         sum2 += *v2++ * tmp0;
749         sum3 += *v3++ * tmp0;
750         sum4 += *v4++ * tmp0;
751         sum5 += *v5++ * tmp0;
752       }
753       y[row++]=sum1;
754       y[row++]=sum2;
755       y[row++]=sum3;
756       y[row++]=sum4;
757       y[row++]=sum5;
758       v1      =v5;       /* Since the next block to be processed starts there */
759       idx    +=4*sz;
760       break;
761     default:
762       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
763     }
764   }
765   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
766   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
767   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
768   PetscFunctionReturn(0);
769 }
770 
771 /* ----------------------------------------------------------- */
772 #undef __FUNCT__
773 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
774 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
775 {
776   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
777   IS                iscol = a->col,isrow = a->row;
778   PetscErrorCode    ierr;
779   const PetscInt    *r,*c,*rout,*cout;
780   PetscInt          i,j,n = A->rmap->n,nz;
781   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
782   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
783   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
784   PetscScalar       sum1,sum2,sum3,sum4,sum5;
785   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
786   const PetscScalar *b;
787 
788   PetscFunctionBegin;
789   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
790   node_max = a->inode.node_count;
791   ns       = a->inode.size;     /* Node Size array */
792 
793   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
794   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
795   tmp  = a->solve_work;
796 
797   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
798   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
799 
800   /* forward solve the lower triangular */
801   tmps = tmp;
802   aa   = a_a;
803   aj   = a_j;
804   ad   = a->diag;
805 
806   for (i = 0,row = 0; i< node_max; ++i) {
807     nsz = ns[i];
808     aii = ai[row];
809     v1  = aa + aii;
810     vi  = aj + aii;
811     nz  = ad[row]- aii;
812     if (i < node_max-1) {
813       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
814       * but our indexing to determine it's size could. */
815       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
816       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
817       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
818       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
819     }
820 
821     switch (nsz) {               /* Each loop in 'case' is unrolled */
822     case 1:
823       sum1 = b[*r++];
824       for (j=0; j<nz-1; j+=2) {
825         i0    = vi[0];
826         i1    = vi[1];
827         vi   +=2;
828         tmp0  = tmps[i0];
829         tmp1  = tmps[i1];
830         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
831       }
832       if (j == nz-1) {
833         tmp0  = tmps[*vi++];
834         sum1 -= *v1++ *tmp0;
835       }
836       tmp[row++]=sum1;
837       break;
838     case 2:
839       sum1 = b[*r++];
840       sum2 = b[*r++];
841       v2   = aa + ai[row+1];
842 
843       for (j=0; j<nz-1; j+=2) {
844         i0    = vi[0];
845         i1    = vi[1];
846         vi   +=2;
847         tmp0  = tmps[i0];
848         tmp1  = tmps[i1];
849         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
850         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
851       }
852       if (j == nz-1) {
853         tmp0  = tmps[*vi++];
854         sum1 -= *v1++ *tmp0;
855         sum2 -= *v2++ *tmp0;
856       }
857       sum2     -= *v2++ *sum1;
858       tmp[row++]=sum1;
859       tmp[row++]=sum2;
860       break;
861     case 3:
862       sum1 = b[*r++];
863       sum2 = b[*r++];
864       sum3 = b[*r++];
865       v2   = aa + ai[row+1];
866       v3   = aa + ai[row+2];
867 
868       for (j=0; j<nz-1; j+=2) {
869         i0    = vi[0];
870         i1    = vi[1];
871         vi   +=2;
872         tmp0  = tmps[i0];
873         tmp1  = tmps[i1];
874         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
875         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
876         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
877       }
878       if (j == nz-1) {
879         tmp0  = tmps[*vi++];
880         sum1 -= *v1++ *tmp0;
881         sum2 -= *v2++ *tmp0;
882         sum3 -= *v3++ *tmp0;
883       }
884       sum2 -= *v2++ * sum1;
885       sum3 -= *v3++ * sum1;
886       sum3 -= *v3++ * sum2;
887 
888       tmp[row++]=sum1;
889       tmp[row++]=sum2;
890       tmp[row++]=sum3;
891       break;
892 
893     case 4:
894       sum1 = b[*r++];
895       sum2 = b[*r++];
896       sum3 = b[*r++];
897       sum4 = b[*r++];
898       v2   = aa + ai[row+1];
899       v3   = aa + ai[row+2];
900       v4   = aa + ai[row+3];
901 
902       for (j=0; j<nz-1; j+=2) {
903         i0    = vi[0];
904         i1    = vi[1];
905         vi   +=2;
906         tmp0  = tmps[i0];
907         tmp1  = tmps[i1];
908         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
909         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
910         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
911         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
912       }
913       if (j == nz-1) {
914         tmp0  = tmps[*vi++];
915         sum1 -= *v1++ *tmp0;
916         sum2 -= *v2++ *tmp0;
917         sum3 -= *v3++ *tmp0;
918         sum4 -= *v4++ *tmp0;
919       }
920       sum2 -= *v2++ * sum1;
921       sum3 -= *v3++ * sum1;
922       sum4 -= *v4++ * sum1;
923       sum3 -= *v3++ * sum2;
924       sum4 -= *v4++ * sum2;
925       sum4 -= *v4++ * sum3;
926 
927       tmp[row++]=sum1;
928       tmp[row++]=sum2;
929       tmp[row++]=sum3;
930       tmp[row++]=sum4;
931       break;
932     case 5:
933       sum1 = b[*r++];
934       sum2 = b[*r++];
935       sum3 = b[*r++];
936       sum4 = b[*r++];
937       sum5 = b[*r++];
938       v2   = aa + ai[row+1];
939       v3   = aa + ai[row+2];
940       v4   = aa + ai[row+3];
941       v5   = aa + ai[row+4];
942 
943       for (j=0; j<nz-1; j+=2) {
944         i0    = vi[0];
945         i1    = vi[1];
946         vi   +=2;
947         tmp0  = tmps[i0];
948         tmp1  = tmps[i1];
949         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
950         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
951         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
952         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
953         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
954       }
955       if (j == nz-1) {
956         tmp0  = tmps[*vi++];
957         sum1 -= *v1++ *tmp0;
958         sum2 -= *v2++ *tmp0;
959         sum3 -= *v3++ *tmp0;
960         sum4 -= *v4++ *tmp0;
961         sum5 -= *v5++ *tmp0;
962       }
963 
964       sum2 -= *v2++ * sum1;
965       sum3 -= *v3++ * sum1;
966       sum4 -= *v4++ * sum1;
967       sum5 -= *v5++ * sum1;
968       sum3 -= *v3++ * sum2;
969       sum4 -= *v4++ * sum2;
970       sum5 -= *v5++ * sum2;
971       sum4 -= *v4++ * sum3;
972       sum5 -= *v5++ * sum3;
973       sum5 -= *v5++ * sum4;
974 
975       tmp[row++]=sum1;
976       tmp[row++]=sum2;
977       tmp[row++]=sum3;
978       tmp[row++]=sum4;
979       tmp[row++]=sum5;
980       break;
981     default:
982       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
983     }
984   }
985   /* backward solve the upper triangular */
986   for (i=node_max -1,row = n-1; i>=0; i--) {
987     nsz = ns[i];
988     aii = ai[row+1] -1;
989     v1  = aa + aii;
990     vi  = aj + aii;
991     nz  = aii- ad[row];
992     switch (nsz) {               /* Each loop in 'case' is unrolled */
993     case 1:
994       sum1 = tmp[row];
995 
996       for (j=nz; j>1; j-=2) {
997         vi   -=2;
998         i0    = vi[2];
999         i1    = vi[1];
1000         tmp0  = tmps[i0];
1001         tmp1  = tmps[i1];
1002         v1   -= 2;
1003         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1004       }
1005       if (j==1) {
1006         tmp0  = tmps[*vi--];
1007         sum1 -= *v1-- * tmp0;
1008       }
1009       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1010       break;
1011     case 2:
1012       sum1 = tmp[row];
1013       sum2 = tmp[row -1];
1014       v2   = aa + ai[row]-1;
1015       for (j=nz; j>1; j-=2) {
1016         vi   -=2;
1017         i0    = vi[2];
1018         i1    = vi[1];
1019         tmp0  = tmps[i0];
1020         tmp1  = tmps[i1];
1021         v1   -= 2;
1022         v2   -= 2;
1023         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1024         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1025       }
1026       if (j==1) {
1027         tmp0  = tmps[*vi--];
1028         sum1 -= *v1-- * tmp0;
1029         sum2 -= *v2-- * tmp0;
1030       }
1031 
1032       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1033       sum2   -= *v2-- * tmp0;
1034       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1035       break;
1036     case 3:
1037       sum1 = tmp[row];
1038       sum2 = tmp[row -1];
1039       sum3 = tmp[row -2];
1040       v2   = aa + ai[row]-1;
1041       v3   = aa + ai[row -1]-1;
1042       for (j=nz; j>1; j-=2) {
1043         vi   -=2;
1044         i0    = vi[2];
1045         i1    = vi[1];
1046         tmp0  = tmps[i0];
1047         tmp1  = tmps[i1];
1048         v1   -= 2;
1049         v2   -= 2;
1050         v3   -= 2;
1051         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1052         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1053         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1054       }
1055       if (j==1) {
1056         tmp0  = tmps[*vi--];
1057         sum1 -= *v1-- * tmp0;
1058         sum2 -= *v2-- * tmp0;
1059         sum3 -= *v3-- * tmp0;
1060       }
1061       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1062       sum2   -= *v2-- * tmp0;
1063       sum3   -= *v3-- * tmp0;
1064       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1065       sum3   -= *v3-- * tmp0;
1066       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1067 
1068       break;
1069     case 4:
1070       sum1 = tmp[row];
1071       sum2 = tmp[row -1];
1072       sum3 = tmp[row -2];
1073       sum4 = tmp[row -3];
1074       v2   = aa + ai[row]-1;
1075       v3   = aa + ai[row -1]-1;
1076       v4   = aa + ai[row -2]-1;
1077 
1078       for (j=nz; j>1; j-=2) {
1079         vi   -=2;
1080         i0    = vi[2];
1081         i1    = vi[1];
1082         tmp0  = tmps[i0];
1083         tmp1  = tmps[i1];
1084         v1   -= 2;
1085         v2   -= 2;
1086         v3   -= 2;
1087         v4   -= 2;
1088         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1089         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1090         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1091         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1092       }
1093       if (j==1) {
1094         tmp0  = tmps[*vi--];
1095         sum1 -= *v1-- * tmp0;
1096         sum2 -= *v2-- * tmp0;
1097         sum3 -= *v3-- * tmp0;
1098         sum4 -= *v4-- * tmp0;
1099       }
1100 
1101       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1102       sum2   -= *v2-- * tmp0;
1103       sum3   -= *v3-- * tmp0;
1104       sum4   -= *v4-- * tmp0;
1105       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1106       sum3   -= *v3-- * tmp0;
1107       sum4   -= *v4-- * tmp0;
1108       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1109       sum4   -= *v4-- * tmp0;
1110       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1111       break;
1112     case 5:
1113       sum1 = tmp[row];
1114       sum2 = tmp[row -1];
1115       sum3 = tmp[row -2];
1116       sum4 = tmp[row -3];
1117       sum5 = tmp[row -4];
1118       v2   = aa + ai[row]-1;
1119       v3   = aa + ai[row -1]-1;
1120       v4   = aa + ai[row -2]-1;
1121       v5   = aa + ai[row -3]-1;
1122       for (j=nz; j>1; j-=2) {
1123         vi   -= 2;
1124         i0    = vi[2];
1125         i1    = vi[1];
1126         tmp0  = tmps[i0];
1127         tmp1  = tmps[i1];
1128         v1   -= 2;
1129         v2   -= 2;
1130         v3   -= 2;
1131         v4   -= 2;
1132         v5   -= 2;
1133         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1134         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1135         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1136         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1137         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1138       }
1139       if (j==1) {
1140         tmp0  = tmps[*vi--];
1141         sum1 -= *v1-- * tmp0;
1142         sum2 -= *v2-- * tmp0;
1143         sum3 -= *v3-- * tmp0;
1144         sum4 -= *v4-- * tmp0;
1145         sum5 -= *v5-- * tmp0;
1146       }
1147 
1148       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1149       sum2   -= *v2-- * tmp0;
1150       sum3   -= *v3-- * tmp0;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1154       sum3   -= *v3-- * tmp0;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1158       sum4   -= *v4-- * tmp0;
1159       sum5   -= *v5-- * tmp0;
1160       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1161       sum5   -= *v5-- * tmp0;
1162       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1163       break;
1164     default:
1165       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1166     }
1167   }
1168   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1169   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1170   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 #undef __FUNCT__
1177 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1178 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1179 {
1180   Mat             C     =B;
1181   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1182   IS              isrow = b->row,isicol = b->icol;
1183   PetscErrorCode  ierr;
1184   const PetscInt  *r,*ic,*ics;
1185   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1186   PetscInt        i,j,k,nz,nzL,row,*pj;
1187   const PetscInt  *ajtmp,*bjtmp;
1188   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1189   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1190   FactorShiftCtx  sctx;
1191   const PetscInt  *ddiag;
1192   PetscReal       rs;
1193   MatScalar       d;
1194   PetscInt        inod,nodesz,node_max,col;
1195   const PetscInt  *ns;
1196   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1197 
1198   PetscFunctionBegin;
1199   /* MatPivotSetUp(): initialize shift context sctx */
1200   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1201 
1202   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1203     ddiag          = a->diag;
1204     sctx.shift_top = info->zeropivot;
1205     for (i=0; i<n; i++) {
1206       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1207       d  = (aa)[ddiag[i]];
1208       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1209       v  = aa+ai[i];
1210       nz = ai[i+1] - ai[i];
1211       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1212       if (rs>sctx.shift_top) sctx.shift_top = rs;
1213     }
1214     sctx.shift_top *= 1.1;
1215     sctx.nshift_max = 5;
1216     sctx.shift_lo   = 0.;
1217     sctx.shift_hi   = 1.;
1218   }
1219 
1220   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1221   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1222 
1223   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1224   ics   = ic;
1225 
1226   node_max = a->inode.node_count;
1227   ns       = a->inode.size;
1228   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1229 
1230   /* If max inode size > 4, split it into two inodes.*/
1231   /* also map the inode sizes according to the ordering */
1232   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1233   for (i=0,j=0; i<node_max; ++i,++j) {
1234     if (ns[i] > 4) {
1235       tmp_vec1[j] = 4;
1236       ++j;
1237       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1238     } else {
1239       tmp_vec1[j] = ns[i];
1240     }
1241   }
1242   /* Use the correct node_max */
1243   node_max = j;
1244 
1245   /* Now reorder the inode info based on mat re-ordering info */
1246   /* First create a row -> inode_size_array_index map */
1247   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1248   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1249   for (i=0,row=0; i<node_max; i++) {
1250     nodesz = tmp_vec1[i];
1251     for (j=0; j<nodesz; j++,row++) {
1252       nsmap[row] = i;
1253     }
1254   }
1255   /* Using nsmap, create a reordered ns structure */
1256   for (i=0,j=0; i< node_max; i++) {
1257     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1258     tmp_vec2[i] = nodesz;
1259     j          += nodesz;
1260   }
1261   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1262   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1263 
1264   /* Now use the correct ns */
1265   ns = tmp_vec2;
1266 
1267   do {
1268     sctx.newshift = PETSC_FALSE;
1269     /* Now loop over each block-row, and do the factorization */
1270     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1271       nodesz = ns[inod];
1272 
1273       switch (nodesz) {
1274       case 1:
1275         /*----------*/
1276         /* zero rtmp1 */
1277         /* L part */
1278         nz    = bi[i+1] - bi[i];
1279         bjtmp = bj + bi[i];
1280         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1281 
1282         /* U part */
1283         nz    = bdiag[i]-bdiag[i+1];
1284         bjtmp = bj + bdiag[i+1]+1;
1285         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1286 
1287         /* load in initial (unfactored row) */
1288         nz    = ai[r[i]+1] - ai[r[i]];
1289         ajtmp = aj + ai[r[i]];
1290         v     = aa + ai[r[i]];
1291         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1292 
1293         /* ZeropivotApply() */
1294         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1295 
1296         /* elimination */
1297         bjtmp = bj + bi[i];
1298         row   = *bjtmp++;
1299         nzL   = bi[i+1] - bi[i];
1300         for (k=0; k < nzL; k++) {
1301           pc = rtmp1 + row;
1302           if (*pc != 0.0) {
1303             pv   = b->a + bdiag[row];
1304             mul1 = *pc * (*pv);
1305             *pc  = mul1;
1306             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1307             pv   = b->a + bdiag[row+1]+1;
1308             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1309             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1310             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1311           }
1312           row = *bjtmp++;
1313         }
1314 
1315         /* finished row so stick it into b->a */
1316         rs = 0.0;
1317         /* L part */
1318         pv = b->a + bi[i];
1319         pj = b->j + bi[i];
1320         nz = bi[i+1] - bi[i];
1321         for (j=0; j<nz; j++) {
1322           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1323         }
1324 
1325         /* U part */
1326         pv = b->a + bdiag[i+1]+1;
1327         pj = b->j + bdiag[i+1]+1;
1328         nz = bdiag[i] - bdiag[i+1]-1;
1329         for (j=0; j<nz; j++) {
1330           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1331         }
1332 
1333         /* Check zero pivot */
1334         sctx.rs = rs;
1335         sctx.pv = rtmp1[i];
1336         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1337         if (sctx.newshift) break;
1338 
1339         /* Mark diagonal and invert diagonal for simplier triangular solves */
1340         pv  = b->a + bdiag[i];
1341         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1342         break;
1343 
1344       case 2:
1345         /*----------*/
1346         /* zero rtmp1 and rtmp2 */
1347         /* L part */
1348         nz    = bi[i+1] - bi[i];
1349         bjtmp = bj + bi[i];
1350         for  (j=0; j<nz; j++) {
1351           col        = bjtmp[j];
1352           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1353         }
1354 
1355         /* U part */
1356         nz    = bdiag[i]-bdiag[i+1];
1357         bjtmp = bj + bdiag[i+1]+1;
1358         for  (j=0; j<nz; j++) {
1359           col        = bjtmp[j];
1360           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1361         }
1362 
1363         /* load in initial (unfactored row) */
1364         nz    = ai[r[i]+1] - ai[r[i]];
1365         ajtmp = aj + ai[r[i]];
1366         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1367         for (j=0; j<nz; j++) {
1368           col        = ics[ajtmp[j]];
1369           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1370         }
1371         /* ZeropivotApply(): shift the diagonal of the matrix  */
1372         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1373 
1374         /* elimination */
1375         bjtmp = bj + bi[i];
1376         row   = *bjtmp++; /* pivot row */
1377         nzL   = bi[i+1] - bi[i];
1378         for (k=0; k < nzL; k++) {
1379           pc1 = rtmp1 + row;
1380           pc2 = rtmp2 + row;
1381           if (*pc1 != 0.0 || *pc2 != 0.0) {
1382             pv   = b->a + bdiag[row];
1383             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1384             *pc1 = mul1;       *pc2 = mul2;
1385 
1386             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1387             pv = b->a + bdiag[row+1]+1;
1388             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1389             for (j=0; j<nz; j++) {
1390               col         = pj[j];
1391               rtmp1[col] -= mul1 * pv[j];
1392               rtmp2[col] -= mul2 * pv[j];
1393             }
1394             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1395           }
1396           row = *bjtmp++;
1397         }
1398 
1399         /* finished row i; check zero pivot, then stick row i into b->a */
1400         rs = 0.0;
1401         /* L part */
1402         pc1 = b->a + bi[i];
1403         pj  = b->j + bi[i];
1404         nz  = bi[i+1] - bi[i];
1405         for (j=0; j<nz; j++) {
1406           col    = pj[j];
1407           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1408         }
1409         /* U part */
1410         pc1 = b->a + bdiag[i+1]+1;
1411         pj  = b->j + bdiag[i+1]+1;
1412         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1413         for (j=0; j<nz; j++) {
1414           col    = pj[j];
1415           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1416         }
1417 
1418         sctx.rs = rs;
1419         sctx.pv = rtmp1[i];
1420         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1421         if (sctx.newshift) break;
1422         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1423         *pc1 = 1.0/sctx.pv;
1424 
1425         /* Now take care of diagonal 2x2 block. */
1426         pc2 = rtmp2 + i;
1427         if (*pc2 != 0.0) {
1428           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1429           *pc2 = mul1;          /* insert L entry */
1430           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1431           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1432           for (j=0; j<nz; j++) {
1433             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1434           }
1435           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1436         }
1437 
1438         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1439         rs = 0.0;
1440         /* L part */
1441         pc2 = b->a + bi[i+1];
1442         pj  = b->j + bi[i+1];
1443         nz  = bi[i+2] - bi[i+1];
1444         for (j=0; j<nz; j++) {
1445           col    = pj[j];
1446           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1447         }
1448         /* U part */
1449         pc2 = b->a + bdiag[i+2]+1;
1450         pj  = b->j + bdiag[i+2]+1;
1451         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1452         for (j=0; j<nz; j++) {
1453           col    = pj[j];
1454           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1455         }
1456 
1457         sctx.rs = rs;
1458         sctx.pv = rtmp2[i+1];
1459         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1460         if (sctx.newshift) break;
1461         pc2  = b->a + bdiag[i+1];
1462         *pc2 = 1.0/sctx.pv;
1463         break;
1464 
1465       case 3:
1466         /*----------*/
1467         /* zero rtmp */
1468         /* L part */
1469         nz    = bi[i+1] - bi[i];
1470         bjtmp = bj + bi[i];
1471         for  (j=0; j<nz; j++) {
1472           col        = bjtmp[j];
1473           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1474         }
1475 
1476         /* U part */
1477         nz    = bdiag[i]-bdiag[i+1];
1478         bjtmp = bj + bdiag[i+1]+1;
1479         for  (j=0; j<nz; j++) {
1480           col        = bjtmp[j];
1481           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1482         }
1483 
1484         /* load in initial (unfactored row) */
1485         nz    = ai[r[i]+1] - ai[r[i]];
1486         ajtmp = aj + ai[r[i]];
1487         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1488         for (j=0; j<nz; j++) {
1489           col        = ics[ajtmp[j]];
1490           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1491         }
1492         /* ZeropivotApply(): shift the diagonal of the matrix  */
1493         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1494 
1495         /* elimination */
1496         bjtmp = bj + bi[i];
1497         row   = *bjtmp++; /* pivot row */
1498         nzL   = bi[i+1] - bi[i];
1499         for (k=0; k < nzL; k++) {
1500           pc1 = rtmp1 + row;
1501           pc2 = rtmp2 + row;
1502           pc3 = rtmp3 + row;
1503           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1504             pv   = b->a + bdiag[row];
1505             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1506             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1507 
1508             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1509             pv = b->a + bdiag[row+1]+1;
1510             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1511             for (j=0; j<nz; j++) {
1512               col         = pj[j];
1513               rtmp1[col] -= mul1 * pv[j];
1514               rtmp2[col] -= mul2 * pv[j];
1515               rtmp3[col] -= mul3 * pv[j];
1516             }
1517             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1518           }
1519           row = *bjtmp++;
1520         }
1521 
1522         /* finished row i; check zero pivot, then stick row i into b->a */
1523         rs = 0.0;
1524         /* L part */
1525         pc1 = b->a + bi[i];
1526         pj  = b->j + bi[i];
1527         nz  = bi[i+1] - bi[i];
1528         for (j=0; j<nz; j++) {
1529           col    = pj[j];
1530           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1531         }
1532         /* U part */
1533         pc1 = b->a + bdiag[i+1]+1;
1534         pj  = b->j + bdiag[i+1]+1;
1535         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1536         for (j=0; j<nz; j++) {
1537           col    = pj[j];
1538           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1539         }
1540 
1541         sctx.rs = rs;
1542         sctx.pv = rtmp1[i];
1543         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1544         if (sctx.newshift) break;
1545         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1546         *pc1 = 1.0/sctx.pv;
1547 
1548         /* Now take care of 1st column of diagonal 3x3 block. */
1549         pc2 = rtmp2 + i;
1550         pc3 = rtmp3 + i;
1551         if (*pc2 != 0.0 || *pc3 != 0.0) {
1552           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1553           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1554           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1555           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1556           for (j=0; j<nz; j++) {
1557             col         = pj[j];
1558             rtmp2[col] -= mul2 * rtmp1[col];
1559             rtmp3[col] -= mul3 * rtmp1[col];
1560           }
1561           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1562         }
1563 
1564         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1565         rs = 0.0;
1566         /* L part */
1567         pc2 = b->a + bi[i+1];
1568         pj  = b->j + bi[i+1];
1569         nz  = bi[i+2] - bi[i+1];
1570         for (j=0; j<nz; j++) {
1571           col    = pj[j];
1572           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1573         }
1574         /* U part */
1575         pc2 = b->a + bdiag[i+2]+1;
1576         pj  = b->j + bdiag[i+2]+1;
1577         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1578         for (j=0; j<nz; j++) {
1579           col    = pj[j];
1580           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1581         }
1582 
1583         sctx.rs = rs;
1584         sctx.pv = rtmp2[i+1];
1585         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1586         if (sctx.newshift) break;
1587         pc2  = b->a + bdiag[i+1];
1588         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1589 
1590         /* Now take care of 2nd column of diagonal 3x3 block. */
1591         pc3 = rtmp3 + i+1;
1592         if (*pc3 != 0.0) {
1593           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1594           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1595           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1596           for (j=0; j<nz; j++) {
1597             col         = pj[j];
1598             rtmp3[col] -= mul3 * rtmp2[col];
1599           }
1600           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1601         }
1602 
1603         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1604         rs = 0.0;
1605         /* L part */
1606         pc3 = b->a + bi[i+2];
1607         pj  = b->j + bi[i+2];
1608         nz  = bi[i+3] - bi[i+2];
1609         for (j=0; j<nz; j++) {
1610           col    = pj[j];
1611           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1612         }
1613         /* U part */
1614         pc3 = b->a + bdiag[i+3]+1;
1615         pj  = b->j + bdiag[i+3]+1;
1616         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1617         for (j=0; j<nz; j++) {
1618           col    = pj[j];
1619           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1620         }
1621 
1622         sctx.rs = rs;
1623         sctx.pv = rtmp3[i+2];
1624         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1625         if (sctx.newshift) break;
1626         pc3  = b->a + bdiag[i+2];
1627         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1628         break;
1629       case 4:
1630         /*----------*/
1631         /* zero rtmp */
1632         /* L part */
1633         nz    = bi[i+1] - bi[i];
1634         bjtmp = bj + bi[i];
1635         for  (j=0; j<nz; j++) {
1636           col        = bjtmp[j];
1637           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1638         }
1639 
1640         /* U part */
1641         nz    = bdiag[i]-bdiag[i+1];
1642         bjtmp = bj + bdiag[i+1]+1;
1643         for  (j=0; j<nz; j++) {
1644           col        = bjtmp[j];
1645           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1646         }
1647 
1648         /* load in initial (unfactored row) */
1649         nz    = ai[r[i]+1] - ai[r[i]];
1650         ajtmp = aj + ai[r[i]];
1651         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1652         for (j=0; j<nz; j++) {
1653           col        = ics[ajtmp[j]];
1654           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1655         }
1656         /* ZeropivotApply(): shift the diagonal of the matrix  */
1657         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1658 
1659         /* elimination */
1660         bjtmp = bj + bi[i];
1661         row   = *bjtmp++; /* pivot row */
1662         nzL   = bi[i+1] - bi[i];
1663         for (k=0; k < nzL; k++) {
1664           pc1 = rtmp1 + row;
1665           pc2 = rtmp2 + row;
1666           pc3 = rtmp3 + row;
1667           pc4 = rtmp4 + row;
1668           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1669             pv   = b->a + bdiag[row];
1670             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1671             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1672 
1673             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1674             pv = b->a + bdiag[row+1]+1;
1675             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1676             for (j=0; j<nz; j++) {
1677               col         = pj[j];
1678               rtmp1[col] -= mul1 * pv[j];
1679               rtmp2[col] -= mul2 * pv[j];
1680               rtmp3[col] -= mul3 * pv[j];
1681               rtmp4[col] -= mul4 * pv[j];
1682             }
1683             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1684           }
1685           row = *bjtmp++;
1686         }
1687 
1688         /* finished row i; check zero pivot, then stick row i into b->a */
1689         rs = 0.0;
1690         /* L part */
1691         pc1 = b->a + bi[i];
1692         pj  = b->j + bi[i];
1693         nz  = bi[i+1] - bi[i];
1694         for (j=0; j<nz; j++) {
1695           col    = pj[j];
1696           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1697         }
1698         /* U part */
1699         pc1 = b->a + bdiag[i+1]+1;
1700         pj  = b->j + bdiag[i+1]+1;
1701         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1702         for (j=0; j<nz; j++) {
1703           col    = pj[j];
1704           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1705         }
1706 
1707         sctx.rs = rs;
1708         sctx.pv = rtmp1[i];
1709         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1710         if (sctx.newshift) break;
1711         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1712         *pc1 = 1.0/sctx.pv;
1713 
1714         /* Now take care of 1st column of diagonal 4x4 block. */
1715         pc2 = rtmp2 + i;
1716         pc3 = rtmp3 + i;
1717         pc4 = rtmp4 + i;
1718         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1719           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1720           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1721           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1722           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1723           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1724           for (j=0; j<nz; j++) {
1725             col         = pj[j];
1726             rtmp2[col] -= mul2 * rtmp1[col];
1727             rtmp3[col] -= mul3 * rtmp1[col];
1728             rtmp4[col] -= mul4 * rtmp1[col];
1729           }
1730           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1731         }
1732 
1733         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1734         rs = 0.0;
1735         /* L part */
1736         pc2 = b->a + bi[i+1];
1737         pj  = b->j + bi[i+1];
1738         nz  = bi[i+2] - bi[i+1];
1739         for (j=0; j<nz; j++) {
1740           col    = pj[j];
1741           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1742         }
1743         /* U part */
1744         pc2 = b->a + bdiag[i+2]+1;
1745         pj  = b->j + bdiag[i+2]+1;
1746         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1747         for (j=0; j<nz; j++) {
1748           col    = pj[j];
1749           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1750         }
1751 
1752         sctx.rs = rs;
1753         sctx.pv = rtmp2[i+1];
1754         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1755         if (sctx.newshift) break;
1756         pc2  = b->a + bdiag[i+1];
1757         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1758 
1759         /* Now take care of 2nd column of diagonal 4x4 block. */
1760         pc3 = rtmp3 + i+1;
1761         pc4 = rtmp4 + i+1;
1762         if (*pc3 != 0.0 || *pc4 != 0.0) {
1763           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1764           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1765           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1766           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1767           for (j=0; j<nz; j++) {
1768             col         = pj[j];
1769             rtmp3[col] -= mul3 * rtmp2[col];
1770             rtmp4[col] -= mul4 * rtmp2[col];
1771           }
1772           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1773         }
1774 
1775         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1776         rs = 0.0;
1777         /* L part */
1778         pc3 = b->a + bi[i+2];
1779         pj  = b->j + bi[i+2];
1780         nz  = bi[i+3] - bi[i+2];
1781         for (j=0; j<nz; j++) {
1782           col    = pj[j];
1783           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1784         }
1785         /* U part */
1786         pc3 = b->a + bdiag[i+3]+1;
1787         pj  = b->j + bdiag[i+3]+1;
1788         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1789         for (j=0; j<nz; j++) {
1790           col    = pj[j];
1791           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1792         }
1793 
1794         sctx.rs = rs;
1795         sctx.pv = rtmp3[i+2];
1796         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1797         if (sctx.newshift) break;
1798         pc3  = b->a + bdiag[i+2];
1799         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1800 
1801         /* Now take care of 3rd column of diagonal 4x4 block. */
1802         pc4 = rtmp4 + i+2;
1803         if (*pc4 != 0.0) {
1804           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1805           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1806           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1807           for (j=0; j<nz; j++) {
1808             col         = pj[j];
1809             rtmp4[col] -= mul4 * rtmp3[col];
1810           }
1811           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1812         }
1813 
1814         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1815         rs = 0.0;
1816         /* L part */
1817         pc4 = b->a + bi[i+3];
1818         pj  = b->j + bi[i+3];
1819         nz  = bi[i+4] - bi[i+3];
1820         for (j=0; j<nz; j++) {
1821           col    = pj[j];
1822           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1823         }
1824         /* U part */
1825         pc4 = b->a + bdiag[i+4]+1;
1826         pj  = b->j + bdiag[i+4]+1;
1827         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1828         for (j=0; j<nz; j++) {
1829           col    = pj[j];
1830           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1831         }
1832 
1833         sctx.rs = rs;
1834         sctx.pv = rtmp4[i+3];
1835         ierr    = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1836         if (sctx.newshift) break;
1837         pc4  = b->a + bdiag[i+3];
1838         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1839         break;
1840 
1841       default:
1842         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1843       }
1844       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1845       i += nodesz;                 /* Update the row */
1846     }
1847 
1848     /* MatPivotRefine() */
1849     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1850       /*
1851        * if no shift in this attempt & shifting & started shifting & can refine,
1852        * then try lower shift
1853        */
1854       sctx.shift_hi       = sctx.shift_fraction;
1855       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1856       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1857       sctx.newshift       = PETSC_TRUE;
1858       sctx.nshift++;
1859     }
1860   } while (sctx.newshift);
1861 
1862   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1863   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1864   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1865   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1866 
1867   if (b->inode.size) {
1868     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1869   } else {
1870     C->ops->solve           = MatSolve_SeqAIJ;
1871   }
1872   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1873   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1874   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1875   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1876   C->assembled              = PETSC_TRUE;
1877   C->preallocated           = PETSC_TRUE;
1878 
1879   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1880 
1881   /* MatShiftView(A,info,&sctx) */
1882   if (sctx.nshift) {
1883     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1884       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1885     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1886       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1887     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1888       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1889     }
1890   }
1891   PetscFunctionReturn(0);
1892 }
1893 
1894 #undef __FUNCT__
1895 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1896 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1897 {
1898   Mat             C     = B;
1899   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1900   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1901   PetscErrorCode  ierr;
1902   const PetscInt  *r,*ic,*c,*ics;
1903   PetscInt        n   = A->rmap->n,*bi = b->i;
1904   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1905   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1906   PetscInt        *ai = a->i,*aj = a->j;
1907   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1908   PetscScalar     mul1,mul2,mul3,tmp;
1909   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1910   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1911   PetscReal       rs=0.0;
1912   FactorShiftCtx  sctx;
1913 
1914   PetscFunctionBegin;
1915   sctx.shift_top      = 0;
1916   sctx.nshift_max     = 0;
1917   sctx.shift_lo       = 0;
1918   sctx.shift_hi       = 0;
1919   sctx.shift_fraction = 0;
1920 
1921   /* if both shift schemes are chosen by user, only use info->shiftpd */
1922   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1923     sctx.shift_top = 0;
1924     for (i=0; i<n; i++) {
1925       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1926       rs    = 0.0;
1927       ajtmp = aj + ai[i];
1928       rtmp1 = aa + ai[i];
1929       nz    = ai[i+1] - ai[i];
1930       for (j=0; j<nz; j++) {
1931         if (*ajtmp != i) {
1932           rs += PetscAbsScalar(*rtmp1++);
1933         } else {
1934           rs -= PetscRealPart(*rtmp1++);
1935         }
1936         ajtmp++;
1937       }
1938       if (rs>sctx.shift_top) sctx.shift_top = rs;
1939     }
1940     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1941     sctx.shift_top *= 1.1;
1942     sctx.nshift_max = 5;
1943     sctx.shift_lo   = 0.;
1944     sctx.shift_hi   = 1.;
1945   }
1946   sctx.shift_amount = 0;
1947   sctx.nshift       = 0;
1948 
1949   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1950   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1951   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1952   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1953   ics    = ic;
1954 
1955   node_max = a->inode.node_count;
1956   ns       = a->inode.size;
1957   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1958 
1959   /* If max inode size > 3, split it into two inodes.*/
1960   /* also map the inode sizes according to the ordering */
1961   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1962   for (i=0,j=0; i<node_max; ++i,++j) {
1963     if (ns[i]>3) {
1964       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1965       ++j;
1966       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1967     } else {
1968       tmp_vec1[j] = ns[i];
1969     }
1970   }
1971   /* Use the correct node_max */
1972   node_max = j;
1973 
1974   /* Now reorder the inode info based on mat re-ordering info */
1975   /* First create a row -> inode_size_array_index map */
1976   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1977   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1978   for (i=0,row=0; i<node_max; i++) {
1979     nodesz = tmp_vec1[i];
1980     for (j=0; j<nodesz; j++,row++) {
1981       nsmap[row] = i;
1982     }
1983   }
1984   /* Using nsmap, create a reordered ns structure */
1985   for (i=0,j=0; i< node_max; i++) {
1986     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1987     tmp_vec2[i] = nodesz;
1988     j          += nodesz;
1989   }
1990   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1991   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1992   /* Now use the correct ns */
1993   ns = tmp_vec2;
1994 
1995   do {
1996     sctx.newshift = PETSC_FALSE;
1997     /* Now loop over each block-row, and do the factorization */
1998     for (i=0,row=0; i<node_max; i++) {
1999       nodesz = ns[i];
2000       nz     = bi[row+1] - bi[row];
2001       bjtmp  = bj + bi[row];
2002 
2003       switch (nodesz) {
2004       case 1:
2005         for  (j=0; j<nz; j++) {
2006           idx         = bjtmp[j];
2007           rtmp11[idx] = 0.0;
2008         }
2009 
2010         /* load in initial (unfactored row) */
2011         idx    = r[row];
2012         nz_tmp = ai[idx+1] - ai[idx];
2013         ajtmp  = aj + ai[idx];
2014         v1     = aa + ai[idx];
2015 
2016         for (j=0; j<nz_tmp; j++) {
2017           idx         = ics[ajtmp[j]];
2018           rtmp11[idx] = v1[j];
2019         }
2020         rtmp11[ics[r[row]]] += sctx.shift_amount;
2021 
2022         prow = *bjtmp++;
2023         while (prow < row) {
2024           pc1 = rtmp11 + prow;
2025           if (*pc1 != 0.0) {
2026             pv     = ba + bd[prow];
2027             pj     = nbj + bd[prow];
2028             mul1   = *pc1 * *pv++;
2029             *pc1   = mul1;
2030             nz_tmp = bi[prow+1] - bd[prow] - 1;
2031             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2032             for (j=0; j<nz_tmp; j++) {
2033               tmp          = pv[j];
2034               idx          = pj[j];
2035               rtmp11[idx] -= mul1 * tmp;
2036             }
2037           }
2038           prow = *bjtmp++;
2039         }
2040         pj  = bj + bi[row];
2041         pc1 = ba + bi[row];
2042 
2043         sctx.pv     = rtmp11[row];
2044         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2045         rs          = 0.0;
2046         for (j=0; j<nz; j++) {
2047           idx    = pj[j];
2048           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2049           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2050         }
2051         sctx.rs = rs;
2052         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2053         if (sctx.newshift) goto endofwhile;
2054         break;
2055 
2056       case 2:
2057         for (j=0; j<nz; j++) {
2058           idx         = bjtmp[j];
2059           rtmp11[idx] = 0.0;
2060           rtmp22[idx] = 0.0;
2061         }
2062 
2063         /* load in initial (unfactored row) */
2064         idx    = r[row];
2065         nz_tmp = ai[idx+1] - ai[idx];
2066         ajtmp  = aj + ai[idx];
2067         v1     = aa + ai[idx];
2068         v2     = aa + ai[idx+1];
2069         for (j=0; j<nz_tmp; j++) {
2070           idx         = ics[ajtmp[j]];
2071           rtmp11[idx] = v1[j];
2072           rtmp22[idx] = v2[j];
2073         }
2074         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2075         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2076 
2077         prow = *bjtmp++;
2078         while (prow < row) {
2079           pc1 = rtmp11 + prow;
2080           pc2 = rtmp22 + prow;
2081           if (*pc1 != 0.0 || *pc2 != 0.0) {
2082             pv   = ba + bd[prow];
2083             pj   = nbj + bd[prow];
2084             mul1 = *pc1 * *pv;
2085             mul2 = *pc2 * *pv;
2086             ++pv;
2087             *pc1 = mul1;
2088             *pc2 = mul2;
2089 
2090             nz_tmp = bi[prow+1] - bd[prow] - 1;
2091             for (j=0; j<nz_tmp; j++) {
2092               tmp          = pv[j];
2093               idx          = pj[j];
2094               rtmp11[idx] -= mul1 * tmp;
2095               rtmp22[idx] -= mul2 * tmp;
2096             }
2097             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2098           }
2099           prow = *bjtmp++;
2100         }
2101 
2102         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2103         pc1 = rtmp11 + prow;
2104         pc2 = rtmp22 + prow;
2105 
2106         sctx.pv = *pc1;
2107         pj      = bj + bi[prow];
2108         rs      = 0.0;
2109         for (j=0; j<nz; j++) {
2110           idx = pj[j];
2111           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2112         }
2113         sctx.rs = rs;
2114         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2115         if (sctx.newshift) goto endofwhile;
2116 
2117         if (*pc2 != 0.0) {
2118           pj     = nbj + bd[prow];
2119           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2120           *pc2   = mul2;
2121           nz_tmp = bi[prow+1] - bd[prow] - 1;
2122           for (j=0; j<nz_tmp; j++) {
2123             idx          = pj[j];
2124             tmp          = rtmp11[idx];
2125             rtmp22[idx] -= mul2 * tmp;
2126           }
2127           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2128         }
2129 
2130         pj  = bj + bi[row];
2131         pc1 = ba + bi[row];
2132         pc2 = ba + bi[row+1];
2133 
2134         sctx.pv       = rtmp22[row+1];
2135         rs            = 0.0;
2136         rtmp11[row]   = 1.0/rtmp11[row];
2137         rtmp22[row+1] = 1.0/rtmp22[row+1];
2138         /* copy row entries from dense representation to sparse */
2139         for (j=0; j<nz; j++) {
2140           idx    = pj[j];
2141           pc1[j] = rtmp11[idx];
2142           pc2[j] = rtmp22[idx];
2143           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2144         }
2145         sctx.rs = rs;
2146         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2147         if (sctx.newshift) goto endofwhile;
2148         break;
2149 
2150       case 3:
2151         for  (j=0; j<nz; j++) {
2152           idx         = bjtmp[j];
2153           rtmp11[idx] = 0.0;
2154           rtmp22[idx] = 0.0;
2155           rtmp33[idx] = 0.0;
2156         }
2157         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2158         idx    = r[row];
2159         nz_tmp = ai[idx+1] - ai[idx];
2160         ajtmp  = aj + ai[idx];
2161         v1     = aa + ai[idx];
2162         v2     = aa + ai[idx+1];
2163         v3     = aa + ai[idx+2];
2164         for (j=0; j<nz_tmp; j++) {
2165           idx         = ics[ajtmp[j]];
2166           rtmp11[idx] = v1[j];
2167           rtmp22[idx] = v2[j];
2168           rtmp33[idx] = v3[j];
2169         }
2170         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2171         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2172         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2173 
2174         /* loop over all pivot row blocks above this row block */
2175         prow = *bjtmp++;
2176         while (prow < row) {
2177           pc1 = rtmp11 + prow;
2178           pc2 = rtmp22 + prow;
2179           pc3 = rtmp33 + prow;
2180           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2181             pv   = ba  + bd[prow];
2182             pj   = nbj + bd[prow];
2183             mul1 = *pc1 * *pv;
2184             mul2 = *pc2 * *pv;
2185             mul3 = *pc3 * *pv;
2186             ++pv;
2187             *pc1 = mul1;
2188             *pc2 = mul2;
2189             *pc3 = mul3;
2190 
2191             nz_tmp = bi[prow+1] - bd[prow] - 1;
2192             /* update this row based on pivot row */
2193             for (j=0; j<nz_tmp; j++) {
2194               tmp          = pv[j];
2195               idx          = pj[j];
2196               rtmp11[idx] -= mul1 * tmp;
2197               rtmp22[idx] -= mul2 * tmp;
2198               rtmp33[idx] -= mul3 * tmp;
2199             }
2200             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2201           }
2202           prow = *bjtmp++;
2203         }
2204 
2205         /* Now take care of diagonal 3x3 block in this set of rows */
2206         /* note: prow = row here */
2207         pc1 = rtmp11 + prow;
2208         pc2 = rtmp22 + prow;
2209         pc3 = rtmp33 + prow;
2210 
2211         sctx.pv = *pc1;
2212         pj      = bj + bi[prow];
2213         rs      = 0.0;
2214         for (j=0; j<nz; j++) {
2215           idx = pj[j];
2216           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2217         }
2218         sctx.rs = rs;
2219         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2220         if (sctx.newshift) goto endofwhile;
2221 
2222         if (*pc2 != 0.0 || *pc3 != 0.0) {
2223           mul2   = (*pc2)/(*pc1);
2224           mul3   = (*pc3)/(*pc1);
2225           *pc2   = mul2;
2226           *pc3   = mul3;
2227           nz_tmp = bi[prow+1] - bd[prow] - 1;
2228           pj     = nbj + bd[prow];
2229           for (j=0; j<nz_tmp; j++) {
2230             idx          = pj[j];
2231             tmp          = rtmp11[idx];
2232             rtmp22[idx] -= mul2 * tmp;
2233             rtmp33[idx] -= mul3 * tmp;
2234           }
2235           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2236         }
2237         ++prow;
2238 
2239         pc2     = rtmp22 + prow;
2240         pc3     = rtmp33 + prow;
2241         sctx.pv = *pc2;
2242         pj      = bj + bi[prow];
2243         rs      = 0.0;
2244         for (j=0; j<nz; j++) {
2245           idx = pj[j];
2246           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2247         }
2248         sctx.rs = rs;
2249         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2250         if (sctx.newshift) goto endofwhile;
2251 
2252         if (*pc3 != 0.0) {
2253           mul3   = (*pc3)/(*pc2);
2254           *pc3   = mul3;
2255           pj     = nbj + bd[prow];
2256           nz_tmp = bi[prow+1] - bd[prow] - 1;
2257           for (j=0; j<nz_tmp; j++) {
2258             idx          = pj[j];
2259             tmp          = rtmp22[idx];
2260             rtmp33[idx] -= mul3 * tmp;
2261           }
2262           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2263         }
2264 
2265         pj  = bj + bi[row];
2266         pc1 = ba + bi[row];
2267         pc2 = ba + bi[row+1];
2268         pc3 = ba + bi[row+2];
2269 
2270         sctx.pv       = rtmp33[row+2];
2271         rs            = 0.0;
2272         rtmp11[row]   = 1.0/rtmp11[row];
2273         rtmp22[row+1] = 1.0/rtmp22[row+1];
2274         rtmp33[row+2] = 1.0/rtmp33[row+2];
2275         /* copy row entries from dense representation to sparse */
2276         for (j=0; j<nz; j++) {
2277           idx    = pj[j];
2278           pc1[j] = rtmp11[idx];
2279           pc2[j] = rtmp22[idx];
2280           pc3[j] = rtmp33[idx];
2281           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2282         }
2283 
2284         sctx.rs = rs;
2285         ierr    = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2286         if (sctx.newshift) goto endofwhile;
2287         break;
2288 
2289       default:
2290         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2291       }
2292       row += nodesz;                 /* Update the row */
2293     }
2294 endofwhile:;
2295   } while (sctx.newshift);
2296   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2297   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2298   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2299   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2300   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2301 
2302   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2303   /* do not set solve add, since MatSolve_Inode + Add is faster */
2304   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2305   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2306   C->assembled              = PETSC_TRUE;
2307   C->preallocated           = PETSC_TRUE;
2308   if (sctx.nshift) {
2309     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2310       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2311     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2312       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2313     }
2314   }
2315   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2316   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2317   PetscFunctionReturn(0);
2318 }
2319 
2320 
2321 /* ----------------------------------------------------------- */
2322 #undef __FUNCT__
2323 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2324 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2325 {
2326   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2327   IS                iscol = a->col,isrow = a->row;
2328   PetscErrorCode    ierr;
2329   const PetscInt    *r,*c,*rout,*cout;
2330   PetscInt          i,j,n = A->rmap->n;
2331   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2332   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2333   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2334   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2335   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2336   const PetscScalar *b;
2337 
2338   PetscFunctionBegin;
2339   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2340   node_max = a->inode.node_count;
2341   ns       = a->inode.size;     /* Node Size array */
2342 
2343   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2344   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2345   tmp  = a->solve_work;
2346 
2347   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2348   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2349 
2350   /* forward solve the lower triangular */
2351   tmps = tmp;
2352   aa   = a_a;
2353   aj   = a_j;
2354   ad   = a->diag;
2355 
2356   for (i = 0,row = 0; i< node_max; ++i) {
2357     nsz = ns[i];
2358     aii = ai[row];
2359     v1  = aa + aii;
2360     vi  = aj + aii;
2361     nz  = ai[row+1]- ai[row];
2362 
2363     if (i < node_max-1) {
2364       /* Prefetch the indices for the next block */
2365       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2366       /* Prefetch the data for the next block */
2367       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2368     }
2369 
2370     switch (nsz) {               /* Each loop in 'case' is unrolled */
2371     case 1:
2372       sum1 = b[r[row]];
2373       for (j=0; j<nz-1; j+=2) {
2374         i0    = vi[j];
2375         i1    = vi[j+1];
2376         tmp0  = tmps[i0];
2377         tmp1  = tmps[i1];
2378         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2379       }
2380       if (j == nz-1) {
2381         tmp0  = tmps[vi[j]];
2382         sum1 -= v1[j]*tmp0;
2383       }
2384       tmp[row++]=sum1;
2385       break;
2386     case 2:
2387       sum1 = b[r[row]];
2388       sum2 = b[r[row+1]];
2389       v2   = aa + ai[row+1];
2390 
2391       for (j=0; j<nz-1; j+=2) {
2392         i0    = vi[j];
2393         i1    = vi[j+1];
2394         tmp0  = tmps[i0];
2395         tmp1  = tmps[i1];
2396         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2397         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2398       }
2399       if (j == nz-1) {
2400         tmp0  = tmps[vi[j]];
2401         sum1 -= v1[j] *tmp0;
2402         sum2 -= v2[j] *tmp0;
2403       }
2404       sum2     -= v2[nz] * sum1;
2405       tmp[row++]=sum1;
2406       tmp[row++]=sum2;
2407       break;
2408     case 3:
2409       sum1 = b[r[row]];
2410       sum2 = b[r[row+1]];
2411       sum3 = b[r[row+2]];
2412       v2   = aa + ai[row+1];
2413       v3   = aa + ai[row+2];
2414 
2415       for (j=0; j<nz-1; j+=2) {
2416         i0    = vi[j];
2417         i1    = vi[j+1];
2418         tmp0  = tmps[i0];
2419         tmp1  = tmps[i1];
2420         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2421         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2422         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2423       }
2424       if (j == nz-1) {
2425         tmp0  = tmps[vi[j]];
2426         sum1 -= v1[j] *tmp0;
2427         sum2 -= v2[j] *tmp0;
2428         sum3 -= v3[j] *tmp0;
2429       }
2430       sum2     -= v2[nz] * sum1;
2431       sum3     -= v3[nz] * sum1;
2432       sum3     -= v3[nz+1] * sum2;
2433       tmp[row++]=sum1;
2434       tmp[row++]=sum2;
2435       tmp[row++]=sum3;
2436       break;
2437 
2438     case 4:
2439       sum1 = b[r[row]];
2440       sum2 = b[r[row+1]];
2441       sum3 = b[r[row+2]];
2442       sum4 = b[r[row+3]];
2443       v2   = aa + ai[row+1];
2444       v3   = aa + ai[row+2];
2445       v4   = aa + ai[row+3];
2446 
2447       for (j=0; j<nz-1; j+=2) {
2448         i0    = vi[j];
2449         i1    = vi[j+1];
2450         tmp0  = tmps[i0];
2451         tmp1  = tmps[i1];
2452         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2453         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2454         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2455         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2456       }
2457       if (j == nz-1) {
2458         tmp0  = tmps[vi[j]];
2459         sum1 -= v1[j] *tmp0;
2460         sum2 -= v2[j] *tmp0;
2461         sum3 -= v3[j] *tmp0;
2462         sum4 -= v4[j] *tmp0;
2463       }
2464       sum2 -= v2[nz] * sum1;
2465       sum3 -= v3[nz] * sum1;
2466       sum4 -= v4[nz] * sum1;
2467       sum3 -= v3[nz+1] * sum2;
2468       sum4 -= v4[nz+1] * sum2;
2469       sum4 -= v4[nz+2] * sum3;
2470 
2471       tmp[row++]=sum1;
2472       tmp[row++]=sum2;
2473       tmp[row++]=sum3;
2474       tmp[row++]=sum4;
2475       break;
2476     case 5:
2477       sum1 = b[r[row]];
2478       sum2 = b[r[row+1]];
2479       sum3 = b[r[row+2]];
2480       sum4 = b[r[row+3]];
2481       sum5 = b[r[row+4]];
2482       v2   = aa + ai[row+1];
2483       v3   = aa + ai[row+2];
2484       v4   = aa + ai[row+3];
2485       v5   = aa + ai[row+4];
2486 
2487       for (j=0; j<nz-1; j+=2) {
2488         i0    = vi[j];
2489         i1    = vi[j+1];
2490         tmp0  = tmps[i0];
2491         tmp1  = tmps[i1];
2492         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2493         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2494         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2495         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2496         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2497       }
2498       if (j == nz-1) {
2499         tmp0  = tmps[vi[j]];
2500         sum1 -= v1[j] *tmp0;
2501         sum2 -= v2[j] *tmp0;
2502         sum3 -= v3[j] *tmp0;
2503         sum4 -= v4[j] *tmp0;
2504         sum5 -= v5[j] *tmp0;
2505       }
2506 
2507       sum2 -= v2[nz] * sum1;
2508       sum3 -= v3[nz] * sum1;
2509       sum4 -= v4[nz] * sum1;
2510       sum5 -= v5[nz] * sum1;
2511       sum3 -= v3[nz+1] * sum2;
2512       sum4 -= v4[nz+1] * sum2;
2513       sum5 -= v5[nz+1] * sum2;
2514       sum4 -= v4[nz+2] * sum3;
2515       sum5 -= v5[nz+2] * sum3;
2516       sum5 -= v5[nz+3] * sum4;
2517 
2518       tmp[row++]=sum1;
2519       tmp[row++]=sum2;
2520       tmp[row++]=sum3;
2521       tmp[row++]=sum4;
2522       tmp[row++]=sum5;
2523       break;
2524     default:
2525       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2526     }
2527   }
2528   /* backward solve the upper triangular */
2529   for (i=node_max -1,row = n-1; i>=0; i--) {
2530     nsz = ns[i];
2531     aii = ad[row+1] + 1;
2532     v1  = aa + aii;
2533     vi  = aj + aii;
2534     nz  = ad[row]- ad[row+1] - 1;
2535 
2536     if (i > 0) {
2537       /* Prefetch the indices for the next block */
2538       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2539       /* Prefetch the data for the next block */
2540       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2541     }
2542 
2543     switch (nsz) {               /* Each loop in 'case' is unrolled */
2544     case 1:
2545       sum1 = tmp[row];
2546 
2547       for (j=0; j<nz-1; j+=2) {
2548         i0    = vi[j];
2549         i1    = vi[j+1];
2550         tmp0  = tmps[i0];
2551         tmp1  = tmps[i1];
2552         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2553       }
2554       if (j == nz-1) {
2555         tmp0  = tmps[vi[j]];
2556         sum1 -= v1[j]*tmp0;
2557       }
2558       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2559       break;
2560     case 2:
2561       sum1 = tmp[row];
2562       sum2 = tmp[row-1];
2563       v2   = aa + ad[row] + 1;
2564       for (j=0; j<nz-1; j+=2) {
2565         i0    = vi[j];
2566         i1    = vi[j+1];
2567         tmp0  = tmps[i0];
2568         tmp1  = tmps[i1];
2569         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2570         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2571       }
2572       if (j == nz-1) {
2573         tmp0  = tmps[vi[j]];
2574         sum1 -= v1[j]* tmp0;
2575         sum2 -= v2[j+1]* tmp0;
2576       }
2577 
2578       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2579       sum2     -= v2[0] * tmp0;
2580       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2581       break;
2582     case 3:
2583       sum1 = tmp[row];
2584       sum2 = tmp[row -1];
2585       sum3 = tmp[row -2];
2586       v2   = aa + ad[row] + 1;
2587       v3   = aa + ad[row -1] + 1;
2588       for (j=0; j<nz-1; j+=2) {
2589         i0    = vi[j];
2590         i1    = vi[j+1];
2591         tmp0  = tmps[i0];
2592         tmp1  = tmps[i1];
2593         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2594         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2595         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2596       }
2597       if (j== nz-1) {
2598         tmp0  = tmps[vi[j]];
2599         sum1 -= v1[j] * tmp0;
2600         sum2 -= v2[j+1] * tmp0;
2601         sum3 -= v3[j+2] * tmp0;
2602       }
2603       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604       sum2     -= v2[0]* tmp0;
2605       sum3     -= v3[1] * tmp0;
2606       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2607       sum3     -= v3[0]* tmp0;
2608       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2609 
2610       break;
2611     case 4:
2612       sum1 = tmp[row];
2613       sum2 = tmp[row -1];
2614       sum3 = tmp[row -2];
2615       sum4 = tmp[row -3];
2616       v2   = aa + ad[row]+1;
2617       v3   = aa + ad[row -1]+1;
2618       v4   = aa + ad[row -2]+1;
2619 
2620       for (j=0; j<nz-1; j+=2) {
2621         i0    = vi[j];
2622         i1    = vi[j+1];
2623         tmp0  = tmps[i0];
2624         tmp1  = tmps[i1];
2625         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2626         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629       }
2630       if (j== nz-1) {
2631         tmp0  = tmps[vi[j]];
2632         sum1 -= v1[j] * tmp0;
2633         sum2 -= v2[j+1] * tmp0;
2634         sum3 -= v3[j+2] * tmp0;
2635         sum4 -= v4[j+3] * tmp0;
2636       }
2637 
2638       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2639       sum2     -= v2[0] * tmp0;
2640       sum3     -= v3[1] * tmp0;
2641       sum4     -= v4[2] * tmp0;
2642       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2643       sum3     -= v3[0] * tmp0;
2644       sum4     -= v4[1] * tmp0;
2645       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2646       sum4     -= v4[0] * tmp0;
2647       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2648       break;
2649     case 5:
2650       sum1 = tmp[row];
2651       sum2 = tmp[row -1];
2652       sum3 = tmp[row -2];
2653       sum4 = tmp[row -3];
2654       sum5 = tmp[row -4];
2655       v2   = aa + ad[row]+1;
2656       v3   = aa + ad[row -1]+1;
2657       v4   = aa + ad[row -2]+1;
2658       v5   = aa + ad[row -3]+1;
2659       for (j=0; j<nz-1; j+=2) {
2660         i0    = vi[j];
2661         i1    = vi[j+1];
2662         tmp0  = tmps[i0];
2663         tmp1  = tmps[i1];
2664         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2665         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2666         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2667         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2668         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2669       }
2670       if (j==nz-1) {
2671         tmp0  = tmps[vi[j]];
2672         sum1 -= v1[j] * tmp0;
2673         sum2 -= v2[j+1] * tmp0;
2674         sum3 -= v3[j+2] * tmp0;
2675         sum4 -= v4[j+3] * tmp0;
2676         sum5 -= v5[j+4] * tmp0;
2677       }
2678 
2679       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2680       sum2     -= v2[0] * tmp0;
2681       sum3     -= v3[1] * tmp0;
2682       sum4     -= v4[2] * tmp0;
2683       sum5     -= v5[3] * tmp0;
2684       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2685       sum3     -= v3[0] * tmp0;
2686       sum4     -= v4[1] * tmp0;
2687       sum5     -= v5[2] * tmp0;
2688       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2689       sum4     -= v4[0] * tmp0;
2690       sum5     -= v5[1] * tmp0;
2691       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2692       sum5     -= v5[0] * tmp0;
2693       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2694       break;
2695     default:
2696       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2697     }
2698   }
2699   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2700   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2701   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2702   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2703   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2704   PetscFunctionReturn(0);
2705 }
2706 
2707 
2708 /*
2709      Makes a longer coloring[] array and calls the usual code with that
2710 */
2711 #undef __FUNCT__
2712 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2713 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2714 {
2715   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2716   PetscErrorCode  ierr;
2717   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2718   PetscInt        *colorused,i;
2719   ISColoringValue *newcolor;
2720 
2721   PetscFunctionBegin;
2722   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2723   /* loop over inodes, marking a color for each column*/
2724   row = 0;
2725   for (i=0; i<m; i++) {
2726     for (j=0; j<ns[i]; j++) {
2727       newcolor[row++] = coloring[i] + j*ncolors;
2728     }
2729   }
2730 
2731   /* eliminate unneeded colors */
2732   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2733   for (i=0; i<n; i++) {
2734     colorused[newcolor[i]] = 1;
2735   }
2736 
2737   for (i=1; i<5*ncolors; i++) {
2738     colorused[i] += colorused[i-1];
2739   }
2740   ncolors = colorused[5*ncolors-1];
2741   for (i=0; i<n; i++) {
2742     newcolor[i] = colorused[newcolor[i]]-1;
2743   }
2744   ierr = PetscFree(colorused);CHKERRQ(ierr);
2745   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2746   ierr = PetscFree(coloring);CHKERRQ(ierr);
2747   PetscFunctionReturn(0);
2748 }
2749 
2750 #include <petsc/private/kernels/blockinvert.h>
2751 
2752 #undef __FUNCT__
2753 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2754 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2755 {
2756   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2757   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2758   MatScalar         *ibdiag,*bdiag,work[25],*t;
2759   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2760   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2761   const PetscScalar *xb, *b;
2762   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2763   PetscErrorCode    ierr;
2764   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2765   PetscInt          sz,k,ipvt[5];
2766   PetscBool         allowzeropivot,zeropivotdetected;
2767   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2768 
2769   PetscFunctionBegin;
2770   allowzeropivot = PetscNot(A->erroriffailure);
2771   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2772   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2773   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2774 
2775   if (!a->inode.ibdiagvalid) {
2776     if (!a->inode.ibdiag) {
2777       /* calculate space needed for diagonal blocks */
2778       for (i=0; i<m; i++) {
2779         cnt += sizes[i]*sizes[i];
2780       }
2781       a->inode.bdiagsize = cnt;
2782 
2783       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2784     }
2785 
2786     /* copy over the diagonal blocks and invert them */
2787     ibdiag = a->inode.ibdiag;
2788     bdiag  = a->inode.bdiag;
2789     cnt    = 0;
2790     for (i=0, row = 0; i<m; i++) {
2791       for (j=0; j<sizes[i]; j++) {
2792         for (k=0; k<sizes[i]; k++) {
2793           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2794         }
2795       }
2796       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2797 
2798       switch (sizes[i]) {
2799       case 1:
2800         /* Create matrix data structure */
2801         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2802           if (allowzeropivot) {
2803             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2804             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2805             A->factorerror_zeropivot_row   = row;
2806             ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr);
2807           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2808         }
2809         ibdiag[cnt] = 1.0/ibdiag[cnt];
2810         break;
2811       case 2:
2812         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2813         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2814         break;
2815       case 3:
2816         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2817         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2818         break;
2819       case 4:
2820         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2821         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2822         break;
2823       case 5:
2824         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2825         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2826         break;
2827       default:
2828         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2829       }
2830       cnt += sizes[i]*sizes[i];
2831       row += sizes[i];
2832     }
2833     a->inode.ibdiagvalid = PETSC_TRUE;
2834   }
2835   ibdiag = a->inode.ibdiag;
2836   bdiag  = a->inode.bdiag;
2837   t      = a->inode.ssor_work;
2838 
2839   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2840   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2841   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2842   if (flag & SOR_ZERO_INITIAL_GUESS) {
2843     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2844 
2845       for (i=0, row=0; i<m; i++) {
2846         sz  = diag[row] - ii[row];
2847         v1  = a->a + ii[row];
2848         idx = a->j + ii[row];
2849 
2850         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2851         switch (sizes[i]) {
2852         case 1:
2853 
2854           sum1 = b[row];
2855           for (n = 0; n<sz-1; n+=2) {
2856             i1    = idx[0];
2857             i2    = idx[1];
2858             idx  += 2;
2859             tmp0  = x[i1];
2860             tmp1  = x[i2];
2861             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2862           }
2863 
2864           if (n == sz-1) {
2865             tmp0  = x[*idx];
2866             sum1 -= *v1 * tmp0;
2867           }
2868           t[row]   = sum1;
2869           x[row++] = sum1*(*ibdiag++);
2870           break;
2871         case 2:
2872           v2   = a->a + ii[row+1];
2873           sum1 = b[row];
2874           sum2 = b[row+1];
2875           for (n = 0; n<sz-1; n+=2) {
2876             i1    = idx[0];
2877             i2    = idx[1];
2878             idx  += 2;
2879             tmp0  = x[i1];
2880             tmp1  = x[i2];
2881             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2882             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2883           }
2884 
2885           if (n == sz-1) {
2886             tmp0  = x[*idx];
2887             sum1 -= v1[0] * tmp0;
2888             sum2 -= v2[0] * tmp0;
2889           }
2890           t[row]   = sum1;
2891           t[row+1] = sum2;
2892           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2893           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2894           ibdiag  += 4;
2895           break;
2896         case 3:
2897           v2   = a->a + ii[row+1];
2898           v3   = a->a + ii[row+2];
2899           sum1 = b[row];
2900           sum2 = b[row+1];
2901           sum3 = b[row+2];
2902           for (n = 0; n<sz-1; n+=2) {
2903             i1    = idx[0];
2904             i2    = idx[1];
2905             idx  += 2;
2906             tmp0  = x[i1];
2907             tmp1  = x[i2];
2908             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2909             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2910             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2911           }
2912 
2913           if (n == sz-1) {
2914             tmp0  = x[*idx];
2915             sum1 -= v1[0] * tmp0;
2916             sum2 -= v2[0] * tmp0;
2917             sum3 -= v3[0] * tmp0;
2918           }
2919           t[row]   = sum1;
2920           t[row+1] = sum2;
2921           t[row+2] = sum3;
2922           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2923           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2924           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2925           ibdiag  += 9;
2926           break;
2927         case 4:
2928           v2   = a->a + ii[row+1];
2929           v3   = a->a + ii[row+2];
2930           v4   = a->a + ii[row+3];
2931           sum1 = b[row];
2932           sum2 = b[row+1];
2933           sum3 = b[row+2];
2934           sum4 = b[row+3];
2935           for (n = 0; n<sz-1; n+=2) {
2936             i1    = idx[0];
2937             i2    = idx[1];
2938             idx  += 2;
2939             tmp0  = x[i1];
2940             tmp1  = x[i2];
2941             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2942             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2943             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2944             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2945           }
2946 
2947           if (n == sz-1) {
2948             tmp0  = x[*idx];
2949             sum1 -= v1[0] * tmp0;
2950             sum2 -= v2[0] * tmp0;
2951             sum3 -= v3[0] * tmp0;
2952             sum4 -= v4[0] * tmp0;
2953           }
2954           t[row]   = sum1;
2955           t[row+1] = sum2;
2956           t[row+2] = sum3;
2957           t[row+3] = sum4;
2958           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2959           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2960           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2961           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2962           ibdiag  += 16;
2963           break;
2964         case 5:
2965           v2   = a->a + ii[row+1];
2966           v3   = a->a + ii[row+2];
2967           v4   = a->a + ii[row+3];
2968           v5   = a->a + ii[row+4];
2969           sum1 = b[row];
2970           sum2 = b[row+1];
2971           sum3 = b[row+2];
2972           sum4 = b[row+3];
2973           sum5 = b[row+4];
2974           for (n = 0; n<sz-1; n+=2) {
2975             i1    = idx[0];
2976             i2    = idx[1];
2977             idx  += 2;
2978             tmp0  = x[i1];
2979             tmp1  = x[i2];
2980             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2981             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2982             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2983             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2984             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2985           }
2986 
2987           if (n == sz-1) {
2988             tmp0  = x[*idx];
2989             sum1 -= v1[0] * tmp0;
2990             sum2 -= v2[0] * tmp0;
2991             sum3 -= v3[0] * tmp0;
2992             sum4 -= v4[0] * tmp0;
2993             sum5 -= v5[0] * tmp0;
2994           }
2995           t[row]   = sum1;
2996           t[row+1] = sum2;
2997           t[row+2] = sum3;
2998           t[row+3] = sum4;
2999           t[row+4] = sum5;
3000           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3001           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3002           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3003           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3004           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3005           ibdiag  += 25;
3006           break;
3007         default:
3008           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3009         }
3010       }
3011 
3012       xb   = t;
3013       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3014     } else xb = b;
3015     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3016 
3017       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3018       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3019         ibdiag -= sizes[i]*sizes[i];
3020         sz      = ii[row+1] - diag[row] - 1;
3021         v1      = a->a + diag[row] + 1;
3022         idx     = a->j + diag[row] + 1;
3023 
3024         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3025         switch (sizes[i]) {
3026         case 1:
3027 
3028           sum1 = xb[row];
3029           for (n = 0; n<sz-1; n+=2) {
3030             i1    = idx[0];
3031             i2    = idx[1];
3032             idx  += 2;
3033             tmp0  = x[i1];
3034             tmp1  = x[i2];
3035             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3036           }
3037 
3038           if (n == sz-1) {
3039             tmp0  = x[*idx];
3040             sum1 -= *v1*tmp0;
3041           }
3042           x[row--] = sum1*(*ibdiag);
3043           break;
3044 
3045         case 2:
3046 
3047           sum1 = xb[row];
3048           sum2 = xb[row-1];
3049           /* note that sum1 is associated with the second of the two rows */
3050           v2 = a->a + diag[row-1] + 2;
3051           for (n = 0; n<sz-1; n+=2) {
3052             i1    = idx[0];
3053             i2    = idx[1];
3054             idx  += 2;
3055             tmp0  = x[i1];
3056             tmp1  = x[i2];
3057             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3058             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3059           }
3060 
3061           if (n == sz-1) {
3062             tmp0  = x[*idx];
3063             sum1 -= *v1*tmp0;
3064             sum2 -= *v2*tmp0;
3065           }
3066           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3067           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3068           break;
3069         case 3:
3070 
3071           sum1 = xb[row];
3072           sum2 = xb[row-1];
3073           sum3 = xb[row-2];
3074           v2   = a->a + diag[row-1] + 2;
3075           v3   = a->a + diag[row-2] + 3;
3076           for (n = 0; n<sz-1; n+=2) {
3077             i1    = idx[0];
3078             i2    = idx[1];
3079             idx  += 2;
3080             tmp0  = x[i1];
3081             tmp1  = x[i2];
3082             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3083             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3084             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3085           }
3086 
3087           if (n == sz-1) {
3088             tmp0  = x[*idx];
3089             sum1 -= *v1*tmp0;
3090             sum2 -= *v2*tmp0;
3091             sum3 -= *v3*tmp0;
3092           }
3093           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3094           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3095           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3096           break;
3097         case 4:
3098 
3099           sum1 = xb[row];
3100           sum2 = xb[row-1];
3101           sum3 = xb[row-2];
3102           sum4 = xb[row-3];
3103           v2   = a->a + diag[row-1] + 2;
3104           v3   = a->a + diag[row-2] + 3;
3105           v4   = a->a + diag[row-3] + 4;
3106           for (n = 0; n<sz-1; n+=2) {
3107             i1    = idx[0];
3108             i2    = idx[1];
3109             idx  += 2;
3110             tmp0  = x[i1];
3111             tmp1  = x[i2];
3112             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3113             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3114             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3115             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3116           }
3117 
3118           if (n == sz-1) {
3119             tmp0  = x[*idx];
3120             sum1 -= *v1*tmp0;
3121             sum2 -= *v2*tmp0;
3122             sum3 -= *v3*tmp0;
3123             sum4 -= *v4*tmp0;
3124           }
3125           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3126           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3127           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3128           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3129           break;
3130         case 5:
3131 
3132           sum1 = xb[row];
3133           sum2 = xb[row-1];
3134           sum3 = xb[row-2];
3135           sum4 = xb[row-3];
3136           sum5 = xb[row-4];
3137           v2   = a->a + diag[row-1] + 2;
3138           v3   = a->a + diag[row-2] + 3;
3139           v4   = a->a + diag[row-3] + 4;
3140           v5   = a->a + diag[row-4] + 5;
3141           for (n = 0; n<sz-1; n+=2) {
3142             i1    = idx[0];
3143             i2    = idx[1];
3144             idx  += 2;
3145             tmp0  = x[i1];
3146             tmp1  = x[i2];
3147             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3148             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3149             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3150             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3151             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3152           }
3153 
3154           if (n == sz-1) {
3155             tmp0  = x[*idx];
3156             sum1 -= *v1*tmp0;
3157             sum2 -= *v2*tmp0;
3158             sum3 -= *v3*tmp0;
3159             sum4 -= *v4*tmp0;
3160             sum5 -= *v5*tmp0;
3161           }
3162           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3163           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3164           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3165           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3166           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3167           break;
3168         default:
3169           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3170         }
3171       }
3172 
3173       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3174     }
3175     its--;
3176   }
3177   while (its--) {
3178 
3179     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3180       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3181            i<m;
3182            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3183 
3184         sz  = diag[row] - ii[row];
3185         v1  = a->a + ii[row];
3186         idx = a->j + ii[row];
3187         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3188         switch (sizes[i]) {
3189         case 1:
3190           sum1 = b[row];
3191           for (n = 0; n<sz-1; n+=2) {
3192             i1    = idx[0];
3193             i2    = idx[1];
3194             idx  += 2;
3195             tmp0  = x[i1];
3196             tmp1  = x[i2];
3197             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3198           }
3199           if (n == sz-1) {
3200             tmp0  = x[*idx++];
3201             sum1 -= *v1 * tmp0;
3202             v1++;
3203           }
3204           t[row]   = sum1;
3205           sz      = ii[row+1] - diag[row] - 1;
3206           idx     = a->j + diag[row] + 1;
3207           v1 += 1;
3208           for (n = 0; n<sz-1; n+=2) {
3209             i1    = idx[0];
3210             i2    = idx[1];
3211             idx  += 2;
3212             tmp0  = x[i1];
3213             tmp1  = x[i2];
3214             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3215           }
3216           if (n == sz-1) {
3217             tmp0  = x[*idx++];
3218             sum1 -= *v1 * tmp0;
3219           }
3220           /* in MatSOR_SeqAIJ this line would be
3221            *
3222            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3223            *
3224            * but omega == 1, so this becomes
3225            *
3226            * x[row] = sum1*(*ibdiag++);
3227            *
3228            */
3229           x[row] = sum1*(*ibdiag);
3230           break;
3231         case 2:
3232           v2   = a->a + ii[row+1];
3233           sum1 = b[row];
3234           sum2 = b[row+1];
3235           for (n = 0; n<sz-1; n+=2) {
3236             i1    = idx[0];
3237             i2    = idx[1];
3238             idx  += 2;
3239             tmp0  = x[i1];
3240             tmp1  = x[i2];
3241             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3242             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3243           }
3244           if (n == sz-1) {
3245             tmp0  = x[*idx++];
3246             sum1 -= v1[0] * tmp0;
3247             sum2 -= v2[0] * tmp0;
3248             v1++; v2++;
3249           }
3250           t[row]   = sum1;
3251           t[row+1] = sum2;
3252           sz      = ii[row+1] - diag[row] - 2;
3253           idx     = a->j + diag[row] + 2;
3254           v1 += 2;
3255           v2 += 2;
3256           for (n = 0; n<sz-1; n+=2) {
3257             i1    = idx[0];
3258             i2    = idx[1];
3259             idx  += 2;
3260             tmp0  = x[i1];
3261             tmp1  = x[i2];
3262             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3263             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3264           }
3265           if (n == sz-1) {
3266             tmp0  = x[*idx];
3267             sum1 -= v1[0] * tmp0;
3268             sum2 -= v2[0] * tmp0;
3269           }
3270           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3271           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3272           break;
3273         case 3:
3274           v2   = a->a + ii[row+1];
3275           v3   = a->a + ii[row+2];
3276           sum1 = b[row];
3277           sum2 = b[row+1];
3278           sum3 = b[row+2];
3279           for (n = 0; n<sz-1; n+=2) {
3280             i1    = idx[0];
3281             i2    = idx[1];
3282             idx  += 2;
3283             tmp0  = x[i1];
3284             tmp1  = x[i2];
3285             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3286             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3287             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3288           }
3289           if (n == sz-1) {
3290             tmp0  = x[*idx++];
3291             sum1 -= v1[0] * tmp0;
3292             sum2 -= v2[0] * tmp0;
3293             sum3 -= v3[0] * tmp0;
3294             v1++; v2++; v3++;
3295           }
3296           t[row]   = sum1;
3297           t[row+1] = sum2;
3298           t[row+2] = sum3;
3299           sz      = ii[row+1] - diag[row] - 3;
3300           idx     = a->j + diag[row] + 3;
3301           v1 += 3;
3302           v2 += 3;
3303           v3 += 3;
3304           for (n = 0; n<sz-1; n+=2) {
3305             i1    = idx[0];
3306             i2    = idx[1];
3307             idx  += 2;
3308             tmp0  = x[i1];
3309             tmp1  = x[i2];
3310             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3311             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3312             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3313           }
3314           if (n == sz-1) {
3315             tmp0  = x[*idx];
3316             sum1 -= v1[0] * tmp0;
3317             sum2 -= v2[0] * tmp0;
3318             sum3 -= v3[0] * tmp0;
3319           }
3320           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3321           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3322           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3323           break;
3324         case 4:
3325           v2   = a->a + ii[row+1];
3326           v3   = a->a + ii[row+2];
3327           v4   = a->a + ii[row+3];
3328           sum1 = b[row];
3329           sum2 = b[row+1];
3330           sum3 = b[row+2];
3331           sum4 = b[row+3];
3332           for (n = 0; n<sz-1; n+=2) {
3333             i1    = idx[0];
3334             i2    = idx[1];
3335             idx  += 2;
3336             tmp0  = x[i1];
3337             tmp1  = x[i2];
3338             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3339             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3340             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3341             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3342           }
3343           if (n == sz-1) {
3344             tmp0  = x[*idx++];
3345             sum1 -= v1[0] * tmp0;
3346             sum2 -= v2[0] * tmp0;
3347             sum3 -= v3[0] * tmp0;
3348             sum4 -= v4[0] * tmp0;
3349             v1++; v2++; v3++; v4++;
3350           }
3351           t[row]   = sum1;
3352           t[row+1] = sum2;
3353           t[row+2] = sum3;
3354           t[row+3] = sum4;
3355           sz      = ii[row+1] - diag[row] - 4;
3356           idx     = a->j + diag[row] + 4;
3357           v1 += 4;
3358           v2 += 4;
3359           v3 += 4;
3360           v4 += 4;
3361           for (n = 0; n<sz-1; n+=2) {
3362             i1    = idx[0];
3363             i2    = idx[1];
3364             idx  += 2;
3365             tmp0  = x[i1];
3366             tmp1  = x[i2];
3367             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3368             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3369             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3370             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3371           }
3372           if (n == sz-1) {
3373             tmp0  = x[*idx];
3374             sum1 -= v1[0] * tmp0;
3375             sum2 -= v2[0] * tmp0;
3376             sum3 -= v3[0] * tmp0;
3377             sum4 -= v4[0] * tmp0;
3378           }
3379           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3380           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3381           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3382           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3383           break;
3384         case 5:
3385           v2   = a->a + ii[row+1];
3386           v3   = a->a + ii[row+2];
3387           v4   = a->a + ii[row+3];
3388           v5   = a->a + ii[row+4];
3389           sum1 = b[row];
3390           sum2 = b[row+1];
3391           sum3 = b[row+2];
3392           sum4 = b[row+3];
3393           sum5 = b[row+4];
3394           for (n = 0; n<sz-1; n+=2) {
3395             i1    = idx[0];
3396             i2    = idx[1];
3397             idx  += 2;
3398             tmp0  = x[i1];
3399             tmp1  = x[i2];
3400             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3401             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3402             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3403             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3404             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3405           }
3406           if (n == sz-1) {
3407             tmp0  = x[*idx++];
3408             sum1 -= v1[0] * tmp0;
3409             sum2 -= v2[0] * tmp0;
3410             sum3 -= v3[0] * tmp0;
3411             sum4 -= v4[0] * tmp0;
3412             sum5 -= v5[0] * tmp0;
3413             v1++; v2++; v3++; v4++; v5++;
3414           }
3415           t[row]   = sum1;
3416           t[row+1] = sum2;
3417           t[row+2] = sum3;
3418           t[row+3] = sum4;
3419           t[row+4] = sum5;
3420           sz      = ii[row+1] - diag[row] - 5;
3421           idx     = a->j + diag[row] + 5;
3422           v1 += 5;
3423           v2 += 5;
3424           v3 += 5;
3425           v4 += 5;
3426           v5 += 5;
3427           for (n = 0; n<sz-1; n+=2) {
3428             i1    = idx[0];
3429             i2    = idx[1];
3430             idx  += 2;
3431             tmp0  = x[i1];
3432             tmp1  = x[i2];
3433             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3434             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3435             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3436             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3437             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3438           }
3439           if (n == sz-1) {
3440             tmp0  = x[*idx];
3441             sum1 -= v1[0] * tmp0;
3442             sum2 -= v2[0] * tmp0;
3443             sum3 -= v3[0] * tmp0;
3444             sum4 -= v4[0] * tmp0;
3445             sum5 -= v5[0] * tmp0;
3446           }
3447           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3448           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3449           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3450           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3451           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3452           break;
3453         default:
3454           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3455         }
3456       }
3457       xb   = t;
3458       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3459     } else xb = b;
3460 
3461     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3462 
3463       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3464       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3465         ibdiag -= sizes[i]*sizes[i];
3466 
3467         /* set RHS */
3468         if (xb == b) {
3469           /* whole (old way) */
3470           sz      = ii[row+1] - ii[row];
3471           idx     = a->j + ii[row];
3472           switch (sizes[i]) {
3473           case 5:
3474             v5      = a->a + ii[row-4];
3475           case 4: /* fall through */
3476             v4      = a->a + ii[row-3];
3477           case 3:
3478             v3      = a->a + ii[row-2];
3479           case 2:
3480             v2      = a->a + ii[row-1];
3481           case 1:
3482             v1      = a->a + ii[row];
3483             break;
3484           default:
3485             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3486           }
3487         } else {
3488           /* upper, no diag */
3489           sz      = ii[row+1] - diag[row] - 1;
3490           idx     = a->j + diag[row] + 1;
3491           switch (sizes[i]) {
3492           case 5:
3493             v5      = a->a + diag[row-4] + 5;
3494           case 4: /* fall through */
3495             v4      = a->a + diag[row-3] + 4;
3496           case 3:
3497             v3      = a->a + diag[row-2] + 3;
3498           case 2:
3499             v2      = a->a + diag[row-1] + 2;
3500           case 1:
3501             v1      = a->a + diag[row] + 1;
3502           }
3503         }
3504         /* set sum */
3505         switch (sizes[i]) {
3506         case 5:
3507           sum5 = xb[row-4];
3508         case 4: /* fall through */
3509           sum4 = xb[row-3];
3510         case 3:
3511           sum3 = xb[row-2];
3512         case 2:
3513           sum2 = xb[row-1];
3514         case 1:
3515           /* note that sum1 is associated with the last row */
3516           sum1 = xb[row];
3517         }
3518         /* do sums */
3519         for (n = 0; n<sz-1; n+=2) {
3520             i1    = idx[0];
3521             i2    = idx[1];
3522             idx  += 2;
3523             tmp0  = x[i1];
3524             tmp1  = x[i2];
3525             switch (sizes[i]) {
3526             case 5:
3527               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3528             case 4: /* fall through */
3529               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3530             case 3:
3531               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3532             case 2:
3533               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3534             case 1:
3535               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3536             }
3537         }
3538         /* ragged edge */
3539         if (n == sz-1) {
3540           tmp0  = x[*idx];
3541           switch (sizes[i]) {
3542           case 5:
3543             sum5 -= *v5*tmp0;
3544           case 4: /* fall through */
3545             sum4 -= *v4*tmp0;
3546           case 3:
3547             sum3 -= *v3*tmp0;
3548           case 2:
3549             sum2 -= *v2*tmp0;
3550           case 1:
3551             sum1 -= *v1*tmp0;
3552           }
3553         }
3554         /* update */
3555         if (xb == b) {
3556           /* whole (old way) w/ diag */
3557           switch (sizes[i]) {
3558           case 5:
3559             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3560             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3561             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3562             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3563             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3564             break;
3565           case 4:
3566             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3567             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3568             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3569             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3570             break;
3571           case 3:
3572             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3573             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3574             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3575             break;
3576           case 2:
3577             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3578             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3579             break;
3580           case 1:
3581             x[row--] += sum1*(*ibdiag);
3582             break;
3583           }
3584         } else {
3585           /* no diag so set =  */
3586           switch (sizes[i]) {
3587           case 5:
3588             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3589             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3590             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3591             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3592             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3593             break;
3594           case 4:
3595             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3596             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3597             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3598             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3599             break;
3600           case 3:
3601             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3602             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3603             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3604             break;
3605           case 2:
3606             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3607             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3608             break;
3609           case 1:
3610             x[row--] = sum1*(*ibdiag);
3611             break;
3612           }
3613         }
3614       }
3615       if (xb == b) {
3616         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3617       } else {
3618         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3619       }
3620     }
3621   }
3622   if (flag & SOR_EISENSTAT) {
3623     /*
3624           Apply  (U + D)^-1  where D is now the block diagonal
3625     */
3626     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3627     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3628       ibdiag -= sizes[i]*sizes[i];
3629       sz      = ii[row+1] - diag[row] - 1;
3630       v1      = a->a + diag[row] + 1;
3631       idx     = a->j + diag[row] + 1;
3632       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3633       switch (sizes[i]) {
3634       case 1:
3635 
3636         sum1 = b[row];
3637         for (n = 0; n<sz-1; n+=2) {
3638           i1    = idx[0];
3639           i2    = idx[1];
3640           idx  += 2;
3641           tmp0  = x[i1];
3642           tmp1  = x[i2];
3643           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3644         }
3645 
3646         if (n == sz-1) {
3647           tmp0  = x[*idx];
3648           sum1 -= *v1*tmp0;
3649         }
3650         x[row] = sum1*(*ibdiag);row--;
3651         break;
3652 
3653       case 2:
3654 
3655         sum1 = b[row];
3656         sum2 = b[row-1];
3657         /* note that sum1 is associated with the second of the two rows */
3658         v2 = a->a + diag[row-1] + 2;
3659         for (n = 0; n<sz-1; n+=2) {
3660           i1    = idx[0];
3661           i2    = idx[1];
3662           idx  += 2;
3663           tmp0  = x[i1];
3664           tmp1  = x[i2];
3665           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3666           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3667         }
3668 
3669         if (n == sz-1) {
3670           tmp0  = x[*idx];
3671           sum1 -= *v1*tmp0;
3672           sum2 -= *v2*tmp0;
3673         }
3674         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3675         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3676         row     -= 2;
3677         break;
3678       case 3:
3679 
3680         sum1 = b[row];
3681         sum2 = b[row-1];
3682         sum3 = b[row-2];
3683         v2   = a->a + diag[row-1] + 2;
3684         v3   = a->a + diag[row-2] + 3;
3685         for (n = 0; n<sz-1; n+=2) {
3686           i1    = idx[0];
3687           i2    = idx[1];
3688           idx  += 2;
3689           tmp0  = x[i1];
3690           tmp1  = x[i2];
3691           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3692           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3693           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3694         }
3695 
3696         if (n == sz-1) {
3697           tmp0  = x[*idx];
3698           sum1 -= *v1*tmp0;
3699           sum2 -= *v2*tmp0;
3700           sum3 -= *v3*tmp0;
3701         }
3702         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3703         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3704         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3705         row     -= 3;
3706         break;
3707       case 4:
3708 
3709         sum1 = b[row];
3710         sum2 = b[row-1];
3711         sum3 = b[row-2];
3712         sum4 = b[row-3];
3713         v2   = a->a + diag[row-1] + 2;
3714         v3   = a->a + diag[row-2] + 3;
3715         v4   = a->a + diag[row-3] + 4;
3716         for (n = 0; n<sz-1; n+=2) {
3717           i1    = idx[0];
3718           i2    = idx[1];
3719           idx  += 2;
3720           tmp0  = x[i1];
3721           tmp1  = x[i2];
3722           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3723           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3724           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3725           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3726         }
3727 
3728         if (n == sz-1) {
3729           tmp0  = x[*idx];
3730           sum1 -= *v1*tmp0;
3731           sum2 -= *v2*tmp0;
3732           sum3 -= *v3*tmp0;
3733           sum4 -= *v4*tmp0;
3734         }
3735         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3736         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3737         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3738         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3739         row     -= 4;
3740         break;
3741       case 5:
3742 
3743         sum1 = b[row];
3744         sum2 = b[row-1];
3745         sum3 = b[row-2];
3746         sum4 = b[row-3];
3747         sum5 = b[row-4];
3748         v2   = a->a + diag[row-1] + 2;
3749         v3   = a->a + diag[row-2] + 3;
3750         v4   = a->a + diag[row-3] + 4;
3751         v5   = a->a + diag[row-4] + 5;
3752         for (n = 0; n<sz-1; n+=2) {
3753           i1    = idx[0];
3754           i2    = idx[1];
3755           idx  += 2;
3756           tmp0  = x[i1];
3757           tmp1  = x[i2];
3758           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3759           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3760           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3761           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3762           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3763         }
3764 
3765         if (n == sz-1) {
3766           tmp0  = x[*idx];
3767           sum1 -= *v1*tmp0;
3768           sum2 -= *v2*tmp0;
3769           sum3 -= *v3*tmp0;
3770           sum4 -= *v4*tmp0;
3771           sum5 -= *v5*tmp0;
3772         }
3773         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3774         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3775         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3776         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3777         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3778         row     -= 5;
3779         break;
3780       default:
3781         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3782       }
3783     }
3784     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3785 
3786     /*
3787            t = b - D x    where D is the block diagonal
3788     */
3789     cnt = 0;
3790     for (i=0, row=0; i<m; i++) {
3791       switch (sizes[i]) {
3792       case 1:
3793         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3794         break;
3795       case 2:
3796         x1       = x[row]; x2 = x[row+1];
3797         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3798         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3799         t[row]   = b[row] - tmp1;
3800         t[row+1] = b[row+1] - tmp2; row += 2;
3801         cnt     += 4;
3802         break;
3803       case 3:
3804         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3805         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3806         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3807         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3808         t[row]   = b[row] - tmp1;
3809         t[row+1] = b[row+1] - tmp2;
3810         t[row+2] = b[row+2] - tmp3; row += 3;
3811         cnt     += 9;
3812         break;
3813       case 4:
3814         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3815         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3816         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3817         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3818         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3819         t[row]   = b[row] - tmp1;
3820         t[row+1] = b[row+1] - tmp2;
3821         t[row+2] = b[row+2] - tmp3;
3822         t[row+3] = b[row+3] - tmp4; row += 4;
3823         cnt     += 16;
3824         break;
3825       case 5:
3826         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3827         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3828         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3829         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3830         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3831         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3832         t[row]   = b[row] - tmp1;
3833         t[row+1] = b[row+1] - tmp2;
3834         t[row+2] = b[row+2] - tmp3;
3835         t[row+3] = b[row+3] - tmp4;
3836         t[row+4] = b[row+4] - tmp5;row += 5;
3837         cnt     += 25;
3838         break;
3839       default:
3840         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3841       }
3842     }
3843     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3844 
3845 
3846 
3847     /*
3848           Apply (L + D)^-1 where D is the block diagonal
3849     */
3850     for (i=0, row=0; i<m; i++) {
3851       sz  = diag[row] - ii[row];
3852       v1  = a->a + ii[row];
3853       idx = a->j + ii[row];
3854       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3855       switch (sizes[i]) {
3856       case 1:
3857 
3858         sum1 = t[row];
3859         for (n = 0; n<sz-1; n+=2) {
3860           i1    = idx[0];
3861           i2    = idx[1];
3862           idx  += 2;
3863           tmp0  = t[i1];
3864           tmp1  = t[i2];
3865           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3866         }
3867 
3868         if (n == sz-1) {
3869           tmp0  = t[*idx];
3870           sum1 -= *v1 * tmp0;
3871         }
3872         x[row] += t[row] = sum1*(*ibdiag++); row++;
3873         break;
3874       case 2:
3875         v2   = a->a + ii[row+1];
3876         sum1 = t[row];
3877         sum2 = t[row+1];
3878         for (n = 0; n<sz-1; n+=2) {
3879           i1    = idx[0];
3880           i2    = idx[1];
3881           idx  += 2;
3882           tmp0  = t[i1];
3883           tmp1  = t[i2];
3884           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3885           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3886         }
3887 
3888         if (n == sz-1) {
3889           tmp0  = t[*idx];
3890           sum1 -= v1[0] * tmp0;
3891           sum2 -= v2[0] * tmp0;
3892         }
3893         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3894         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3895         ibdiag   += 4; row += 2;
3896         break;
3897       case 3:
3898         v2   = a->a + ii[row+1];
3899         v3   = a->a + ii[row+2];
3900         sum1 = t[row];
3901         sum2 = t[row+1];
3902         sum3 = t[row+2];
3903         for (n = 0; n<sz-1; n+=2) {
3904           i1    = idx[0];
3905           i2    = idx[1];
3906           idx  += 2;
3907           tmp0  = t[i1];
3908           tmp1  = t[i2];
3909           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3910           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3911           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3912         }
3913 
3914         if (n == sz-1) {
3915           tmp0  = t[*idx];
3916           sum1 -= v1[0] * tmp0;
3917           sum2 -= v2[0] * tmp0;
3918           sum3 -= v3[0] * tmp0;
3919         }
3920         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3921         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3922         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3923         ibdiag   += 9; row += 3;
3924         break;
3925       case 4:
3926         v2   = a->a + ii[row+1];
3927         v3   = a->a + ii[row+2];
3928         v4   = a->a + ii[row+3];
3929         sum1 = t[row];
3930         sum2 = t[row+1];
3931         sum3 = t[row+2];
3932         sum4 = t[row+3];
3933         for (n = 0; n<sz-1; n+=2) {
3934           i1    = idx[0];
3935           i2    = idx[1];
3936           idx  += 2;
3937           tmp0  = t[i1];
3938           tmp1  = t[i2];
3939           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3940           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3941           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3942           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3943         }
3944 
3945         if (n == sz-1) {
3946           tmp0  = t[*idx];
3947           sum1 -= v1[0] * tmp0;
3948           sum2 -= v2[0] * tmp0;
3949           sum3 -= v3[0] * tmp0;
3950           sum4 -= v4[0] * tmp0;
3951         }
3952         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3953         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3954         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3955         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3956         ibdiag   += 16; row += 4;
3957         break;
3958       case 5:
3959         v2   = a->a + ii[row+1];
3960         v3   = a->a + ii[row+2];
3961         v4   = a->a + ii[row+3];
3962         v5   = a->a + ii[row+4];
3963         sum1 = t[row];
3964         sum2 = t[row+1];
3965         sum3 = t[row+2];
3966         sum4 = t[row+3];
3967         sum5 = t[row+4];
3968         for (n = 0; n<sz-1; n+=2) {
3969           i1    = idx[0];
3970           i2    = idx[1];
3971           idx  += 2;
3972           tmp0  = t[i1];
3973           tmp1  = t[i2];
3974           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3975           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3976           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3977           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3978           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3979         }
3980 
3981         if (n == sz-1) {
3982           tmp0  = t[*idx];
3983           sum1 -= v1[0] * tmp0;
3984           sum2 -= v2[0] * tmp0;
3985           sum3 -= v3[0] * tmp0;
3986           sum4 -= v4[0] * tmp0;
3987           sum5 -= v5[0] * tmp0;
3988         }
3989         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3990         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3991         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3992         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3993         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3994         ibdiag   += 25; row += 5;
3995         break;
3996       default:
3997         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3998       }
3999     }
4000     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
4001   }
4002   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4003   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4004   PetscFunctionReturn(0);
4005 }
4006 
4007 #undef __FUNCT__
4008 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
4009 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4010 {
4011   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4012   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4013   const MatScalar   *bdiag = a->inode.bdiag;
4014   const PetscScalar *b;
4015   PetscErrorCode    ierr;
4016   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4017   const PetscInt    *sizes = a->inode.size;
4018 
4019   PetscFunctionBegin;
4020   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4021   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4022   cnt  = 0;
4023   for (i=0, row=0; i<m; i++) {
4024     switch (sizes[i]) {
4025     case 1:
4026       x[row] = b[row]*bdiag[cnt++];row++;
4027       break;
4028     case 2:
4029       x1       = b[row]; x2 = b[row+1];
4030       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4031       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4032       x[row++] = tmp1;
4033       x[row++] = tmp2;
4034       cnt     += 4;
4035       break;
4036     case 3:
4037       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4038       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4039       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4040       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4041       x[row++] = tmp1;
4042       x[row++] = tmp2;
4043       x[row++] = tmp3;
4044       cnt     += 9;
4045       break;
4046     case 4:
4047       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4048       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4049       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4050       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4051       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4052       x[row++] = tmp1;
4053       x[row++] = tmp2;
4054       x[row++] = tmp3;
4055       x[row++] = tmp4;
4056       cnt     += 16;
4057       break;
4058     case 5:
4059       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4060       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4061       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4062       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4063       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4064       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4065       x[row++] = tmp1;
4066       x[row++] = tmp2;
4067       x[row++] = tmp3;
4068       x[row++] = tmp4;
4069       x[row++] = tmp5;
4070       cnt     += 25;
4071       break;
4072     default:
4073       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4074     }
4075   }
4076   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4077   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4078   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4079   PetscFunctionReturn(0);
4080 }
4081 
4082 /*
4083     samestructure indicates that the matrix has not changed its nonzero structure so we
4084     do not need to recompute the inodes
4085 */
4086 #undef __FUNCT__
4087 #define __FUNCT__ "MatSeqAIJCheckInode"
4088 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4089 {
4090   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4091   PetscErrorCode ierr;
4092   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4093   PetscBool      flag;
4094   const PetscInt *idx,*idy,*ii;
4095 
4096   PetscFunctionBegin;
4097   if (!a->inode.use) PetscFunctionReturn(0);
4098   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4099 
4100   m = A->rmap->n;
4101   if (a->inode.size) ns = a->inode.size;
4102   else {
4103     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4104   }
4105 
4106   i          = 0;
4107   node_count = 0;
4108   idx        = a->j;
4109   ii         = a->i;
4110   while (i < m) {                /* For each row */
4111     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4112     /* Limits the number of elements in a node to 'a->inode.limit' */
4113     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4114       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4115       if (nzy != nzx) break;
4116       idy += nzx;              /* Same nonzero pattern */
4117       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4118       if (!flag) break;
4119     }
4120     ns[node_count++] = blk_size;
4121     idx             += blk_size*nzx;
4122     i                = j;
4123   }
4124   /* If not enough inodes found,, do not use inode version of the routines */
4125   if (!m || node_count > .8*m) {
4126     ierr = PetscFree(ns);CHKERRQ(ierr);
4127 
4128     a->inode.node_count       = 0;
4129     a->inode.size             = NULL;
4130     a->inode.use              = PETSC_FALSE;
4131     A->ops->mult              = MatMult_SeqAIJ;
4132     A->ops->sor               = MatSOR_SeqAIJ;
4133     A->ops->multadd           = MatMultAdd_SeqAIJ;
4134     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4135     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4136     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4137     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4138     A->ops->coloringpatch     = 0;
4139     A->ops->multdiagonalblock = 0;
4140 
4141     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4142   } else {
4143     if (!A->factortype) {
4144       A->ops->mult              = MatMult_SeqAIJ_Inode;
4145       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4146       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4147       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4148       if (A->rmap->n == A->cmap->n) {
4149         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4150         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4151         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4152         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4153         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4154       }
4155     } else {
4156       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4157     }
4158     a->inode.node_count = node_count;
4159     a->inode.size       = ns;
4160     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4161   }
4162   a->inode.checked          = PETSC_TRUE;
4163   a->inode.mat_nonzerostate = A->nonzerostate;
4164   PetscFunctionReturn(0);
4165 }
4166 
4167 #undef __FUNCT__
4168 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4169 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4170 {
4171   Mat            B =*C;
4172   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4173   PetscErrorCode ierr;
4174   PetscInt       m=A->rmap->n;
4175 
4176   PetscFunctionBegin;
4177   c->inode.use       = a->inode.use;
4178   c->inode.limit     = a->inode.limit;
4179   c->inode.max_limit = a->inode.max_limit;
4180   if (a->inode.size) {
4181     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4182     c->inode.node_count = a->inode.node_count;
4183     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4184     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4185     if (!B->factortype) {
4186       B->ops->mult              = MatMult_SeqAIJ_Inode;
4187       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4188       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4189       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4190       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4191       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4192       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4193       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4194       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4195     } else {
4196       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4197     }
4198   } else {
4199     c->inode.size       = 0;
4200     c->inode.node_count = 0;
4201   }
4202   c->inode.ibdiagvalid = PETSC_FALSE;
4203   c->inode.ibdiag      = 0;
4204   c->inode.bdiag       = 0;
4205   PetscFunctionReturn(0);
4206 }
4207 
4208 #undef __FUNCT__
4209 #define __FUNCT__ "MatGetRow_FactoredLU"
4210 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4211 {
4212   PetscInt       k;
4213   const PetscInt *vi;
4214 
4215   PetscFunctionBegin;
4216   vi = aj + ai[row];
4217   for (k=0; k<nzl; k++) cols[k] = vi[k];
4218   vi        = aj + adiag[row];
4219   cols[nzl] = vi[0];
4220   vi        = aj + adiag[row+1]+1;
4221   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4222   PetscFunctionReturn(0);
4223 }
4224 /*
4225    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4226    Modified from MatSeqAIJCheckInode().
4227 
4228    Input Parameters:
4229 .  Mat A - ILU or LU matrix factor
4230 
4231 */
4232 #undef __FUNCT__
4233 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU"
4234 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4235 {
4236   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4237   PetscErrorCode ierr;
4238   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4239   PetscInt       *cols1,*cols2,*ns;
4240   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4241   PetscBool      flag;
4242 
4243   PetscFunctionBegin;
4244   if (!a->inode.use)    PetscFunctionReturn(0);
4245   if (a->inode.checked) PetscFunctionReturn(0);
4246 
4247   m = A->rmap->n;
4248   if (a->inode.size) ns = a->inode.size;
4249   else {
4250     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4251   }
4252 
4253   i          = 0;
4254   node_count = 0;
4255   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4256   while (i < m) {                /* For each row */
4257     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4258     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4259     nzx  = nzl1 + nzu1 + 1;
4260     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4261 
4262     /* Limits the number of elements in a node to 'a->inode.limit' */
4263     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4264       nzl2 = ai[j+1] - ai[j];
4265       nzu2 = adiag[j] - adiag[j+1] - 1;
4266       nzy  = nzl2 + nzu2 + 1;
4267       if (nzy != nzx) break;
4268       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4269       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4270       if (!flag) break;
4271     }
4272     ns[node_count++] = blk_size;
4273     i                = j;
4274   }
4275   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4276   /* If not enough inodes found,, do not use inode version of the routines */
4277   if (!m || node_count > .8*m) {
4278     ierr = PetscFree(ns);CHKERRQ(ierr);
4279 
4280     a->inode.node_count = 0;
4281     a->inode.size       = NULL;
4282     a->inode.use        = PETSC_FALSE;
4283 
4284     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4285   } else {
4286     A->ops->mult              = 0;
4287     A->ops->sor               = 0;
4288     A->ops->multadd           = 0;
4289     A->ops->getrowij          = 0;
4290     A->ops->restorerowij      = 0;
4291     A->ops->getcolumnij       = 0;
4292     A->ops->restorecolumnij   = 0;
4293     A->ops->coloringpatch     = 0;
4294     A->ops->multdiagonalblock = 0;
4295     a->inode.node_count       = node_count;
4296     a->inode.size             = ns;
4297 
4298     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4299   }
4300   a->inode.checked = PETSC_TRUE;
4301   PetscFunctionReturn(0);
4302 }
4303 
4304 #undef __FUNCT__
4305 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4306 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4307 {
4308   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4309 
4310   PetscFunctionBegin;
4311   a->inode.ibdiagvalid = PETSC_FALSE;
4312   PetscFunctionReturn(0);
4313 }
4314 
4315 /*
4316      This is really ugly. if inodes are used this replaces the
4317   permutations with ones that correspond to rows/cols of the matrix
4318   rather then inode blocks
4319 */
4320 #undef __FUNCT__
4321 #define __FUNCT__ "MatInodeAdjustForInodes"
4322 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4323 {
4324   PetscErrorCode ierr;
4325 
4326   PetscFunctionBegin;
4327   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4328   PetscFunctionReturn(0);
4329 }
4330 
4331 #undef __FUNCT__
4332 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4333 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4334 {
4335   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4336   PetscErrorCode ierr;
4337   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4338   const PetscInt *ridx,*cidx;
4339   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4340   PetscInt       nslim_col,*ns_col;
4341   IS             ris = *rperm,cis = *cperm;
4342 
4343   PetscFunctionBegin;
4344   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4345   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4346 
4347   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4348   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4349   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4350 
4351   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4352   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4353 
4354   /* Form the inode structure for the rows of permuted matric using inv perm*/
4355   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4356 
4357   /* Construct the permutations for rows*/
4358   for (i=0,row = 0; i<nslim_row; ++i) {
4359     indx      = ridx[i];
4360     start_val = tns[indx];
4361     end_val   = tns[indx + 1];
4362     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4363   }
4364 
4365   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4366   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4367 
4368   /* Construct permutations for columns */
4369   for (i=0,col=0; i<nslim_col; ++i) {
4370     indx      = cidx[i];
4371     start_val = tns[indx];
4372     end_val   = tns[indx + 1];
4373     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4374   }
4375 
4376   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4377   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4378   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4379   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4380 
4381   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4382   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4383 
4384   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4385   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4386   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4387   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4388   ierr = PetscFree(tns);CHKERRQ(ierr);
4389   PetscFunctionReturn(0);
4390 }
4391 
4392 #undef __FUNCT__
4393 #define __FUNCT__ "MatInodeGetInodeSizes"
4394 /*@C
4395    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4396 
4397    Not Collective
4398 
4399    Input Parameter:
4400 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4401 
4402    Output Parameter:
4403 +  node_count - no of inodes present in the matrix.
4404 .  sizes      - an array of size node_count,with sizes of each inode.
4405 -  limit      - the max size used to generate the inodes.
4406 
4407    Level: advanced
4408 
4409    Notes: This routine returns some internal storage information
4410    of the matrix, it is intended to be used by advanced users.
4411    It should be called after the matrix is assembled.
4412    The contents of the sizes[] array should not be changed.
4413    NULL may be passed for information not requested.
4414 
4415 .keywords: matrix, seqaij, get, inode
4416 
4417 .seealso: MatGetInfo()
4418 @*/
4419 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4420 {
4421   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4422 
4423   PetscFunctionBegin;
4424   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4425   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4426   if (f) {
4427     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4428   }
4429   PetscFunctionReturn(0);
4430 }
4431 
4432 #undef __FUNCT__
4433 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4434 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4435 {
4436   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4437 
4438   PetscFunctionBegin;
4439   if (node_count) *node_count = a->inode.node_count;
4440   if (sizes)      *sizes      = a->inode.size;
4441   if (limit)      *limit      = a->inode.limit;
4442   PetscFunctionReturn(0);
4443 }
4444