xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 4fc747eaadbeca11629f314a99edccbc2ed7b3d3) !
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1) {
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     if (j==jmax) continue; /* empty row */
101     col  = *j++ + ishift;
102     i2   = tvc[col];
103     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
104       ia[i1+1]++;
105       ia[i2+1]++;
106       i2++;                     /* Start col of next node */
107       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
108       i2 = tvc[col];
109     }
110     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
111   }
112 
113   /* shift ia[i] to point to next row */
114   for (i1=1; i1<nslim_row+1; i1++) {
115     row        = ia[i1-1];
116     ia[i1]    += row;
117     work[i1-1] = row - oshift;
118   }
119 
120   /* allocate space for column pointers */
121   nz   = ia[nslim_row] + (!ishift);
122   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
123   *jja = ja;
124 
125   /* loop over lower triangular part putting into ja */
126   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
127     j    = aj + ai[row] + ishift;
128     jmax = aj + ai[row+1] + ishift;
129     if (j==jmax) continue; /* empty row */
130     col  = *j++ + ishift;
131     i2   = tvc[col];
132     while (i2<i1 && j<jmax) {
133       ja[work[i2]++] = i1 + oshift;
134       ja[work[i1]++] = i2 + oshift;
135       ++i2;
136       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
137       i2 = tvc[col];
138     }
139     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
140 
141   }
142   ierr = PetscFree(work);CHKERRQ(ierr);
143   ierr = PetscFree(tns);CHKERRQ(ierr);
144   ierr = PetscFree(tvc);CHKERRQ(ierr);
145   PetscFunctionReturn(0);
146 }
147 
148 /*
149       This builds nonsymmetric version of nonzero structure,
150 */
151 #undef __FUNCT__
152 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
153 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
154 {
155   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
156   PetscErrorCode ierr;
157   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
158   PetscInt       *tns,*tvc,nsz,i1,i2;
159   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1) {
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
176   }
177   /* allocate space for row pointers */
178   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
179   *iia = ia;
180   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
181   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
182 
183   /* determine the number of columns in each row */
184   ia[0] = oshift;
185   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
186     j   = aj + ai[row] + ishift;
187     nz  = ai[row+1] - ai[row];
188     if (!nz) continue; /* empty row */
189     col = *j++ + ishift;
190     i2  = tvc[col];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211   /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     nz  = ai[row+1] - ai[row];
215     if (!nz) continue; /* empty row */
216     col = *j++ + ishift;
217     i2  = tvc[col];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
235 {
236   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267   PetscFunctionReturn(0);
268 }
269 
270 /* ----------------------------------------------------------- */
271 
272 #undef __FUNCT__
273 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
274 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
275 {
276   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
277   PetscErrorCode ierr;
278   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
279   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
280 
281   PetscFunctionBegin;
282   nslim_row = a->inode.node_count;
283   n         = A->cmap->n;
284 
285   /* Create The column_inode for this matrix */
286   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
287 
288   /* allocate space for reformated column_inode structure */
289   ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr);
290   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
291   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
292 
293   for (i1=0,col=0; i1<nslim_col; ++i1) {
294     nsz = ns_col[i1];
295     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
296   }
297   /* allocate space for column pointers */
298   ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
299   *iia = ia;
300   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
301   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
302 
303   /* determine the number of columns in each row */
304   ia[0] = oshift;
305   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
306     j   = aj + ai[row] + ishift;
307     col = *j++ + ishift;
308     i2  = tvc[col];
309     nz  = ai[row+1] - ai[row];
310     while (nz-- > 0) {           /* off-diagonal elemets */
311       /* ia[i1+1]++; */
312       ia[i2+1]++;
313       i2++;
314       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
315       if (nz > 0) i2 = tvc[col];
316     }
317   }
318 
319   /* shift ia[i] to point to next col */
320   for (i1=1; i1<nslim_col+1; i1++) {
321     col        = ia[i1-1];
322     ia[i1]    += col;
323     work[i1-1] = col - oshift;
324   }
325 
326   /* allocate space for column pointers */
327   nz   = ia[nslim_col] + (!ishift);
328   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
329   *jja = ja;
330 
331   /* loop over matrix putting into ja */
332   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
333     j   = aj + ai[row] + ishift;
334     col = *j++ + ishift;
335     i2  = tvc[col];
336     nz  = ai[row+1] - ai[row];
337     while (nz-- > 0) {
338       /* ja[work[i1]++] = i2 + oshift; */
339       ja[work[i2]++] = i1 + oshift;
340       i2++;
341       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
342       if (nz > 0) i2 = tvc[col];
343     }
344   }
345   ierr = PetscFree(ns_col);CHKERRQ(ierr);
346   ierr = PetscFree(work);CHKERRQ(ierr);
347   ierr = PetscFree(tns);CHKERRQ(ierr);
348   ierr = PetscFree(tvc);CHKERRQ(ierr);
349   PetscFunctionReturn(0);
350 }
351 
352 #undef __FUNCT__
353 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
354 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
355 {
356   PetscErrorCode ierr;
357 
358   PetscFunctionBegin;
359   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
360   if (!ia) PetscFunctionReturn(0);
361 
362   if (!blockcompressed) {
363     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
364   } else if (symmetric) {
365     /* Since the indices are symmetric it does'nt matter */
366     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
367   } else {
368     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
369   }
370   PetscFunctionReturn(0);
371 }
372 
373 #undef __FUNCT__
374 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
375 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
376 {
377   PetscErrorCode ierr;
378 
379   PetscFunctionBegin;
380   if (!ia) PetscFunctionReturn(0);
381   if (!blockcompressed) {
382     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
383   } else {
384     ierr = PetscFree(*ia);CHKERRQ(ierr);
385     ierr = PetscFree(*ja);CHKERRQ(ierr);
386   }
387   PetscFunctionReturn(0);
388 }
389 
390 /* ----------------------------------------------------------- */
391 
392 #undef __FUNCT__
393 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
394 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
395 {
396   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
397   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
398   PetscScalar       *y;
399   const PetscScalar *x;
400   const MatScalar   *v1,*v2,*v3,*v4,*v5;
401   PetscErrorCode    ierr;
402   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
403   const PetscInt    *idx,*ns,*ii;
404 
405 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
406 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
407 #endif
408 
409   PetscFunctionBegin;
410   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
411   node_max = a->inode.node_count;
412   ns       = a->inode.size;     /* Node Size array */
413   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
414   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
415   idx      = a->j;
416   v1       = a->a;
417   ii       = a->i;
418 
419   for (i = 0,row = 0; i< node_max; ++i) {
420     nsz         = ns[i];
421     n           = ii[1] - ii[0];
422     nonzerorow += (n>0)*nsz;
423     ii         += nsz;
424     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
425     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
426     sz = n;                     /* No of non zeros in this row */
427                                 /* Switch on the size of Node */
428     switch (nsz) {               /* Each loop in 'case' is unrolled */
429     case 1:
430       sum1 = 0.;
431 
432       for (n = 0; n< sz-1; n+=2) {
433         i1    = idx[0];         /* The instructions are ordered to */
434         i2    = idx[1];         /* make the compiler's job easy */
435         idx  += 2;
436         tmp0  = x[i1];
437         tmp1  = x[i2];
438         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439       }
440 
441       if (n == sz-1) {          /* Take care of the last nonzero  */
442         tmp0  = x[*idx++];
443         sum1 += *v1++ *tmp0;
444       }
445       y[row++]=sum1;
446       break;
447     case 2:
448       sum1 = 0.;
449       sum2 = 0.;
450       v2   = v1 + n;
451 
452       for (n = 0; n< sz-1; n+=2) {
453         i1    = idx[0];
454         i2    = idx[1];
455         idx  += 2;
456         tmp0  = x[i1];
457         tmp1  = x[i2];
458         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460       }
461       if (n == sz-1) {
462         tmp0  = x[*idx++];
463         sum1 += *v1++ * tmp0;
464         sum2 += *v2++ * tmp0;
465       }
466       y[row++]=sum1;
467       y[row++]=sum2;
468       v1      =v2;              /* Since the next block to be processed starts there*/
469       idx    +=sz;
470       break;
471     case 3:
472       sum1 = 0.;
473       sum2 = 0.;
474       sum3 = 0.;
475       v2   = v1 + n;
476       v3   = v2 + n;
477 
478       for (n = 0; n< sz-1; n+=2) {
479         i1    = idx[0];
480         i2    = idx[1];
481         idx  += 2;
482         tmp0  = x[i1];
483         tmp1  = x[i2];
484         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487       }
488       if (n == sz-1) {
489         tmp0  = x[*idx++];
490         sum1 += *v1++ * tmp0;
491         sum2 += *v2++ * tmp0;
492         sum3 += *v3++ * tmp0;
493       }
494       y[row++]=sum1;
495       y[row++]=sum2;
496       y[row++]=sum3;
497       v1      =v3;              /* Since the next block to be processed starts there*/
498       idx    +=2*sz;
499       break;
500     case 4:
501       sum1 = 0.;
502       sum2 = 0.;
503       sum3 = 0.;
504       sum4 = 0.;
505       v2   = v1 + n;
506       v3   = v2 + n;
507       v4   = v3 + n;
508 
509       for (n = 0; n< sz-1; n+=2) {
510         i1    = idx[0];
511         i2    = idx[1];
512         idx  += 2;
513         tmp0  = x[i1];
514         tmp1  = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519       }
520       if (n == sz-1) {
521         tmp0  = x[*idx++];
522         sum1 += *v1++ * tmp0;
523         sum2 += *v2++ * tmp0;
524         sum3 += *v3++ * tmp0;
525         sum4 += *v4++ * tmp0;
526       }
527       y[row++]=sum1;
528       y[row++]=sum2;
529       y[row++]=sum3;
530       y[row++]=sum4;
531       v1      =v4;              /* Since the next block to be processed starts there*/
532       idx    +=3*sz;
533       break;
534     case 5:
535       sum1 = 0.;
536       sum2 = 0.;
537       sum3 = 0.;
538       sum4 = 0.;
539       sum5 = 0.;
540       v2   = v1 + n;
541       v3   = v2 + n;
542       v4   = v3 + n;
543       v5   = v4 + n;
544 
545       for (n = 0; n<sz-1; n+=2) {
546         i1    = idx[0];
547         i2    = idx[1];
548         idx  += 2;
549         tmp0  = x[i1];
550         tmp1  = x[i2];
551         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556       }
557       if (n == sz-1) {
558         tmp0  = x[*idx++];
559         sum1 += *v1++ * tmp0;
560         sum2 += *v2++ * tmp0;
561         sum3 += *v3++ * tmp0;
562         sum4 += *v4++ * tmp0;
563         sum5 += *v5++ * tmp0;
564       }
565       y[row++]=sum1;
566       y[row++]=sum2;
567       y[row++]=sum3;
568       y[row++]=sum4;
569       y[row++]=sum5;
570       v1      =v5;       /* Since the next block to be processed starts there */
571       idx    +=4*sz;
572       break;
573     default:
574       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
575     }
576   }
577   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
578   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
579   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
580   PetscFunctionReturn(0);
581 }
582 /* ----------------------------------------------------------- */
583 /* Almost same code as the MatMult_SeqAIJ_Inode() */
584 #undef __FUNCT__
585 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
586 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587 {
588   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
589   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590   const MatScalar   *v1,*v2,*v3,*v4,*v5;
591   const PetscScalar *x;
592   PetscScalar       *y,*z,*zt;
593   PetscErrorCode    ierr;
594   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
595   const PetscInt    *idx,*ns,*ii;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601 
602   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
603   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
604   zt = z;
605 
606   idx = a->j;
607   v1  = a->a;
608   ii  = a->i;
609 
610   for (i = 0,row = 0; i< node_max; ++i) {
611     nsz = ns[i];
612     n   = ii[1] - ii[0];
613     ii += nsz;
614     sz  = n;                    /* No of non zeros in this row */
615                                 /* Switch on the size of Node */
616     switch (nsz) {               /* Each loop in 'case' is unrolled */
617     case 1:
618       sum1 = *zt++;
619 
620       for (n = 0; n< sz-1; n+=2) {
621         i1    = idx[0];         /* The instructions are ordered to */
622         i2    = idx[1];         /* make the compiler's job easy */
623         idx  += 2;
624         tmp0  = x[i1];
625         tmp1  = x[i2];
626         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627       }
628 
629       if (n   == sz-1) {          /* Take care of the last nonzero  */
630         tmp0  = x[*idx++];
631         sum1 += *v1++ * tmp0;
632       }
633       y[row++]=sum1;
634       break;
635     case 2:
636       sum1 = *zt++;
637       sum2 = *zt++;
638       v2   = v1 + n;
639 
640       for (n = 0; n< sz-1; n+=2) {
641         i1    = idx[0];
642         i2    = idx[1];
643         idx  += 2;
644         tmp0  = x[i1];
645         tmp1  = x[i2];
646         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
647         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
648       }
649       if (n   == sz-1) {
650         tmp0  = x[*idx++];
651         sum1 += *v1++ * tmp0;
652         sum2 += *v2++ * tmp0;
653       }
654       y[row++]=sum1;
655       y[row++]=sum2;
656       v1      =v2;              /* Since the next block to be processed starts there*/
657       idx    +=sz;
658       break;
659     case 3:
660       sum1 = *zt++;
661       sum2 = *zt++;
662       sum3 = *zt++;
663       v2   = v1 + n;
664       v3   = v2 + n;
665 
666       for (n = 0; n< sz-1; n+=2) {
667         i1    = idx[0];
668         i2    = idx[1];
669         idx  += 2;
670         tmp0  = x[i1];
671         tmp1  = x[i2];
672         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
673         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
674         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
675       }
676       if (n == sz-1) {
677         tmp0  = x[*idx++];
678         sum1 += *v1++ * tmp0;
679         sum2 += *v2++ * tmp0;
680         sum3 += *v3++ * tmp0;
681       }
682       y[row++]=sum1;
683       y[row++]=sum2;
684       y[row++]=sum3;
685       v1      =v3;              /* Since the next block to be processed starts there*/
686       idx    +=2*sz;
687       break;
688     case 4:
689       sum1 = *zt++;
690       sum2 = *zt++;
691       sum3 = *zt++;
692       sum4 = *zt++;
693       v2   = v1 + n;
694       v3   = v2 + n;
695       v4   = v3 + n;
696 
697       for (n = 0; n< sz-1; n+=2) {
698         i1    = idx[0];
699         i2    = idx[1];
700         idx  += 2;
701         tmp0  = x[i1];
702         tmp1  = x[i2];
703         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
704         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
705         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
706         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
707       }
708       if (n == sz-1) {
709         tmp0  = x[*idx++];
710         sum1 += *v1++ * tmp0;
711         sum2 += *v2++ * tmp0;
712         sum3 += *v3++ * tmp0;
713         sum4 += *v4++ * tmp0;
714       }
715       y[row++]=sum1;
716       y[row++]=sum2;
717       y[row++]=sum3;
718       y[row++]=sum4;
719       v1      =v4;              /* Since the next block to be processed starts there*/
720       idx    +=3*sz;
721       break;
722     case 5:
723       sum1 = *zt++;
724       sum2 = *zt++;
725       sum3 = *zt++;
726       sum4 = *zt++;
727       sum5 = *zt++;
728       v2   = v1 + n;
729       v3   = v2 + n;
730       v4   = v3 + n;
731       v5   = v4 + n;
732 
733       for (n = 0; n<sz-1; n+=2) {
734         i1    = idx[0];
735         i2    = idx[1];
736         idx  += 2;
737         tmp0  = x[i1];
738         tmp1  = x[i2];
739         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
740         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
741         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
742         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
743         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
744       }
745       if (n   == sz-1) {
746         tmp0  = x[*idx++];
747         sum1 += *v1++ * tmp0;
748         sum2 += *v2++ * tmp0;
749         sum3 += *v3++ * tmp0;
750         sum4 += *v4++ * tmp0;
751         sum5 += *v5++ * tmp0;
752       }
753       y[row++]=sum1;
754       y[row++]=sum2;
755       y[row++]=sum3;
756       y[row++]=sum4;
757       y[row++]=sum5;
758       v1      =v5;       /* Since the next block to be processed starts there */
759       idx    +=4*sz;
760       break;
761     default:
762       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
763     }
764   }
765   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
766   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
767   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
768   PetscFunctionReturn(0);
769 }
770 
771 /* ----------------------------------------------------------- */
772 #undef __FUNCT__
773 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
774 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
775 {
776   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
777   IS                iscol = a->col,isrow = a->row;
778   PetscErrorCode    ierr;
779   const PetscInt    *r,*c,*rout,*cout;
780   PetscInt          i,j,n = A->rmap->n,nz;
781   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
782   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
783   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
784   PetscScalar       sum1,sum2,sum3,sum4,sum5;
785   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
786   const PetscScalar *b;
787 
788   PetscFunctionBegin;
789   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
790   node_max = a->inode.node_count;
791   ns       = a->inode.size;     /* Node Size array */
792 
793   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
794   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
795   tmp  = a->solve_work;
796 
797   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
798   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
799 
800   /* forward solve the lower triangular */
801   tmps = tmp;
802   aa   = a_a;
803   aj   = a_j;
804   ad   = a->diag;
805 
806   for (i = 0,row = 0; i< node_max; ++i) {
807     nsz = ns[i];
808     aii = ai[row];
809     v1  = aa + aii;
810     vi  = aj + aii;
811     nz  = ad[row]- aii;
812     if (i < node_max-1) {
813       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
814       * but our indexing to determine it's size could. */
815       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
816       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
817       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
818       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
819     }
820 
821     switch (nsz) {               /* Each loop in 'case' is unrolled */
822     case 1:
823       sum1 = b[*r++];
824       for (j=0; j<nz-1; j+=2) {
825         i0    = vi[0];
826         i1    = vi[1];
827         vi   +=2;
828         tmp0  = tmps[i0];
829         tmp1  = tmps[i1];
830         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
831       }
832       if (j == nz-1) {
833         tmp0  = tmps[*vi++];
834         sum1 -= *v1++ *tmp0;
835       }
836       tmp[row++]=sum1;
837       break;
838     case 2:
839       sum1 = b[*r++];
840       sum2 = b[*r++];
841       v2   = aa + ai[row+1];
842 
843       for (j=0; j<nz-1; j+=2) {
844         i0    = vi[0];
845         i1    = vi[1];
846         vi   +=2;
847         tmp0  = tmps[i0];
848         tmp1  = tmps[i1];
849         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
850         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
851       }
852       if (j == nz-1) {
853         tmp0  = tmps[*vi++];
854         sum1 -= *v1++ *tmp0;
855         sum2 -= *v2++ *tmp0;
856       }
857       sum2     -= *v2++ *sum1;
858       tmp[row++]=sum1;
859       tmp[row++]=sum2;
860       break;
861     case 3:
862       sum1 = b[*r++];
863       sum2 = b[*r++];
864       sum3 = b[*r++];
865       v2   = aa + ai[row+1];
866       v3   = aa + ai[row+2];
867 
868       for (j=0; j<nz-1; j+=2) {
869         i0    = vi[0];
870         i1    = vi[1];
871         vi   +=2;
872         tmp0  = tmps[i0];
873         tmp1  = tmps[i1];
874         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
875         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
876         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
877       }
878       if (j == nz-1) {
879         tmp0  = tmps[*vi++];
880         sum1 -= *v1++ *tmp0;
881         sum2 -= *v2++ *tmp0;
882         sum3 -= *v3++ *tmp0;
883       }
884       sum2 -= *v2++ * sum1;
885       sum3 -= *v3++ * sum1;
886       sum3 -= *v3++ * sum2;
887 
888       tmp[row++]=sum1;
889       tmp[row++]=sum2;
890       tmp[row++]=sum3;
891       break;
892 
893     case 4:
894       sum1 = b[*r++];
895       sum2 = b[*r++];
896       sum3 = b[*r++];
897       sum4 = b[*r++];
898       v2   = aa + ai[row+1];
899       v3   = aa + ai[row+2];
900       v4   = aa + ai[row+3];
901 
902       for (j=0; j<nz-1; j+=2) {
903         i0    = vi[0];
904         i1    = vi[1];
905         vi   +=2;
906         tmp0  = tmps[i0];
907         tmp1  = tmps[i1];
908         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
909         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
910         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
911         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
912       }
913       if (j == nz-1) {
914         tmp0  = tmps[*vi++];
915         sum1 -= *v1++ *tmp0;
916         sum2 -= *v2++ *tmp0;
917         sum3 -= *v3++ *tmp0;
918         sum4 -= *v4++ *tmp0;
919       }
920       sum2 -= *v2++ * sum1;
921       sum3 -= *v3++ * sum1;
922       sum4 -= *v4++ * sum1;
923       sum3 -= *v3++ * sum2;
924       sum4 -= *v4++ * sum2;
925       sum4 -= *v4++ * sum3;
926 
927       tmp[row++]=sum1;
928       tmp[row++]=sum2;
929       tmp[row++]=sum3;
930       tmp[row++]=sum4;
931       break;
932     case 5:
933       sum1 = b[*r++];
934       sum2 = b[*r++];
935       sum3 = b[*r++];
936       sum4 = b[*r++];
937       sum5 = b[*r++];
938       v2   = aa + ai[row+1];
939       v3   = aa + ai[row+2];
940       v4   = aa + ai[row+3];
941       v5   = aa + ai[row+4];
942 
943       for (j=0; j<nz-1; j+=2) {
944         i0    = vi[0];
945         i1    = vi[1];
946         vi   +=2;
947         tmp0  = tmps[i0];
948         tmp1  = tmps[i1];
949         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
950         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
951         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
952         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
953         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
954       }
955       if (j == nz-1) {
956         tmp0  = tmps[*vi++];
957         sum1 -= *v1++ *tmp0;
958         sum2 -= *v2++ *tmp0;
959         sum3 -= *v3++ *tmp0;
960         sum4 -= *v4++ *tmp0;
961         sum5 -= *v5++ *tmp0;
962       }
963 
964       sum2 -= *v2++ * sum1;
965       sum3 -= *v3++ * sum1;
966       sum4 -= *v4++ * sum1;
967       sum5 -= *v5++ * sum1;
968       sum3 -= *v3++ * sum2;
969       sum4 -= *v4++ * sum2;
970       sum5 -= *v5++ * sum2;
971       sum4 -= *v4++ * sum3;
972       sum5 -= *v5++ * sum3;
973       sum5 -= *v5++ * sum4;
974 
975       tmp[row++]=sum1;
976       tmp[row++]=sum2;
977       tmp[row++]=sum3;
978       tmp[row++]=sum4;
979       tmp[row++]=sum5;
980       break;
981     default:
982       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
983     }
984   }
985   /* backward solve the upper triangular */
986   for (i=node_max -1,row = n-1; i>=0; i--) {
987     nsz = ns[i];
988     aii = ai[row+1] -1;
989     v1  = aa + aii;
990     vi  = aj + aii;
991     nz  = aii- ad[row];
992     switch (nsz) {               /* Each loop in 'case' is unrolled */
993     case 1:
994       sum1 = tmp[row];
995 
996       for (j=nz; j>1; j-=2) {
997         vi   -=2;
998         i0    = vi[2];
999         i1    = vi[1];
1000         tmp0  = tmps[i0];
1001         tmp1  = tmps[i1];
1002         v1   -= 2;
1003         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1004       }
1005       if (j==1) {
1006         tmp0  = tmps[*vi--];
1007         sum1 -= *v1-- * tmp0;
1008       }
1009       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1010       break;
1011     case 2:
1012       sum1 = tmp[row];
1013       sum2 = tmp[row -1];
1014       v2   = aa + ai[row]-1;
1015       for (j=nz; j>1; j-=2) {
1016         vi   -=2;
1017         i0    = vi[2];
1018         i1    = vi[1];
1019         tmp0  = tmps[i0];
1020         tmp1  = tmps[i1];
1021         v1   -= 2;
1022         v2   -= 2;
1023         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1024         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1025       }
1026       if (j==1) {
1027         tmp0  = tmps[*vi--];
1028         sum1 -= *v1-- * tmp0;
1029         sum2 -= *v2-- * tmp0;
1030       }
1031 
1032       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1033       sum2   -= *v2-- * tmp0;
1034       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1035       break;
1036     case 3:
1037       sum1 = tmp[row];
1038       sum2 = tmp[row -1];
1039       sum3 = tmp[row -2];
1040       v2   = aa + ai[row]-1;
1041       v3   = aa + ai[row -1]-1;
1042       for (j=nz; j>1; j-=2) {
1043         vi   -=2;
1044         i0    = vi[2];
1045         i1    = vi[1];
1046         tmp0  = tmps[i0];
1047         tmp1  = tmps[i1];
1048         v1   -= 2;
1049         v2   -= 2;
1050         v3   -= 2;
1051         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1052         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1053         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1054       }
1055       if (j==1) {
1056         tmp0  = tmps[*vi--];
1057         sum1 -= *v1-- * tmp0;
1058         sum2 -= *v2-- * tmp0;
1059         sum3 -= *v3-- * tmp0;
1060       }
1061       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1062       sum2   -= *v2-- * tmp0;
1063       sum3   -= *v3-- * tmp0;
1064       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1065       sum3   -= *v3-- * tmp0;
1066       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1067 
1068       break;
1069     case 4:
1070       sum1 = tmp[row];
1071       sum2 = tmp[row -1];
1072       sum3 = tmp[row -2];
1073       sum4 = tmp[row -3];
1074       v2   = aa + ai[row]-1;
1075       v3   = aa + ai[row -1]-1;
1076       v4   = aa + ai[row -2]-1;
1077 
1078       for (j=nz; j>1; j-=2) {
1079         vi   -=2;
1080         i0    = vi[2];
1081         i1    = vi[1];
1082         tmp0  = tmps[i0];
1083         tmp1  = tmps[i1];
1084         v1   -= 2;
1085         v2   -= 2;
1086         v3   -= 2;
1087         v4   -= 2;
1088         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1089         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1090         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1091         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1092       }
1093       if (j==1) {
1094         tmp0  = tmps[*vi--];
1095         sum1 -= *v1-- * tmp0;
1096         sum2 -= *v2-- * tmp0;
1097         sum3 -= *v3-- * tmp0;
1098         sum4 -= *v4-- * tmp0;
1099       }
1100 
1101       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1102       sum2   -= *v2-- * tmp0;
1103       sum3   -= *v3-- * tmp0;
1104       sum4   -= *v4-- * tmp0;
1105       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1106       sum3   -= *v3-- * tmp0;
1107       sum4   -= *v4-- * tmp0;
1108       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1109       sum4   -= *v4-- * tmp0;
1110       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1111       break;
1112     case 5:
1113       sum1 = tmp[row];
1114       sum2 = tmp[row -1];
1115       sum3 = tmp[row -2];
1116       sum4 = tmp[row -3];
1117       sum5 = tmp[row -4];
1118       v2   = aa + ai[row]-1;
1119       v3   = aa + ai[row -1]-1;
1120       v4   = aa + ai[row -2]-1;
1121       v5   = aa + ai[row -3]-1;
1122       for (j=nz; j>1; j-=2) {
1123         vi   -= 2;
1124         i0    = vi[2];
1125         i1    = vi[1];
1126         tmp0  = tmps[i0];
1127         tmp1  = tmps[i1];
1128         v1   -= 2;
1129         v2   -= 2;
1130         v3   -= 2;
1131         v4   -= 2;
1132         v5   -= 2;
1133         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1134         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1135         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1136         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1137         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1138       }
1139       if (j==1) {
1140         tmp0  = tmps[*vi--];
1141         sum1 -= *v1-- * tmp0;
1142         sum2 -= *v2-- * tmp0;
1143         sum3 -= *v3-- * tmp0;
1144         sum4 -= *v4-- * tmp0;
1145         sum5 -= *v5-- * tmp0;
1146       }
1147 
1148       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1149       sum2   -= *v2-- * tmp0;
1150       sum3   -= *v3-- * tmp0;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1154       sum3   -= *v3-- * tmp0;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1158       sum4   -= *v4-- * tmp0;
1159       sum5   -= *v5-- * tmp0;
1160       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1161       sum5   -= *v5-- * tmp0;
1162       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1163       break;
1164     default:
1165       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1166     }
1167   }
1168   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1169   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1170   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 #undef __FUNCT__
1177 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1178 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1179 {
1180   Mat             C     =B;
1181   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1182   IS              isrow = b->row,isicol = b->icol;
1183   PetscErrorCode  ierr;
1184   const PetscInt  *r,*ic,*ics;
1185   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1186   PetscInt        i,j,k,nz,nzL,row,*pj;
1187   const PetscInt  *ajtmp,*bjtmp;
1188   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1189   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1190   FactorShiftCtx  sctx;
1191   const PetscInt  *ddiag;
1192   PetscReal       rs;
1193   MatScalar       d;
1194   PetscInt        inod,nodesz,node_max,col;
1195   const PetscInt  *ns;
1196   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1197 
1198   PetscFunctionBegin;
1199   /* MatPivotSetUp(): initialize shift context sctx */
1200   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1201 
1202   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1203     ddiag          = a->diag;
1204     sctx.shift_top = info->zeropivot;
1205     for (i=0; i<n; i++) {
1206       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1207       d  = (aa)[ddiag[i]];
1208       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1209       v  = aa+ai[i];
1210       nz = ai[i+1] - ai[i];
1211       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1212       if (rs>sctx.shift_top) sctx.shift_top = rs;
1213     }
1214     sctx.shift_top *= 1.1;
1215     sctx.nshift_max = 5;
1216     sctx.shift_lo   = 0.;
1217     sctx.shift_hi   = 1.;
1218   }
1219 
1220   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1221   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1222 
1223   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1224   ics   = ic;
1225 
1226   node_max = a->inode.node_count;
1227   ns       = a->inode.size;
1228   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1229 
1230   /* If max inode size > 4, split it into two inodes.*/
1231   /* also map the inode sizes according to the ordering */
1232   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1233   for (i=0,j=0; i<node_max; ++i,++j) {
1234     if (ns[i] > 4) {
1235       tmp_vec1[j] = 4;
1236       ++j;
1237       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1238     } else {
1239       tmp_vec1[j] = ns[i];
1240     }
1241   }
1242   /* Use the correct node_max */
1243   node_max = j;
1244 
1245   /* Now reorder the inode info based on mat re-ordering info */
1246   /* First create a row -> inode_size_array_index map */
1247   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1248   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1249   for (i=0,row=0; i<node_max; i++) {
1250     nodesz = tmp_vec1[i];
1251     for (j=0; j<nodesz; j++,row++) {
1252       nsmap[row] = i;
1253     }
1254   }
1255   /* Using nsmap, create a reordered ns structure */
1256   for (i=0,j=0; i< node_max; i++) {
1257     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1258     tmp_vec2[i] = nodesz;
1259     j          += nodesz;
1260   }
1261   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1262   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1263 
1264   /* Now use the correct ns */
1265   ns = tmp_vec2;
1266 
1267   do {
1268     sctx.newshift = PETSC_FALSE;
1269     /* Now loop over each block-row, and do the factorization */
1270     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1271       nodesz = ns[inod];
1272 
1273       switch (nodesz) {
1274       case 1:
1275         /*----------*/
1276         /* zero rtmp1 */
1277         /* L part */
1278         nz    = bi[i+1] - bi[i];
1279         bjtmp = bj + bi[i];
1280         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1281 
1282         /* U part */
1283         nz    = bdiag[i]-bdiag[i+1];
1284         bjtmp = bj + bdiag[i+1]+1;
1285         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1286 
1287         /* load in initial (unfactored row) */
1288         nz    = ai[r[i]+1] - ai[r[i]];
1289         ajtmp = aj + ai[r[i]];
1290         v     = aa + ai[r[i]];
1291         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1292 
1293         /* ZeropivotApply() */
1294         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1295 
1296         /* elimination */
1297         bjtmp = bj + bi[i];
1298         row   = *bjtmp++;
1299         nzL   = bi[i+1] - bi[i];
1300         for (k=0; k < nzL; k++) {
1301           pc = rtmp1 + row;
1302           if (*pc != 0.0) {
1303             pv   = b->a + bdiag[row];
1304             mul1 = *pc * (*pv);
1305             *pc  = mul1;
1306             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1307             pv   = b->a + bdiag[row+1]+1;
1308             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1309             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1310             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1311           }
1312           row = *bjtmp++;
1313         }
1314 
1315         /* finished row so stick it into b->a */
1316         rs = 0.0;
1317         /* L part */
1318         pv = b->a + bi[i];
1319         pj = b->j + bi[i];
1320         nz = bi[i+1] - bi[i];
1321         for (j=0; j<nz; j++) {
1322           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1323         }
1324 
1325         /* U part */
1326         pv = b->a + bdiag[i+1]+1;
1327         pj = b->j + bdiag[i+1]+1;
1328         nz = bdiag[i] - bdiag[i+1]-1;
1329         for (j=0; j<nz; j++) {
1330           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1331         }
1332 
1333         /* Check zero pivot */
1334         sctx.rs = rs;
1335         sctx.pv = rtmp1[i];
1336         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1337         if (sctx.newshift) break;
1338 
1339         /* Mark diagonal and invert diagonal for simplier triangular solves */
1340         pv  = b->a + bdiag[i];
1341         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1342         break;
1343 
1344       case 2:
1345         /*----------*/
1346         /* zero rtmp1 and rtmp2 */
1347         /* L part */
1348         nz    = bi[i+1] - bi[i];
1349         bjtmp = bj + bi[i];
1350         for  (j=0; j<nz; j++) {
1351           col        = bjtmp[j];
1352           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1353         }
1354 
1355         /* U part */
1356         nz    = bdiag[i]-bdiag[i+1];
1357         bjtmp = bj + bdiag[i+1]+1;
1358         for  (j=0; j<nz; j++) {
1359           col        = bjtmp[j];
1360           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1361         }
1362 
1363         /* load in initial (unfactored row) */
1364         nz    = ai[r[i]+1] - ai[r[i]];
1365         ajtmp = aj + ai[r[i]];
1366         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1367         for (j=0; j<nz; j++) {
1368           col        = ics[ajtmp[j]];
1369           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1370         }
1371         /* ZeropivotApply(): shift the diagonal of the matrix  */
1372         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1373 
1374         /* elimination */
1375         bjtmp = bj + bi[i];
1376         row   = *bjtmp++; /* pivot row */
1377         nzL   = bi[i+1] - bi[i];
1378         for (k=0; k < nzL; k++) {
1379           pc1 = rtmp1 + row;
1380           pc2 = rtmp2 + row;
1381           if (*pc1 != 0.0 || *pc2 != 0.0) {
1382             pv   = b->a + bdiag[row];
1383             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1384             *pc1 = mul1;       *pc2 = mul2;
1385 
1386             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1387             pv = b->a + bdiag[row+1]+1;
1388             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1389             for (j=0; j<nz; j++) {
1390               col         = pj[j];
1391               rtmp1[col] -= mul1 * pv[j];
1392               rtmp2[col] -= mul2 * pv[j];
1393             }
1394             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1395           }
1396           row = *bjtmp++;
1397         }
1398 
1399         /* finished row i; check zero pivot, then stick row i into b->a */
1400         rs = 0.0;
1401         /* L part */
1402         pc1 = b->a + bi[i];
1403         pj  = b->j + bi[i];
1404         nz  = bi[i+1] - bi[i];
1405         for (j=0; j<nz; j++) {
1406           col    = pj[j];
1407           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1408         }
1409         /* U part */
1410         pc1 = b->a + bdiag[i+1]+1;
1411         pj  = b->j + bdiag[i+1]+1;
1412         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1413         for (j=0; j<nz; j++) {
1414           col    = pj[j];
1415           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1416         }
1417 
1418         sctx.rs = rs;
1419         sctx.pv = rtmp1[i];
1420         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1421         if (sctx.newshift) break;
1422         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1423         *pc1 = 1.0/sctx.pv;
1424 
1425         /* Now take care of diagonal 2x2 block. */
1426         pc2 = rtmp2 + i;
1427         if (*pc2 != 0.0) {
1428           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1429           *pc2 = mul1;          /* insert L entry */
1430           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1431           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1432           for (j=0; j<nz; j++) {
1433             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1434           }
1435           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1436         }
1437 
1438         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1439         rs = 0.0;
1440         /* L part */
1441         pc2 = b->a + bi[i+1];
1442         pj  = b->j + bi[i+1];
1443         nz  = bi[i+2] - bi[i+1];
1444         for (j=0; j<nz; j++) {
1445           col    = pj[j];
1446           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1447         }
1448         /* U part */
1449         pc2 = b->a + bdiag[i+2]+1;
1450         pj  = b->j + bdiag[i+2]+1;
1451         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1452         for (j=0; j<nz; j++) {
1453           col    = pj[j];
1454           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1455         }
1456 
1457         sctx.rs = rs;
1458         sctx.pv = rtmp2[i+1];
1459         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1460         if (sctx.newshift) break;
1461         pc2  = b->a + bdiag[i+1];
1462         *pc2 = 1.0/sctx.pv;
1463         break;
1464 
1465       case 3:
1466         /*----------*/
1467         /* zero rtmp */
1468         /* L part */
1469         nz    = bi[i+1] - bi[i];
1470         bjtmp = bj + bi[i];
1471         for  (j=0; j<nz; j++) {
1472           col        = bjtmp[j];
1473           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1474         }
1475 
1476         /* U part */
1477         nz    = bdiag[i]-bdiag[i+1];
1478         bjtmp = bj + bdiag[i+1]+1;
1479         for  (j=0; j<nz; j++) {
1480           col        = bjtmp[j];
1481           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1482         }
1483 
1484         /* load in initial (unfactored row) */
1485         nz    = ai[r[i]+1] - ai[r[i]];
1486         ajtmp = aj + ai[r[i]];
1487         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1488         for (j=0; j<nz; j++) {
1489           col        = ics[ajtmp[j]];
1490           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1491         }
1492         /* ZeropivotApply(): shift the diagonal of the matrix  */
1493         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1494 
1495         /* elimination */
1496         bjtmp = bj + bi[i];
1497         row   = *bjtmp++; /* pivot row */
1498         nzL   = bi[i+1] - bi[i];
1499         for (k=0; k < nzL; k++) {
1500           pc1 = rtmp1 + row;
1501           pc2 = rtmp2 + row;
1502           pc3 = rtmp3 + row;
1503           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1504             pv   = b->a + bdiag[row];
1505             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1506             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1507 
1508             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1509             pv = b->a + bdiag[row+1]+1;
1510             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1511             for (j=0; j<nz; j++) {
1512               col         = pj[j];
1513               rtmp1[col] -= mul1 * pv[j];
1514               rtmp2[col] -= mul2 * pv[j];
1515               rtmp3[col] -= mul3 * pv[j];
1516             }
1517             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1518           }
1519           row = *bjtmp++;
1520         }
1521 
1522         /* finished row i; check zero pivot, then stick row i into b->a */
1523         rs = 0.0;
1524         /* L part */
1525         pc1 = b->a + bi[i];
1526         pj  = b->j + bi[i];
1527         nz  = bi[i+1] - bi[i];
1528         for (j=0; j<nz; j++) {
1529           col    = pj[j];
1530           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1531         }
1532         /* U part */
1533         pc1 = b->a + bdiag[i+1]+1;
1534         pj  = b->j + bdiag[i+1]+1;
1535         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1536         for (j=0; j<nz; j++) {
1537           col    = pj[j];
1538           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1539         }
1540 
1541         sctx.rs = rs;
1542         sctx.pv = rtmp1[i];
1543         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1544         if (sctx.newshift) break;
1545         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1546         *pc1 = 1.0/sctx.pv;
1547 
1548         /* Now take care of 1st column of diagonal 3x3 block. */
1549         pc2 = rtmp2 + i;
1550         pc3 = rtmp3 + i;
1551         if (*pc2 != 0.0 || *pc3 != 0.0) {
1552           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1553           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1554           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1555           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1556           for (j=0; j<nz; j++) {
1557             col         = pj[j];
1558             rtmp2[col] -= mul2 * rtmp1[col];
1559             rtmp3[col] -= mul3 * rtmp1[col];
1560           }
1561           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1562         }
1563 
1564         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1565         rs = 0.0;
1566         /* L part */
1567         pc2 = b->a + bi[i+1];
1568         pj  = b->j + bi[i+1];
1569         nz  = bi[i+2] - bi[i+1];
1570         for (j=0; j<nz; j++) {
1571           col    = pj[j];
1572           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1573         }
1574         /* U part */
1575         pc2 = b->a + bdiag[i+2]+1;
1576         pj  = b->j + bdiag[i+2]+1;
1577         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1578         for (j=0; j<nz; j++) {
1579           col    = pj[j];
1580           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1581         }
1582 
1583         sctx.rs = rs;
1584         sctx.pv = rtmp2[i+1];
1585         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1586         if (sctx.newshift) break;
1587         pc2  = b->a + bdiag[i+1];
1588         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1589 
1590         /* Now take care of 2nd column of diagonal 3x3 block. */
1591         pc3 = rtmp3 + i+1;
1592         if (*pc3 != 0.0) {
1593           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1594           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1595           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1596           for (j=0; j<nz; j++) {
1597             col         = pj[j];
1598             rtmp3[col] -= mul3 * rtmp2[col];
1599           }
1600           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1601         }
1602 
1603         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1604         rs = 0.0;
1605         /* L part */
1606         pc3 = b->a + bi[i+2];
1607         pj  = b->j + bi[i+2];
1608         nz  = bi[i+3] - bi[i+2];
1609         for (j=0; j<nz; j++) {
1610           col    = pj[j];
1611           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1612         }
1613         /* U part */
1614         pc3 = b->a + bdiag[i+3]+1;
1615         pj  = b->j + bdiag[i+3]+1;
1616         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1617         for (j=0; j<nz; j++) {
1618           col    = pj[j];
1619           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1620         }
1621 
1622         sctx.rs = rs;
1623         sctx.pv = rtmp3[i+2];
1624         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1625         if (sctx.newshift) break;
1626         pc3  = b->a + bdiag[i+2];
1627         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1628         break;
1629       case 4:
1630         /*----------*/
1631         /* zero rtmp */
1632         /* L part */
1633         nz    = bi[i+1] - bi[i];
1634         bjtmp = bj + bi[i];
1635         for  (j=0; j<nz; j++) {
1636           col        = bjtmp[j];
1637           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1638         }
1639 
1640         /* U part */
1641         nz    = bdiag[i]-bdiag[i+1];
1642         bjtmp = bj + bdiag[i+1]+1;
1643         for  (j=0; j<nz; j++) {
1644           col        = bjtmp[j];
1645           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1646         }
1647 
1648         /* load in initial (unfactored row) */
1649         nz    = ai[r[i]+1] - ai[r[i]];
1650         ajtmp = aj + ai[r[i]];
1651         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1652         for (j=0; j<nz; j++) {
1653           col        = ics[ajtmp[j]];
1654           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1655         }
1656         /* ZeropivotApply(): shift the diagonal of the matrix  */
1657         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1658 
1659         /* elimination */
1660         bjtmp = bj + bi[i];
1661         row   = *bjtmp++; /* pivot row */
1662         nzL   = bi[i+1] - bi[i];
1663         for (k=0; k < nzL; k++) {
1664           pc1 = rtmp1 + row;
1665           pc2 = rtmp2 + row;
1666           pc3 = rtmp3 + row;
1667           pc4 = rtmp4 + row;
1668           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1669             pv   = b->a + bdiag[row];
1670             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1671             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1672 
1673             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1674             pv = b->a + bdiag[row+1]+1;
1675             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1676             for (j=0; j<nz; j++) {
1677               col         = pj[j];
1678               rtmp1[col] -= mul1 * pv[j];
1679               rtmp2[col] -= mul2 * pv[j];
1680               rtmp3[col] -= mul3 * pv[j];
1681               rtmp4[col] -= mul4 * pv[j];
1682             }
1683             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1684           }
1685           row = *bjtmp++;
1686         }
1687 
1688         /* finished row i; check zero pivot, then stick row i into b->a */
1689         rs = 0.0;
1690         /* L part */
1691         pc1 = b->a + bi[i];
1692         pj  = b->j + bi[i];
1693         nz  = bi[i+1] - bi[i];
1694         for (j=0; j<nz; j++) {
1695           col    = pj[j];
1696           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1697         }
1698         /* U part */
1699         pc1 = b->a + bdiag[i+1]+1;
1700         pj  = b->j + bdiag[i+1]+1;
1701         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1702         for (j=0; j<nz; j++) {
1703           col    = pj[j];
1704           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1705         }
1706 
1707         sctx.rs = rs;
1708         sctx.pv = rtmp1[i];
1709         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1710         if (sctx.newshift) break;
1711         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1712         *pc1 = 1.0/sctx.pv;
1713 
1714         /* Now take care of 1st column of diagonal 4x4 block. */
1715         pc2 = rtmp2 + i;
1716         pc3 = rtmp3 + i;
1717         pc4 = rtmp4 + i;
1718         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1719           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1720           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1721           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1722           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1723           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1724           for (j=0; j<nz; j++) {
1725             col         = pj[j];
1726             rtmp2[col] -= mul2 * rtmp1[col];
1727             rtmp3[col] -= mul3 * rtmp1[col];
1728             rtmp4[col] -= mul4 * rtmp1[col];
1729           }
1730           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1731         }
1732 
1733         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1734         rs = 0.0;
1735         /* L part */
1736         pc2 = b->a + bi[i+1];
1737         pj  = b->j + bi[i+1];
1738         nz  = bi[i+2] - bi[i+1];
1739         for (j=0; j<nz; j++) {
1740           col    = pj[j];
1741           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1742         }
1743         /* U part */
1744         pc2 = b->a + bdiag[i+2]+1;
1745         pj  = b->j + bdiag[i+2]+1;
1746         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1747         for (j=0; j<nz; j++) {
1748           col    = pj[j];
1749           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1750         }
1751 
1752         sctx.rs = rs;
1753         sctx.pv = rtmp2[i+1];
1754         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1755         if (sctx.newshift) break;
1756         pc2  = b->a + bdiag[i+1];
1757         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1758 
1759         /* Now take care of 2nd column of diagonal 4x4 block. */
1760         pc3 = rtmp3 + i+1;
1761         pc4 = rtmp4 + i+1;
1762         if (*pc3 != 0.0 || *pc4 != 0.0) {
1763           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1764           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1765           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1766           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1767           for (j=0; j<nz; j++) {
1768             col         = pj[j];
1769             rtmp3[col] -= mul3 * rtmp2[col];
1770             rtmp4[col] -= mul4 * rtmp2[col];
1771           }
1772           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1773         }
1774 
1775         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1776         rs = 0.0;
1777         /* L part */
1778         pc3 = b->a + bi[i+2];
1779         pj  = b->j + bi[i+2];
1780         nz  = bi[i+3] - bi[i+2];
1781         for (j=0; j<nz; j++) {
1782           col    = pj[j];
1783           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1784         }
1785         /* U part */
1786         pc3 = b->a + bdiag[i+3]+1;
1787         pj  = b->j + bdiag[i+3]+1;
1788         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1789         for (j=0; j<nz; j++) {
1790           col    = pj[j];
1791           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1792         }
1793 
1794         sctx.rs = rs;
1795         sctx.pv = rtmp3[i+2];
1796         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1797         if (sctx.newshift) break;
1798         pc3  = b->a + bdiag[i+2];
1799         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1800 
1801         /* Now take care of 3rd column of diagonal 4x4 block. */
1802         pc4 = rtmp4 + i+2;
1803         if (*pc4 != 0.0) {
1804           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1805           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1806           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1807           for (j=0; j<nz; j++) {
1808             col         = pj[j];
1809             rtmp4[col] -= mul4 * rtmp3[col];
1810           }
1811           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1812         }
1813 
1814         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1815         rs = 0.0;
1816         /* L part */
1817         pc4 = b->a + bi[i+3];
1818         pj  = b->j + bi[i+3];
1819         nz  = bi[i+4] - bi[i+3];
1820         for (j=0; j<nz; j++) {
1821           col    = pj[j];
1822           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1823         }
1824         /* U part */
1825         pc4 = b->a + bdiag[i+4]+1;
1826         pj  = b->j + bdiag[i+4]+1;
1827         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1828         for (j=0; j<nz; j++) {
1829           col    = pj[j];
1830           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1831         }
1832 
1833         sctx.rs = rs;
1834         sctx.pv = rtmp4[i+3];
1835         ierr    = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1836         if (sctx.newshift) break;
1837         pc4  = b->a + bdiag[i+3];
1838         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1839         break;
1840 
1841       default:
1842         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1843       }
1844       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1845       i += nodesz;                 /* Update the row */
1846     }
1847 
1848     /* MatPivotRefine() */
1849     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1850       /*
1851        * if no shift in this attempt & shifting & started shifting & can refine,
1852        * then try lower shift
1853        */
1854       sctx.shift_hi       = sctx.shift_fraction;
1855       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1856       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1857       sctx.newshift       = PETSC_TRUE;
1858       sctx.nshift++;
1859     }
1860   } while (sctx.newshift);
1861 
1862   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1863   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1864   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1865   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1866 
1867   if (b->inode.size) {
1868     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1869   } else {
1870     C->ops->solve           = MatSolve_SeqAIJ;
1871   }
1872   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1873   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1874   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1875   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1876   C->assembled              = PETSC_TRUE;
1877   C->preallocated           = PETSC_TRUE;
1878 
1879   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1880 
1881   /* MatShiftView(A,info,&sctx) */
1882   if (sctx.nshift) {
1883     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1884       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1885     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1886       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1887     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1888       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1889     }
1890   }
1891   PetscFunctionReturn(0);
1892 }
1893 
1894 #undef __FUNCT__
1895 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1896 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1897 {
1898   Mat             C     = B;
1899   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1900   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1901   PetscErrorCode  ierr;
1902   const PetscInt  *r,*ic,*c,*ics;
1903   PetscInt        n   = A->rmap->n,*bi = b->i;
1904   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1905   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1906   PetscInt        *ai = a->i,*aj = a->j;
1907   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1908   PetscScalar     mul1,mul2,mul3,tmp;
1909   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1910   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1911   PetscReal       rs=0.0;
1912   FactorShiftCtx  sctx;
1913 
1914   PetscFunctionBegin;
1915   sctx.shift_top      = 0;
1916   sctx.nshift_max     = 0;
1917   sctx.shift_lo       = 0;
1918   sctx.shift_hi       = 0;
1919   sctx.shift_fraction = 0;
1920 
1921   /* if both shift schemes are chosen by user, only use info->shiftpd */
1922   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1923     sctx.shift_top = 0;
1924     for (i=0; i<n; i++) {
1925       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1926       rs    = 0.0;
1927       ajtmp = aj + ai[i];
1928       rtmp1 = aa + ai[i];
1929       nz    = ai[i+1] - ai[i];
1930       for (j=0; j<nz; j++) {
1931         if (*ajtmp != i) {
1932           rs += PetscAbsScalar(*rtmp1++);
1933         } else {
1934           rs -= PetscRealPart(*rtmp1++);
1935         }
1936         ajtmp++;
1937       }
1938       if (rs>sctx.shift_top) sctx.shift_top = rs;
1939     }
1940     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1941     sctx.shift_top *= 1.1;
1942     sctx.nshift_max = 5;
1943     sctx.shift_lo   = 0.;
1944     sctx.shift_hi   = 1.;
1945   }
1946   sctx.shift_amount = 0;
1947   sctx.nshift       = 0;
1948 
1949   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1950   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1951   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1952   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1953   ics    = ic;
1954 
1955   node_max = a->inode.node_count;
1956   ns       = a->inode.size;
1957   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1958 
1959   /* If max inode size > 3, split it into two inodes.*/
1960   /* also map the inode sizes according to the ordering */
1961   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1962   for (i=0,j=0; i<node_max; ++i,++j) {
1963     if (ns[i]>3) {
1964       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1965       ++j;
1966       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1967     } else {
1968       tmp_vec1[j] = ns[i];
1969     }
1970   }
1971   /* Use the correct node_max */
1972   node_max = j;
1973 
1974   /* Now reorder the inode info based on mat re-ordering info */
1975   /* First create a row -> inode_size_array_index map */
1976   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1977   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1978   for (i=0,row=0; i<node_max; i++) {
1979     nodesz = tmp_vec1[i];
1980     for (j=0; j<nodesz; j++,row++) {
1981       nsmap[row] = i;
1982     }
1983   }
1984   /* Using nsmap, create a reordered ns structure */
1985   for (i=0,j=0; i< node_max; i++) {
1986     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1987     tmp_vec2[i] = nodesz;
1988     j          += nodesz;
1989   }
1990   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1991   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1992   /* Now use the correct ns */
1993   ns = tmp_vec2;
1994 
1995   do {
1996     sctx.newshift = PETSC_FALSE;
1997     /* Now loop over each block-row, and do the factorization */
1998     for (i=0,row=0; i<node_max; i++) {
1999       nodesz = ns[i];
2000       nz     = bi[row+1] - bi[row];
2001       bjtmp  = bj + bi[row];
2002 
2003       switch (nodesz) {
2004       case 1:
2005         for  (j=0; j<nz; j++) {
2006           idx         = bjtmp[j];
2007           rtmp11[idx] = 0.0;
2008         }
2009 
2010         /* load in initial (unfactored row) */
2011         idx    = r[row];
2012         nz_tmp = ai[idx+1] - ai[idx];
2013         ajtmp  = aj + ai[idx];
2014         v1     = aa + ai[idx];
2015 
2016         for (j=0; j<nz_tmp; j++) {
2017           idx         = ics[ajtmp[j]];
2018           rtmp11[idx] = v1[j];
2019         }
2020         rtmp11[ics[r[row]]] += sctx.shift_amount;
2021 
2022         prow = *bjtmp++;
2023         while (prow < row) {
2024           pc1 = rtmp11 + prow;
2025           if (*pc1 != 0.0) {
2026             pv     = ba + bd[prow];
2027             pj     = nbj + bd[prow];
2028             mul1   = *pc1 * *pv++;
2029             *pc1   = mul1;
2030             nz_tmp = bi[prow+1] - bd[prow] - 1;
2031             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2032             for (j=0; j<nz_tmp; j++) {
2033               tmp          = pv[j];
2034               idx          = pj[j];
2035               rtmp11[idx] -= mul1 * tmp;
2036             }
2037           }
2038           prow = *bjtmp++;
2039         }
2040         pj  = bj + bi[row];
2041         pc1 = ba + bi[row];
2042 
2043         sctx.pv     = rtmp11[row];
2044         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2045         rs          = 0.0;
2046         for (j=0; j<nz; j++) {
2047           idx    = pj[j];
2048           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2049           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2050         }
2051         sctx.rs = rs;
2052         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2053         if (sctx.newshift) goto endofwhile;
2054         break;
2055 
2056       case 2:
2057         for (j=0; j<nz; j++) {
2058           idx         = bjtmp[j];
2059           rtmp11[idx] = 0.0;
2060           rtmp22[idx] = 0.0;
2061         }
2062 
2063         /* load in initial (unfactored row) */
2064         idx    = r[row];
2065         nz_tmp = ai[idx+1] - ai[idx];
2066         ajtmp  = aj + ai[idx];
2067         v1     = aa + ai[idx];
2068         v2     = aa + ai[idx+1];
2069         for (j=0; j<nz_tmp; j++) {
2070           idx         = ics[ajtmp[j]];
2071           rtmp11[idx] = v1[j];
2072           rtmp22[idx] = v2[j];
2073         }
2074         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2075         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2076 
2077         prow = *bjtmp++;
2078         while (prow < row) {
2079           pc1 = rtmp11 + prow;
2080           pc2 = rtmp22 + prow;
2081           if (*pc1 != 0.0 || *pc2 != 0.0) {
2082             pv   = ba + bd[prow];
2083             pj   = nbj + bd[prow];
2084             mul1 = *pc1 * *pv;
2085             mul2 = *pc2 * *pv;
2086             ++pv;
2087             *pc1 = mul1;
2088             *pc2 = mul2;
2089 
2090             nz_tmp = bi[prow+1] - bd[prow] - 1;
2091             for (j=0; j<nz_tmp; j++) {
2092               tmp          = pv[j];
2093               idx          = pj[j];
2094               rtmp11[idx] -= mul1 * tmp;
2095               rtmp22[idx] -= mul2 * tmp;
2096             }
2097             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2098           }
2099           prow = *bjtmp++;
2100         }
2101 
2102         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2103         pc1 = rtmp11 + prow;
2104         pc2 = rtmp22 + prow;
2105 
2106         sctx.pv = *pc1;
2107         pj      = bj + bi[prow];
2108         rs      = 0.0;
2109         for (j=0; j<nz; j++) {
2110           idx = pj[j];
2111           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2112         }
2113         sctx.rs = rs;
2114         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2115         if (sctx.newshift) goto endofwhile;
2116 
2117         if (*pc2 != 0.0) {
2118           pj     = nbj + bd[prow];
2119           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2120           *pc2   = mul2;
2121           nz_tmp = bi[prow+1] - bd[prow] - 1;
2122           for (j=0; j<nz_tmp; j++) {
2123             idx          = pj[j];
2124             tmp          = rtmp11[idx];
2125             rtmp22[idx] -= mul2 * tmp;
2126           }
2127           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2128         }
2129 
2130         pj  = bj + bi[row];
2131         pc1 = ba + bi[row];
2132         pc2 = ba + bi[row+1];
2133 
2134         sctx.pv       = rtmp22[row+1];
2135         rs            = 0.0;
2136         rtmp11[row]   = 1.0/rtmp11[row];
2137         rtmp22[row+1] = 1.0/rtmp22[row+1];
2138         /* copy row entries from dense representation to sparse */
2139         for (j=0; j<nz; j++) {
2140           idx    = pj[j];
2141           pc1[j] = rtmp11[idx];
2142           pc2[j] = rtmp22[idx];
2143           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2144         }
2145         sctx.rs = rs;
2146         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2147         if (sctx.newshift) goto endofwhile;
2148         break;
2149 
2150       case 3:
2151         for  (j=0; j<nz; j++) {
2152           idx         = bjtmp[j];
2153           rtmp11[idx] = 0.0;
2154           rtmp22[idx] = 0.0;
2155           rtmp33[idx] = 0.0;
2156         }
2157         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2158         idx    = r[row];
2159         nz_tmp = ai[idx+1] - ai[idx];
2160         ajtmp  = aj + ai[idx];
2161         v1     = aa + ai[idx];
2162         v2     = aa + ai[idx+1];
2163         v3     = aa + ai[idx+2];
2164         for (j=0; j<nz_tmp; j++) {
2165           idx         = ics[ajtmp[j]];
2166           rtmp11[idx] = v1[j];
2167           rtmp22[idx] = v2[j];
2168           rtmp33[idx] = v3[j];
2169         }
2170         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2171         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2172         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2173 
2174         /* loop over all pivot row blocks above this row block */
2175         prow = *bjtmp++;
2176         while (prow < row) {
2177           pc1 = rtmp11 + prow;
2178           pc2 = rtmp22 + prow;
2179           pc3 = rtmp33 + prow;
2180           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2181             pv   = ba  + bd[prow];
2182             pj   = nbj + bd[prow];
2183             mul1 = *pc1 * *pv;
2184             mul2 = *pc2 * *pv;
2185             mul3 = *pc3 * *pv;
2186             ++pv;
2187             *pc1 = mul1;
2188             *pc2 = mul2;
2189             *pc3 = mul3;
2190 
2191             nz_tmp = bi[prow+1] - bd[prow] - 1;
2192             /* update this row based on pivot row */
2193             for (j=0; j<nz_tmp; j++) {
2194               tmp          = pv[j];
2195               idx          = pj[j];
2196               rtmp11[idx] -= mul1 * tmp;
2197               rtmp22[idx] -= mul2 * tmp;
2198               rtmp33[idx] -= mul3 * tmp;
2199             }
2200             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2201           }
2202           prow = *bjtmp++;
2203         }
2204 
2205         /* Now take care of diagonal 3x3 block in this set of rows */
2206         /* note: prow = row here */
2207         pc1 = rtmp11 + prow;
2208         pc2 = rtmp22 + prow;
2209         pc3 = rtmp33 + prow;
2210 
2211         sctx.pv = *pc1;
2212         pj      = bj + bi[prow];
2213         rs      = 0.0;
2214         for (j=0; j<nz; j++) {
2215           idx = pj[j];
2216           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2217         }
2218         sctx.rs = rs;
2219         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2220         if (sctx.newshift) goto endofwhile;
2221 
2222         if (*pc2 != 0.0 || *pc3 != 0.0) {
2223           mul2   = (*pc2)/(*pc1);
2224           mul3   = (*pc3)/(*pc1);
2225           *pc2   = mul2;
2226           *pc3   = mul3;
2227           nz_tmp = bi[prow+1] - bd[prow] - 1;
2228           pj     = nbj + bd[prow];
2229           for (j=0; j<nz_tmp; j++) {
2230             idx          = pj[j];
2231             tmp          = rtmp11[idx];
2232             rtmp22[idx] -= mul2 * tmp;
2233             rtmp33[idx] -= mul3 * tmp;
2234           }
2235           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2236         }
2237         ++prow;
2238 
2239         pc2     = rtmp22 + prow;
2240         pc3     = rtmp33 + prow;
2241         sctx.pv = *pc2;
2242         pj      = bj + bi[prow];
2243         rs      = 0.0;
2244         for (j=0; j<nz; j++) {
2245           idx = pj[j];
2246           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2247         }
2248         sctx.rs = rs;
2249         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2250         if (sctx.newshift) goto endofwhile;
2251 
2252         if (*pc3 != 0.0) {
2253           mul3   = (*pc3)/(*pc2);
2254           *pc3   = mul3;
2255           pj     = nbj + bd[prow];
2256           nz_tmp = bi[prow+1] - bd[prow] - 1;
2257           for (j=0; j<nz_tmp; j++) {
2258             idx          = pj[j];
2259             tmp          = rtmp22[idx];
2260             rtmp33[idx] -= mul3 * tmp;
2261           }
2262           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2263         }
2264 
2265         pj  = bj + bi[row];
2266         pc1 = ba + bi[row];
2267         pc2 = ba + bi[row+1];
2268         pc3 = ba + bi[row+2];
2269 
2270         sctx.pv       = rtmp33[row+2];
2271         rs            = 0.0;
2272         rtmp11[row]   = 1.0/rtmp11[row];
2273         rtmp22[row+1] = 1.0/rtmp22[row+1];
2274         rtmp33[row+2] = 1.0/rtmp33[row+2];
2275         /* copy row entries from dense representation to sparse */
2276         for (j=0; j<nz; j++) {
2277           idx    = pj[j];
2278           pc1[j] = rtmp11[idx];
2279           pc2[j] = rtmp22[idx];
2280           pc3[j] = rtmp33[idx];
2281           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2282         }
2283 
2284         sctx.rs = rs;
2285         ierr    = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2286         if (sctx.newshift) goto endofwhile;
2287         break;
2288 
2289       default:
2290         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2291       }
2292       row += nodesz;                 /* Update the row */
2293     }
2294 endofwhile:;
2295   } while (sctx.newshift);
2296   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2297   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2298   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2299   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2300   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2301 
2302   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2303   /* do not set solve add, since MatSolve_Inode + Add is faster */
2304   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2305   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2306   C->assembled              = PETSC_TRUE;
2307   C->preallocated           = PETSC_TRUE;
2308   if (sctx.nshift) {
2309     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2310       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2311     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2312       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2313     }
2314   }
2315   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2316   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2317   PetscFunctionReturn(0);
2318 }
2319 
2320 
2321 /* ----------------------------------------------------------- */
2322 #undef __FUNCT__
2323 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2324 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2325 {
2326   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2327   IS                iscol = a->col,isrow = a->row;
2328   PetscErrorCode    ierr;
2329   const PetscInt    *r,*c,*rout,*cout;
2330   PetscInt          i,j,n = A->rmap->n;
2331   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2332   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2333   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2334   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2335   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2336   const PetscScalar *b;
2337 
2338   PetscFunctionBegin;
2339   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2340   node_max = a->inode.node_count;
2341   ns       = a->inode.size;     /* Node Size array */
2342 
2343   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2344   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2345   tmp  = a->solve_work;
2346 
2347   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2348   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2349 
2350   /* forward solve the lower triangular */
2351   tmps = tmp;
2352   aa   = a_a;
2353   aj   = a_j;
2354   ad   = a->diag;
2355 
2356   for (i = 0,row = 0; i< node_max; ++i) {
2357     nsz = ns[i];
2358     aii = ai[row];
2359     v1  = aa + aii;
2360     vi  = aj + aii;
2361     nz  = ai[row+1]- ai[row];
2362 
2363     if (i < node_max-1) {
2364       /* Prefetch the indices for the next block */
2365       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2366       /* Prefetch the data for the next block */
2367       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2368     }
2369 
2370     switch (nsz) {               /* Each loop in 'case' is unrolled */
2371     case 1:
2372       sum1 = b[r[row]];
2373       for (j=0; j<nz-1; j+=2) {
2374         i0    = vi[j];
2375         i1    = vi[j+1];
2376         tmp0  = tmps[i0];
2377         tmp1  = tmps[i1];
2378         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2379       }
2380       if (j == nz-1) {
2381         tmp0  = tmps[vi[j]];
2382         sum1 -= v1[j]*tmp0;
2383       }
2384       tmp[row++]=sum1;
2385       break;
2386     case 2:
2387       sum1 = b[r[row]];
2388       sum2 = b[r[row+1]];
2389       v2   = aa + ai[row+1];
2390 
2391       for (j=0; j<nz-1; j+=2) {
2392         i0    = vi[j];
2393         i1    = vi[j+1];
2394         tmp0  = tmps[i0];
2395         tmp1  = tmps[i1];
2396         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2397         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2398       }
2399       if (j == nz-1) {
2400         tmp0  = tmps[vi[j]];
2401         sum1 -= v1[j] *tmp0;
2402         sum2 -= v2[j] *tmp0;
2403       }
2404       sum2     -= v2[nz] * sum1;
2405       tmp[row++]=sum1;
2406       tmp[row++]=sum2;
2407       break;
2408     case 3:
2409       sum1 = b[r[row]];
2410       sum2 = b[r[row+1]];
2411       sum3 = b[r[row+2]];
2412       v2   = aa + ai[row+1];
2413       v3   = aa + ai[row+2];
2414 
2415       for (j=0; j<nz-1; j+=2) {
2416         i0    = vi[j];
2417         i1    = vi[j+1];
2418         tmp0  = tmps[i0];
2419         tmp1  = tmps[i1];
2420         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2421         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2422         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2423       }
2424       if (j == nz-1) {
2425         tmp0  = tmps[vi[j]];
2426         sum1 -= v1[j] *tmp0;
2427         sum2 -= v2[j] *tmp0;
2428         sum3 -= v3[j] *tmp0;
2429       }
2430       sum2     -= v2[nz] * sum1;
2431       sum3     -= v3[nz] * sum1;
2432       sum3     -= v3[nz+1] * sum2;
2433       tmp[row++]=sum1;
2434       tmp[row++]=sum2;
2435       tmp[row++]=sum3;
2436       break;
2437 
2438     case 4:
2439       sum1 = b[r[row]];
2440       sum2 = b[r[row+1]];
2441       sum3 = b[r[row+2]];
2442       sum4 = b[r[row+3]];
2443       v2   = aa + ai[row+1];
2444       v3   = aa + ai[row+2];
2445       v4   = aa + ai[row+3];
2446 
2447       for (j=0; j<nz-1; j+=2) {
2448         i0    = vi[j];
2449         i1    = vi[j+1];
2450         tmp0  = tmps[i0];
2451         tmp1  = tmps[i1];
2452         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2453         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2454         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2455         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2456       }
2457       if (j == nz-1) {
2458         tmp0  = tmps[vi[j]];
2459         sum1 -= v1[j] *tmp0;
2460         sum2 -= v2[j] *tmp0;
2461         sum3 -= v3[j] *tmp0;
2462         sum4 -= v4[j] *tmp0;
2463       }
2464       sum2 -= v2[nz] * sum1;
2465       sum3 -= v3[nz] * sum1;
2466       sum4 -= v4[nz] * sum1;
2467       sum3 -= v3[nz+1] * sum2;
2468       sum4 -= v4[nz+1] * sum2;
2469       sum4 -= v4[nz+2] * sum3;
2470 
2471       tmp[row++]=sum1;
2472       tmp[row++]=sum2;
2473       tmp[row++]=sum3;
2474       tmp[row++]=sum4;
2475       break;
2476     case 5:
2477       sum1 = b[r[row]];
2478       sum2 = b[r[row+1]];
2479       sum3 = b[r[row+2]];
2480       sum4 = b[r[row+3]];
2481       sum5 = b[r[row+4]];
2482       v2   = aa + ai[row+1];
2483       v3   = aa + ai[row+2];
2484       v4   = aa + ai[row+3];
2485       v5   = aa + ai[row+4];
2486 
2487       for (j=0; j<nz-1; j+=2) {
2488         i0    = vi[j];
2489         i1    = vi[j+1];
2490         tmp0  = tmps[i0];
2491         tmp1  = tmps[i1];
2492         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2493         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2494         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2495         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2496         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2497       }
2498       if (j == nz-1) {
2499         tmp0  = tmps[vi[j]];
2500         sum1 -= v1[j] *tmp0;
2501         sum2 -= v2[j] *tmp0;
2502         sum3 -= v3[j] *tmp0;
2503         sum4 -= v4[j] *tmp0;
2504         sum5 -= v5[j] *tmp0;
2505       }
2506 
2507       sum2 -= v2[nz] * sum1;
2508       sum3 -= v3[nz] * sum1;
2509       sum4 -= v4[nz] * sum1;
2510       sum5 -= v5[nz] * sum1;
2511       sum3 -= v3[nz+1] * sum2;
2512       sum4 -= v4[nz+1] * sum2;
2513       sum5 -= v5[nz+1] * sum2;
2514       sum4 -= v4[nz+2] * sum3;
2515       sum5 -= v5[nz+2] * sum3;
2516       sum5 -= v5[nz+3] * sum4;
2517 
2518       tmp[row++]=sum1;
2519       tmp[row++]=sum2;
2520       tmp[row++]=sum3;
2521       tmp[row++]=sum4;
2522       tmp[row++]=sum5;
2523       break;
2524     default:
2525       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2526     }
2527   }
2528   /* backward solve the upper triangular */
2529   for (i=node_max -1,row = n-1; i>=0; i--) {
2530     nsz = ns[i];
2531     aii = ad[row+1] + 1;
2532     v1  = aa + aii;
2533     vi  = aj + aii;
2534     nz  = ad[row]- ad[row+1] - 1;
2535 
2536     if (i > 0) {
2537       /* Prefetch the indices for the next block */
2538       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2539       /* Prefetch the data for the next block */
2540       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2541     }
2542 
2543     switch (nsz) {               /* Each loop in 'case' is unrolled */
2544     case 1:
2545       sum1 = tmp[row];
2546 
2547       for (j=0; j<nz-1; j+=2) {
2548         i0    = vi[j];
2549         i1    = vi[j+1];
2550         tmp0  = tmps[i0];
2551         tmp1  = tmps[i1];
2552         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2553       }
2554       if (j == nz-1) {
2555         tmp0  = tmps[vi[j]];
2556         sum1 -= v1[j]*tmp0;
2557       }
2558       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2559       break;
2560     case 2:
2561       sum1 = tmp[row];
2562       sum2 = tmp[row-1];
2563       v2   = aa + ad[row] + 1;
2564       for (j=0; j<nz-1; j+=2) {
2565         i0    = vi[j];
2566         i1    = vi[j+1];
2567         tmp0  = tmps[i0];
2568         tmp1  = tmps[i1];
2569         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2570         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2571       }
2572       if (j == nz-1) {
2573         tmp0  = tmps[vi[j]];
2574         sum1 -= v1[j]* tmp0;
2575         sum2 -= v2[j+1]* tmp0;
2576       }
2577 
2578       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2579       sum2     -= v2[0] * tmp0;
2580       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2581       break;
2582     case 3:
2583       sum1 = tmp[row];
2584       sum2 = tmp[row -1];
2585       sum3 = tmp[row -2];
2586       v2   = aa + ad[row] + 1;
2587       v3   = aa + ad[row -1] + 1;
2588       for (j=0; j<nz-1; j+=2) {
2589         i0    = vi[j];
2590         i1    = vi[j+1];
2591         tmp0  = tmps[i0];
2592         tmp1  = tmps[i1];
2593         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2594         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2595         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2596       }
2597       if (j== nz-1) {
2598         tmp0  = tmps[vi[j]];
2599         sum1 -= v1[j] * tmp0;
2600         sum2 -= v2[j+1] * tmp0;
2601         sum3 -= v3[j+2] * tmp0;
2602       }
2603       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604       sum2     -= v2[0]* tmp0;
2605       sum3     -= v3[1] * tmp0;
2606       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2607       sum3     -= v3[0]* tmp0;
2608       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2609 
2610       break;
2611     case 4:
2612       sum1 = tmp[row];
2613       sum2 = tmp[row -1];
2614       sum3 = tmp[row -2];
2615       sum4 = tmp[row -3];
2616       v2   = aa + ad[row]+1;
2617       v3   = aa + ad[row -1]+1;
2618       v4   = aa + ad[row -2]+1;
2619 
2620       for (j=0; j<nz-1; j+=2) {
2621         i0    = vi[j];
2622         i1    = vi[j+1];
2623         tmp0  = tmps[i0];
2624         tmp1  = tmps[i1];
2625         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2626         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629       }
2630       if (j== nz-1) {
2631         tmp0  = tmps[vi[j]];
2632         sum1 -= v1[j] * tmp0;
2633         sum2 -= v2[j+1] * tmp0;
2634         sum3 -= v3[j+2] * tmp0;
2635         sum4 -= v4[j+3] * tmp0;
2636       }
2637 
2638       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2639       sum2     -= v2[0] * tmp0;
2640       sum3     -= v3[1] * tmp0;
2641       sum4     -= v4[2] * tmp0;
2642       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2643       sum3     -= v3[0] * tmp0;
2644       sum4     -= v4[1] * tmp0;
2645       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2646       sum4     -= v4[0] * tmp0;
2647       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2648       break;
2649     case 5:
2650       sum1 = tmp[row];
2651       sum2 = tmp[row -1];
2652       sum3 = tmp[row -2];
2653       sum4 = tmp[row -3];
2654       sum5 = tmp[row -4];
2655       v2   = aa + ad[row]+1;
2656       v3   = aa + ad[row -1]+1;
2657       v4   = aa + ad[row -2]+1;
2658       v5   = aa + ad[row -3]+1;
2659       for (j=0; j<nz-1; j+=2) {
2660         i0    = vi[j];
2661         i1    = vi[j+1];
2662         tmp0  = tmps[i0];
2663         tmp1  = tmps[i1];
2664         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2665         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2666         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2667         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2668         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2669       }
2670       if (j==nz-1) {
2671         tmp0  = tmps[vi[j]];
2672         sum1 -= v1[j] * tmp0;
2673         sum2 -= v2[j+1] * tmp0;
2674         sum3 -= v3[j+2] * tmp0;
2675         sum4 -= v4[j+3] * tmp0;
2676         sum5 -= v5[j+4] * tmp0;
2677       }
2678 
2679       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2680       sum2     -= v2[0] * tmp0;
2681       sum3     -= v3[1] * tmp0;
2682       sum4     -= v4[2] * tmp0;
2683       sum5     -= v5[3] * tmp0;
2684       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2685       sum3     -= v3[0] * tmp0;
2686       sum4     -= v4[1] * tmp0;
2687       sum5     -= v5[2] * tmp0;
2688       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2689       sum4     -= v4[0] * tmp0;
2690       sum5     -= v5[1] * tmp0;
2691       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2692       sum5     -= v5[0] * tmp0;
2693       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2694       break;
2695     default:
2696       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2697     }
2698   }
2699   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2700   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2701   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2702   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2703   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2704   PetscFunctionReturn(0);
2705 }
2706 
2707 
2708 /*
2709      Makes a longer coloring[] array and calls the usual code with that
2710 */
2711 #undef __FUNCT__
2712 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2713 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2714 {
2715   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2716   PetscErrorCode  ierr;
2717   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2718   PetscInt        *colorused,i;
2719   ISColoringValue *newcolor;
2720 
2721   PetscFunctionBegin;
2722   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2723   /* loop over inodes, marking a color for each column*/
2724   row = 0;
2725   for (i=0; i<m; i++) {
2726     for (j=0; j<ns[i]; j++) {
2727       newcolor[row++] = coloring[i] + j*ncolors;
2728     }
2729   }
2730 
2731   /* eliminate unneeded colors */
2732   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2733   for (i=0; i<n; i++) {
2734     colorused[newcolor[i]] = 1;
2735   }
2736 
2737   for (i=1; i<5*ncolors; i++) {
2738     colorused[i] += colorused[i-1];
2739   }
2740   ncolors = colorused[5*ncolors-1];
2741   for (i=0; i<n; i++) {
2742     newcolor[i] = colorused[newcolor[i]]-1;
2743   }
2744   ierr = PetscFree(colorused);CHKERRQ(ierr);
2745   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2746   ierr = PetscFree(coloring);CHKERRQ(ierr);
2747   PetscFunctionReturn(0);
2748 }
2749 
2750 #include <petsc/private/kernels/blockinvert.h>
2751 
2752 #undef __FUNCT__
2753 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2754 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2755 {
2756   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2757   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2758   MatScalar         *ibdiag,*bdiag,work[25],*t;
2759   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2760   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2761   const PetscScalar *xb, *b;
2762   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2763   PetscErrorCode    ierr;
2764   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2765   PetscInt          sz,k,ipvt[5];
2766   PetscBool         allowzeropivot,zeropivotdetected=PETSC_FALSE;
2767   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2768 
2769   PetscFunctionBegin;
2770   allowzeropivot = PetscNot(A->erroriffailure);
2771   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2772   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2773   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2774 
2775   if (!a->inode.ibdiagvalid) {
2776     if (!a->inode.ibdiag) {
2777       /* calculate space needed for diagonal blocks */
2778       for (i=0; i<m; i++) {
2779         cnt += sizes[i]*sizes[i];
2780       }
2781       a->inode.bdiagsize = cnt;
2782 
2783       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2784     }
2785 
2786     /* copy over the diagonal blocks and invert them */
2787     ibdiag = a->inode.ibdiag;
2788     bdiag  = a->inode.bdiag;
2789     cnt    = 0;
2790     for (i=0, row = 0; i<m; i++) {
2791       for (j=0; j<sizes[i]; j++) {
2792         for (k=0; k<sizes[i]; k++) {
2793           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2794         }
2795       }
2796       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2797 
2798       switch (sizes[i]) {
2799       case 1:
2800         /* Create matrix data structure */
2801         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2802           if (allowzeropivot) {
2803             zeropivotdetected = PETSC_TRUE;
2804             ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr);
2805           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2806         }
2807         ibdiag[cnt] = 1.0/ibdiag[cnt];
2808         break;
2809       case 2:
2810         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2811         break;
2812       case 3:
2813         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2814         break;
2815       case 4:
2816         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2817         break;
2818       case 5:
2819         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2820         break;
2821       default:
2822         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2823       }
2824       if (zeropivotdetected) {
2825         A->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2826       }
2827 
2828       cnt += sizes[i]*sizes[i];
2829       row += sizes[i];
2830     }
2831     a->inode.ibdiagvalid = PETSC_TRUE;
2832   }
2833   ibdiag = a->inode.ibdiag;
2834   bdiag  = a->inode.bdiag;
2835   t      = a->inode.ssor_work;
2836 
2837   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2838   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2839   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2840   if (flag & SOR_ZERO_INITIAL_GUESS) {
2841     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2842 
2843       for (i=0, row=0; i<m; i++) {
2844         sz  = diag[row] - ii[row];
2845         v1  = a->a + ii[row];
2846         idx = a->j + ii[row];
2847 
2848         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2849         switch (sizes[i]) {
2850         case 1:
2851 
2852           sum1 = b[row];
2853           for (n = 0; n<sz-1; n+=2) {
2854             i1    = idx[0];
2855             i2    = idx[1];
2856             idx  += 2;
2857             tmp0  = x[i1];
2858             tmp1  = x[i2];
2859             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2860           }
2861 
2862           if (n == sz-1) {
2863             tmp0  = x[*idx];
2864             sum1 -= *v1 * tmp0;
2865           }
2866           t[row]   = sum1;
2867           x[row++] = sum1*(*ibdiag++);
2868           break;
2869         case 2:
2870           v2   = a->a + ii[row+1];
2871           sum1 = b[row];
2872           sum2 = b[row+1];
2873           for (n = 0; n<sz-1; n+=2) {
2874             i1    = idx[0];
2875             i2    = idx[1];
2876             idx  += 2;
2877             tmp0  = x[i1];
2878             tmp1  = x[i2];
2879             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2880             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2881           }
2882 
2883           if (n == sz-1) {
2884             tmp0  = x[*idx];
2885             sum1 -= v1[0] * tmp0;
2886             sum2 -= v2[0] * tmp0;
2887           }
2888           t[row]   = sum1;
2889           t[row+1] = sum2;
2890           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2891           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2892           ibdiag  += 4;
2893           break;
2894         case 3:
2895           v2   = a->a + ii[row+1];
2896           v3   = a->a + ii[row+2];
2897           sum1 = b[row];
2898           sum2 = b[row+1];
2899           sum3 = b[row+2];
2900           for (n = 0; n<sz-1; n+=2) {
2901             i1    = idx[0];
2902             i2    = idx[1];
2903             idx  += 2;
2904             tmp0  = x[i1];
2905             tmp1  = x[i2];
2906             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2907             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2908             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2909           }
2910 
2911           if (n == sz-1) {
2912             tmp0  = x[*idx];
2913             sum1 -= v1[0] * tmp0;
2914             sum2 -= v2[0] * tmp0;
2915             sum3 -= v3[0] * tmp0;
2916           }
2917           t[row]   = sum1;
2918           t[row+1] = sum2;
2919           t[row+2] = sum3;
2920           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2921           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2922           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2923           ibdiag  += 9;
2924           break;
2925         case 4:
2926           v2   = a->a + ii[row+1];
2927           v3   = a->a + ii[row+2];
2928           v4   = a->a + ii[row+3];
2929           sum1 = b[row];
2930           sum2 = b[row+1];
2931           sum3 = b[row+2];
2932           sum4 = b[row+3];
2933           for (n = 0; n<sz-1; n+=2) {
2934             i1    = idx[0];
2935             i2    = idx[1];
2936             idx  += 2;
2937             tmp0  = x[i1];
2938             tmp1  = x[i2];
2939             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2940             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2941             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2942             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2943           }
2944 
2945           if (n == sz-1) {
2946             tmp0  = x[*idx];
2947             sum1 -= v1[0] * tmp0;
2948             sum2 -= v2[0] * tmp0;
2949             sum3 -= v3[0] * tmp0;
2950             sum4 -= v4[0] * tmp0;
2951           }
2952           t[row]   = sum1;
2953           t[row+1] = sum2;
2954           t[row+2] = sum3;
2955           t[row+3] = sum4;
2956           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2957           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2958           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2959           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2960           ibdiag  += 16;
2961           break;
2962         case 5:
2963           v2   = a->a + ii[row+1];
2964           v3   = a->a + ii[row+2];
2965           v4   = a->a + ii[row+3];
2966           v5   = a->a + ii[row+4];
2967           sum1 = b[row];
2968           sum2 = b[row+1];
2969           sum3 = b[row+2];
2970           sum4 = b[row+3];
2971           sum5 = b[row+4];
2972           for (n = 0; n<sz-1; n+=2) {
2973             i1    = idx[0];
2974             i2    = idx[1];
2975             idx  += 2;
2976             tmp0  = x[i1];
2977             tmp1  = x[i2];
2978             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2979             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2980             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2981             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2982             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2983           }
2984 
2985           if (n == sz-1) {
2986             tmp0  = x[*idx];
2987             sum1 -= v1[0] * tmp0;
2988             sum2 -= v2[0] * tmp0;
2989             sum3 -= v3[0] * tmp0;
2990             sum4 -= v4[0] * tmp0;
2991             sum5 -= v5[0] * tmp0;
2992           }
2993           t[row]   = sum1;
2994           t[row+1] = sum2;
2995           t[row+2] = sum3;
2996           t[row+3] = sum4;
2997           t[row+4] = sum5;
2998           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2999           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3000           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3001           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3002           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3003           ibdiag  += 25;
3004           break;
3005         default:
3006           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3007         }
3008       }
3009 
3010       xb   = t;
3011       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3012     } else xb = b;
3013     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3014 
3015       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3016       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3017         ibdiag -= sizes[i]*sizes[i];
3018         sz      = ii[row+1] - diag[row] - 1;
3019         v1      = a->a + diag[row] + 1;
3020         idx     = a->j + diag[row] + 1;
3021 
3022         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3023         switch (sizes[i]) {
3024         case 1:
3025 
3026           sum1 = xb[row];
3027           for (n = 0; n<sz-1; n+=2) {
3028             i1    = idx[0];
3029             i2    = idx[1];
3030             idx  += 2;
3031             tmp0  = x[i1];
3032             tmp1  = x[i2];
3033             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3034           }
3035 
3036           if (n == sz-1) {
3037             tmp0  = x[*idx];
3038             sum1 -= *v1*tmp0;
3039           }
3040           x[row--] = sum1*(*ibdiag);
3041           break;
3042 
3043         case 2:
3044 
3045           sum1 = xb[row];
3046           sum2 = xb[row-1];
3047           /* note that sum1 is associated with the second of the two rows */
3048           v2 = a->a + diag[row-1] + 2;
3049           for (n = 0; n<sz-1; n+=2) {
3050             i1    = idx[0];
3051             i2    = idx[1];
3052             idx  += 2;
3053             tmp0  = x[i1];
3054             tmp1  = x[i2];
3055             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3056             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3057           }
3058 
3059           if (n == sz-1) {
3060             tmp0  = x[*idx];
3061             sum1 -= *v1*tmp0;
3062             sum2 -= *v2*tmp0;
3063           }
3064           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3065           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3066           break;
3067         case 3:
3068 
3069           sum1 = xb[row];
3070           sum2 = xb[row-1];
3071           sum3 = xb[row-2];
3072           v2   = a->a + diag[row-1] + 2;
3073           v3   = a->a + diag[row-2] + 3;
3074           for (n = 0; n<sz-1; n+=2) {
3075             i1    = idx[0];
3076             i2    = idx[1];
3077             idx  += 2;
3078             tmp0  = x[i1];
3079             tmp1  = x[i2];
3080             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3081             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3082             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3083           }
3084 
3085           if (n == sz-1) {
3086             tmp0  = x[*idx];
3087             sum1 -= *v1*tmp0;
3088             sum2 -= *v2*tmp0;
3089             sum3 -= *v3*tmp0;
3090           }
3091           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3092           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3093           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3094           break;
3095         case 4:
3096 
3097           sum1 = xb[row];
3098           sum2 = xb[row-1];
3099           sum3 = xb[row-2];
3100           sum4 = xb[row-3];
3101           v2   = a->a + diag[row-1] + 2;
3102           v3   = a->a + diag[row-2] + 3;
3103           v4   = a->a + diag[row-3] + 4;
3104           for (n = 0; n<sz-1; n+=2) {
3105             i1    = idx[0];
3106             i2    = idx[1];
3107             idx  += 2;
3108             tmp0  = x[i1];
3109             tmp1  = x[i2];
3110             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3111             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3112             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3113             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3114           }
3115 
3116           if (n == sz-1) {
3117             tmp0  = x[*idx];
3118             sum1 -= *v1*tmp0;
3119             sum2 -= *v2*tmp0;
3120             sum3 -= *v3*tmp0;
3121             sum4 -= *v4*tmp0;
3122           }
3123           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3124           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3125           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3126           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3127           break;
3128         case 5:
3129 
3130           sum1 = xb[row];
3131           sum2 = xb[row-1];
3132           sum3 = xb[row-2];
3133           sum4 = xb[row-3];
3134           sum5 = xb[row-4];
3135           v2   = a->a + diag[row-1] + 2;
3136           v3   = a->a + diag[row-2] + 3;
3137           v4   = a->a + diag[row-3] + 4;
3138           v5   = a->a + diag[row-4] + 5;
3139           for (n = 0; n<sz-1; n+=2) {
3140             i1    = idx[0];
3141             i2    = idx[1];
3142             idx  += 2;
3143             tmp0  = x[i1];
3144             tmp1  = x[i2];
3145             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3146             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3147             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3148             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3149             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3150           }
3151 
3152           if (n == sz-1) {
3153             tmp0  = x[*idx];
3154             sum1 -= *v1*tmp0;
3155             sum2 -= *v2*tmp0;
3156             sum3 -= *v3*tmp0;
3157             sum4 -= *v4*tmp0;
3158             sum5 -= *v5*tmp0;
3159           }
3160           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3161           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3162           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3163           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3164           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3165           break;
3166         default:
3167           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3168         }
3169       }
3170 
3171       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3172     }
3173     its--;
3174   }
3175   while (its--) {
3176 
3177     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3178       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3179            i<m;
3180            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3181 
3182         sz  = diag[row] - ii[row];
3183         v1  = a->a + ii[row];
3184         idx = a->j + ii[row];
3185         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3186         switch (sizes[i]) {
3187         case 1:
3188           sum1 = b[row];
3189           for (n = 0; n<sz-1; n+=2) {
3190             i1    = idx[0];
3191             i2    = idx[1];
3192             idx  += 2;
3193             tmp0  = x[i1];
3194             tmp1  = x[i2];
3195             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3196           }
3197           if (n == sz-1) {
3198             tmp0  = x[*idx++];
3199             sum1 -= *v1 * tmp0;
3200             v1++;
3201           }
3202           t[row]   = sum1;
3203           sz      = ii[row+1] - diag[row] - 1;
3204           idx     = a->j + diag[row] + 1;
3205           v1 += 1;
3206           for (n = 0; n<sz-1; n+=2) {
3207             i1    = idx[0];
3208             i2    = idx[1];
3209             idx  += 2;
3210             tmp0  = x[i1];
3211             tmp1  = x[i2];
3212             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3213           }
3214           if (n == sz-1) {
3215             tmp0  = x[*idx++];
3216             sum1 -= *v1 * tmp0;
3217           }
3218           /* in MatSOR_SeqAIJ this line would be
3219            *
3220            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3221            *
3222            * but omega == 1, so this becomes
3223            *
3224            * x[row] = sum1*(*ibdiag++);
3225            *
3226            */
3227           x[row] = sum1*(*ibdiag);
3228           break;
3229         case 2:
3230           v2   = a->a + ii[row+1];
3231           sum1 = b[row];
3232           sum2 = b[row+1];
3233           for (n = 0; n<sz-1; n+=2) {
3234             i1    = idx[0];
3235             i2    = idx[1];
3236             idx  += 2;
3237             tmp0  = x[i1];
3238             tmp1  = x[i2];
3239             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3240             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3241           }
3242           if (n == sz-1) {
3243             tmp0  = x[*idx++];
3244             sum1 -= v1[0] * tmp0;
3245             sum2 -= v2[0] * tmp0;
3246             v1++; v2++;
3247           }
3248           t[row]   = sum1;
3249           t[row+1] = sum2;
3250           sz      = ii[row+1] - diag[row] - 2;
3251           idx     = a->j + diag[row] + 2;
3252           v1 += 2;
3253           v2 += 2;
3254           for (n = 0; n<sz-1; n+=2) {
3255             i1    = idx[0];
3256             i2    = idx[1];
3257             idx  += 2;
3258             tmp0  = x[i1];
3259             tmp1  = x[i2];
3260             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3261             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3262           }
3263           if (n == sz-1) {
3264             tmp0  = x[*idx];
3265             sum1 -= v1[0] * tmp0;
3266             sum2 -= v2[0] * tmp0;
3267           }
3268           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3269           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3270           break;
3271         case 3:
3272           v2   = a->a + ii[row+1];
3273           v3   = a->a + ii[row+2];
3274           sum1 = b[row];
3275           sum2 = b[row+1];
3276           sum3 = b[row+2];
3277           for (n = 0; n<sz-1; n+=2) {
3278             i1    = idx[0];
3279             i2    = idx[1];
3280             idx  += 2;
3281             tmp0  = x[i1];
3282             tmp1  = x[i2];
3283             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3284             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3285             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3286           }
3287           if (n == sz-1) {
3288             tmp0  = x[*idx++];
3289             sum1 -= v1[0] * tmp0;
3290             sum2 -= v2[0] * tmp0;
3291             sum3 -= v3[0] * tmp0;
3292             v1++; v2++; v3++;
3293           }
3294           t[row]   = sum1;
3295           t[row+1] = sum2;
3296           t[row+2] = sum3;
3297           sz      = ii[row+1] - diag[row] - 3;
3298           idx     = a->j + diag[row] + 3;
3299           v1 += 3;
3300           v2 += 3;
3301           v3 += 3;
3302           for (n = 0; n<sz-1; n+=2) {
3303             i1    = idx[0];
3304             i2    = idx[1];
3305             idx  += 2;
3306             tmp0  = x[i1];
3307             tmp1  = x[i2];
3308             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3309             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3310             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3311           }
3312           if (n == sz-1) {
3313             tmp0  = x[*idx];
3314             sum1 -= v1[0] * tmp0;
3315             sum2 -= v2[0] * tmp0;
3316             sum3 -= v3[0] * tmp0;
3317           }
3318           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3319           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3320           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3321           break;
3322         case 4:
3323           v2   = a->a + ii[row+1];
3324           v3   = a->a + ii[row+2];
3325           v4   = a->a + ii[row+3];
3326           sum1 = b[row];
3327           sum2 = b[row+1];
3328           sum3 = b[row+2];
3329           sum4 = b[row+3];
3330           for (n = 0; n<sz-1; n+=2) {
3331             i1    = idx[0];
3332             i2    = idx[1];
3333             idx  += 2;
3334             tmp0  = x[i1];
3335             tmp1  = x[i2];
3336             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3337             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3338             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3339             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3340           }
3341           if (n == sz-1) {
3342             tmp0  = x[*idx++];
3343             sum1 -= v1[0] * tmp0;
3344             sum2 -= v2[0] * tmp0;
3345             sum3 -= v3[0] * tmp0;
3346             sum4 -= v4[0] * tmp0;
3347             v1++; v2++; v3++; v4++;
3348           }
3349           t[row]   = sum1;
3350           t[row+1] = sum2;
3351           t[row+2] = sum3;
3352           t[row+3] = sum4;
3353           sz      = ii[row+1] - diag[row] - 4;
3354           idx     = a->j + diag[row] + 4;
3355           v1 += 4;
3356           v2 += 4;
3357           v3 += 4;
3358           v4 += 4;
3359           for (n = 0; n<sz-1; n+=2) {
3360             i1    = idx[0];
3361             i2    = idx[1];
3362             idx  += 2;
3363             tmp0  = x[i1];
3364             tmp1  = x[i2];
3365             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3366             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3367             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3368             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3369           }
3370           if (n == sz-1) {
3371             tmp0  = x[*idx];
3372             sum1 -= v1[0] * tmp0;
3373             sum2 -= v2[0] * tmp0;
3374             sum3 -= v3[0] * tmp0;
3375             sum4 -= v4[0] * tmp0;
3376           }
3377           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3378           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3379           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3380           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3381           break;
3382         case 5:
3383           v2   = a->a + ii[row+1];
3384           v3   = a->a + ii[row+2];
3385           v4   = a->a + ii[row+3];
3386           v5   = a->a + ii[row+4];
3387           sum1 = b[row];
3388           sum2 = b[row+1];
3389           sum3 = b[row+2];
3390           sum4 = b[row+3];
3391           sum5 = b[row+4];
3392           for (n = 0; n<sz-1; n+=2) {
3393             i1    = idx[0];
3394             i2    = idx[1];
3395             idx  += 2;
3396             tmp0  = x[i1];
3397             tmp1  = x[i2];
3398             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3399             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3400             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3401             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3402             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3403           }
3404           if (n == sz-1) {
3405             tmp0  = x[*idx++];
3406             sum1 -= v1[0] * tmp0;
3407             sum2 -= v2[0] * tmp0;
3408             sum3 -= v3[0] * tmp0;
3409             sum4 -= v4[0] * tmp0;
3410             sum5 -= v5[0] * tmp0;
3411             v1++; v2++; v3++; v4++; v5++;
3412           }
3413           t[row]   = sum1;
3414           t[row+1] = sum2;
3415           t[row+2] = sum3;
3416           t[row+3] = sum4;
3417           t[row+4] = sum5;
3418           sz      = ii[row+1] - diag[row] - 5;
3419           idx     = a->j + diag[row] + 5;
3420           v1 += 5;
3421           v2 += 5;
3422           v3 += 5;
3423           v4 += 5;
3424           v5 += 5;
3425           for (n = 0; n<sz-1; n+=2) {
3426             i1    = idx[0];
3427             i2    = idx[1];
3428             idx  += 2;
3429             tmp0  = x[i1];
3430             tmp1  = x[i2];
3431             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3432             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3433             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3434             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3435             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3436           }
3437           if (n == sz-1) {
3438             tmp0  = x[*idx];
3439             sum1 -= v1[0] * tmp0;
3440             sum2 -= v2[0] * tmp0;
3441             sum3 -= v3[0] * tmp0;
3442             sum4 -= v4[0] * tmp0;
3443             sum5 -= v5[0] * tmp0;
3444           }
3445           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3446           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3447           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3448           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3449           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3450           break;
3451         default:
3452           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3453         }
3454       }
3455       xb   = t;
3456       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3457     } else xb = b;
3458 
3459     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3460 
3461       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3462       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3463         ibdiag -= sizes[i]*sizes[i];
3464 
3465         /* set RHS */
3466         if (xb == b) {
3467           /* whole (old way) */
3468           sz      = ii[row+1] - ii[row];
3469           idx     = a->j + ii[row];
3470           switch (sizes[i]) {
3471           case 5:
3472             v5      = a->a + ii[row-4];
3473           case 4: /* fall through */
3474             v4      = a->a + ii[row-3];
3475           case 3:
3476             v3      = a->a + ii[row-2];
3477           case 2:
3478             v2      = a->a + ii[row-1];
3479           case 1:
3480             v1      = a->a + ii[row];
3481             break;
3482           default:
3483             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3484           }
3485         } else {
3486           /* upper, no diag */
3487           sz      = ii[row+1] - diag[row] - 1;
3488           idx     = a->j + diag[row] + 1;
3489           switch (sizes[i]) {
3490           case 5:
3491             v5      = a->a + diag[row-4] + 5;
3492           case 4: /* fall through */
3493             v4      = a->a + diag[row-3] + 4;
3494           case 3:
3495             v3      = a->a + diag[row-2] + 3;
3496           case 2:
3497             v2      = a->a + diag[row-1] + 2;
3498           case 1:
3499             v1      = a->a + diag[row] + 1;
3500           }
3501         }
3502         /* set sum */
3503         switch (sizes[i]) {
3504         case 5:
3505           sum5 = xb[row-4];
3506         case 4: /* fall through */
3507           sum4 = xb[row-3];
3508         case 3:
3509           sum3 = xb[row-2];
3510         case 2:
3511           sum2 = xb[row-1];
3512         case 1:
3513           /* note that sum1 is associated with the last row */
3514           sum1 = xb[row];
3515         }
3516         /* do sums */
3517         for (n = 0; n<sz-1; n+=2) {
3518             i1    = idx[0];
3519             i2    = idx[1];
3520             idx  += 2;
3521             tmp0  = x[i1];
3522             tmp1  = x[i2];
3523             switch (sizes[i]) {
3524             case 5:
3525               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3526             case 4: /* fall through */
3527               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3528             case 3:
3529               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3530             case 2:
3531               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3532             case 1:
3533               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3534             }
3535         }
3536         /* ragged edge */
3537         if (n == sz-1) {
3538           tmp0  = x[*idx];
3539           switch (sizes[i]) {
3540           case 5:
3541             sum5 -= *v5*tmp0;
3542           case 4: /* fall through */
3543             sum4 -= *v4*tmp0;
3544           case 3:
3545             sum3 -= *v3*tmp0;
3546           case 2:
3547             sum2 -= *v2*tmp0;
3548           case 1:
3549             sum1 -= *v1*tmp0;
3550           }
3551         }
3552         /* update */
3553         if (xb == b) {
3554           /* whole (old way) w/ diag */
3555           switch (sizes[i]) {
3556           case 5:
3557             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3558             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3559             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3560             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3561             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3562             break;
3563           case 4:
3564             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3565             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3566             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3567             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3568             break;
3569           case 3:
3570             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3571             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3572             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3573             break;
3574           case 2:
3575             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3576             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3577             break;
3578           case 1:
3579             x[row--] += sum1*(*ibdiag);
3580             break;
3581           }
3582         } else {
3583           /* no diag so set =  */
3584           switch (sizes[i]) {
3585           case 5:
3586             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3587             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3588             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3589             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3590             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3591             break;
3592           case 4:
3593             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3594             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3595             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3596             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3597             break;
3598           case 3:
3599             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3600             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3601             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3602             break;
3603           case 2:
3604             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3605             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3606             break;
3607           case 1:
3608             x[row--] = sum1*(*ibdiag);
3609             break;
3610           }
3611         }
3612       }
3613       if (xb == b) {
3614         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3615       } else {
3616         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3617       }
3618     }
3619   }
3620   if (flag & SOR_EISENSTAT) {
3621     /*
3622           Apply  (U + D)^-1  where D is now the block diagonal
3623     */
3624     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3625     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3626       ibdiag -= sizes[i]*sizes[i];
3627       sz      = ii[row+1] - diag[row] - 1;
3628       v1      = a->a + diag[row] + 1;
3629       idx     = a->j + diag[row] + 1;
3630       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3631       switch (sizes[i]) {
3632       case 1:
3633 
3634         sum1 = b[row];
3635         for (n = 0; n<sz-1; n+=2) {
3636           i1    = idx[0];
3637           i2    = idx[1];
3638           idx  += 2;
3639           tmp0  = x[i1];
3640           tmp1  = x[i2];
3641           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3642         }
3643 
3644         if (n == sz-1) {
3645           tmp0  = x[*idx];
3646           sum1 -= *v1*tmp0;
3647         }
3648         x[row] = sum1*(*ibdiag);row--;
3649         break;
3650 
3651       case 2:
3652 
3653         sum1 = b[row];
3654         sum2 = b[row-1];
3655         /* note that sum1 is associated with the second of the two rows */
3656         v2 = a->a + diag[row-1] + 2;
3657         for (n = 0; n<sz-1; n+=2) {
3658           i1    = idx[0];
3659           i2    = idx[1];
3660           idx  += 2;
3661           tmp0  = x[i1];
3662           tmp1  = x[i2];
3663           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3664           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3665         }
3666 
3667         if (n == sz-1) {
3668           tmp0  = x[*idx];
3669           sum1 -= *v1*tmp0;
3670           sum2 -= *v2*tmp0;
3671         }
3672         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3673         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3674         row     -= 2;
3675         break;
3676       case 3:
3677 
3678         sum1 = b[row];
3679         sum2 = b[row-1];
3680         sum3 = b[row-2];
3681         v2   = a->a + diag[row-1] + 2;
3682         v3   = a->a + diag[row-2] + 3;
3683         for (n = 0; n<sz-1; n+=2) {
3684           i1    = idx[0];
3685           i2    = idx[1];
3686           idx  += 2;
3687           tmp0  = x[i1];
3688           tmp1  = x[i2];
3689           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3690           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3691           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3692         }
3693 
3694         if (n == sz-1) {
3695           tmp0  = x[*idx];
3696           sum1 -= *v1*tmp0;
3697           sum2 -= *v2*tmp0;
3698           sum3 -= *v3*tmp0;
3699         }
3700         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3701         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3702         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3703         row     -= 3;
3704         break;
3705       case 4:
3706 
3707         sum1 = b[row];
3708         sum2 = b[row-1];
3709         sum3 = b[row-2];
3710         sum4 = b[row-3];
3711         v2   = a->a + diag[row-1] + 2;
3712         v3   = a->a + diag[row-2] + 3;
3713         v4   = a->a + diag[row-3] + 4;
3714         for (n = 0; n<sz-1; n+=2) {
3715           i1    = idx[0];
3716           i2    = idx[1];
3717           idx  += 2;
3718           tmp0  = x[i1];
3719           tmp1  = x[i2];
3720           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3721           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3722           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3723           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3724         }
3725 
3726         if (n == sz-1) {
3727           tmp0  = x[*idx];
3728           sum1 -= *v1*tmp0;
3729           sum2 -= *v2*tmp0;
3730           sum3 -= *v3*tmp0;
3731           sum4 -= *v4*tmp0;
3732         }
3733         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3734         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3735         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3736         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3737         row     -= 4;
3738         break;
3739       case 5:
3740 
3741         sum1 = b[row];
3742         sum2 = b[row-1];
3743         sum3 = b[row-2];
3744         sum4 = b[row-3];
3745         sum5 = b[row-4];
3746         v2   = a->a + diag[row-1] + 2;
3747         v3   = a->a + diag[row-2] + 3;
3748         v4   = a->a + diag[row-3] + 4;
3749         v5   = a->a + diag[row-4] + 5;
3750         for (n = 0; n<sz-1; n+=2) {
3751           i1    = idx[0];
3752           i2    = idx[1];
3753           idx  += 2;
3754           tmp0  = x[i1];
3755           tmp1  = x[i2];
3756           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3757           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3758           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3759           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3760           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3761         }
3762 
3763         if (n == sz-1) {
3764           tmp0  = x[*idx];
3765           sum1 -= *v1*tmp0;
3766           sum2 -= *v2*tmp0;
3767           sum3 -= *v3*tmp0;
3768           sum4 -= *v4*tmp0;
3769           sum5 -= *v5*tmp0;
3770         }
3771         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3772         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3773         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3774         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3775         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3776         row     -= 5;
3777         break;
3778       default:
3779         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3780       }
3781     }
3782     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3783 
3784     /*
3785            t = b - D x    where D is the block diagonal
3786     */
3787     cnt = 0;
3788     for (i=0, row=0; i<m; i++) {
3789       switch (sizes[i]) {
3790       case 1:
3791         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3792         break;
3793       case 2:
3794         x1       = x[row]; x2 = x[row+1];
3795         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3796         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3797         t[row]   = b[row] - tmp1;
3798         t[row+1] = b[row+1] - tmp2; row += 2;
3799         cnt     += 4;
3800         break;
3801       case 3:
3802         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3803         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3804         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3805         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3806         t[row]   = b[row] - tmp1;
3807         t[row+1] = b[row+1] - tmp2;
3808         t[row+2] = b[row+2] - tmp3; row += 3;
3809         cnt     += 9;
3810         break;
3811       case 4:
3812         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3813         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3814         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3815         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3816         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3817         t[row]   = b[row] - tmp1;
3818         t[row+1] = b[row+1] - tmp2;
3819         t[row+2] = b[row+2] - tmp3;
3820         t[row+3] = b[row+3] - tmp4; row += 4;
3821         cnt     += 16;
3822         break;
3823       case 5:
3824         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3825         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3826         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3827         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3828         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3829         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3830         t[row]   = b[row] - tmp1;
3831         t[row+1] = b[row+1] - tmp2;
3832         t[row+2] = b[row+2] - tmp3;
3833         t[row+3] = b[row+3] - tmp4;
3834         t[row+4] = b[row+4] - tmp5;row += 5;
3835         cnt     += 25;
3836         break;
3837       default:
3838         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3839       }
3840     }
3841     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3842 
3843 
3844 
3845     /*
3846           Apply (L + D)^-1 where D is the block diagonal
3847     */
3848     for (i=0, row=0; i<m; i++) {
3849       sz  = diag[row] - ii[row];
3850       v1  = a->a + ii[row];
3851       idx = a->j + ii[row];
3852       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3853       switch (sizes[i]) {
3854       case 1:
3855 
3856         sum1 = t[row];
3857         for (n = 0; n<sz-1; n+=2) {
3858           i1    = idx[0];
3859           i2    = idx[1];
3860           idx  += 2;
3861           tmp0  = t[i1];
3862           tmp1  = t[i2];
3863           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3864         }
3865 
3866         if (n == sz-1) {
3867           tmp0  = t[*idx];
3868           sum1 -= *v1 * tmp0;
3869         }
3870         x[row] += t[row] = sum1*(*ibdiag++); row++;
3871         break;
3872       case 2:
3873         v2   = a->a + ii[row+1];
3874         sum1 = t[row];
3875         sum2 = t[row+1];
3876         for (n = 0; n<sz-1; n+=2) {
3877           i1    = idx[0];
3878           i2    = idx[1];
3879           idx  += 2;
3880           tmp0  = t[i1];
3881           tmp1  = t[i2];
3882           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3883           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3884         }
3885 
3886         if (n == sz-1) {
3887           tmp0  = t[*idx];
3888           sum1 -= v1[0] * tmp0;
3889           sum2 -= v2[0] * tmp0;
3890         }
3891         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3892         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3893         ibdiag   += 4; row += 2;
3894         break;
3895       case 3:
3896         v2   = a->a + ii[row+1];
3897         v3   = a->a + ii[row+2];
3898         sum1 = t[row];
3899         sum2 = t[row+1];
3900         sum3 = t[row+2];
3901         for (n = 0; n<sz-1; n+=2) {
3902           i1    = idx[0];
3903           i2    = idx[1];
3904           idx  += 2;
3905           tmp0  = t[i1];
3906           tmp1  = t[i2];
3907           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3908           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3909           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3910         }
3911 
3912         if (n == sz-1) {
3913           tmp0  = t[*idx];
3914           sum1 -= v1[0] * tmp0;
3915           sum2 -= v2[0] * tmp0;
3916           sum3 -= v3[0] * tmp0;
3917         }
3918         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3919         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3920         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3921         ibdiag   += 9; row += 3;
3922         break;
3923       case 4:
3924         v2   = a->a + ii[row+1];
3925         v3   = a->a + ii[row+2];
3926         v4   = a->a + ii[row+3];
3927         sum1 = t[row];
3928         sum2 = t[row+1];
3929         sum3 = t[row+2];
3930         sum4 = t[row+3];
3931         for (n = 0; n<sz-1; n+=2) {
3932           i1    = idx[0];
3933           i2    = idx[1];
3934           idx  += 2;
3935           tmp0  = t[i1];
3936           tmp1  = t[i2];
3937           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3938           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3939           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3940           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3941         }
3942 
3943         if (n == sz-1) {
3944           tmp0  = t[*idx];
3945           sum1 -= v1[0] * tmp0;
3946           sum2 -= v2[0] * tmp0;
3947           sum3 -= v3[0] * tmp0;
3948           sum4 -= v4[0] * tmp0;
3949         }
3950         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3951         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3952         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3953         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3954         ibdiag   += 16; row += 4;
3955         break;
3956       case 5:
3957         v2   = a->a + ii[row+1];
3958         v3   = a->a + ii[row+2];
3959         v4   = a->a + ii[row+3];
3960         v5   = a->a + ii[row+4];
3961         sum1 = t[row];
3962         sum2 = t[row+1];
3963         sum3 = t[row+2];
3964         sum4 = t[row+3];
3965         sum5 = t[row+4];
3966         for (n = 0; n<sz-1; n+=2) {
3967           i1    = idx[0];
3968           i2    = idx[1];
3969           idx  += 2;
3970           tmp0  = t[i1];
3971           tmp1  = t[i2];
3972           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3973           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3974           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3975           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3976           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3977         }
3978 
3979         if (n == sz-1) {
3980           tmp0  = t[*idx];
3981           sum1 -= v1[0] * tmp0;
3982           sum2 -= v2[0] * tmp0;
3983           sum3 -= v3[0] * tmp0;
3984           sum4 -= v4[0] * tmp0;
3985           sum5 -= v5[0] * tmp0;
3986         }
3987         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3988         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3989         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3990         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3991         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3992         ibdiag   += 25; row += 5;
3993         break;
3994       default:
3995         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3996       }
3997     }
3998     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3999   }
4000   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4001   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4002   PetscFunctionReturn(0);
4003 }
4004 
4005 #undef __FUNCT__
4006 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
4007 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4008 {
4009   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4010   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4011   const MatScalar   *bdiag = a->inode.bdiag;
4012   const PetscScalar *b;
4013   PetscErrorCode    ierr;
4014   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4015   const PetscInt    *sizes = a->inode.size;
4016 
4017   PetscFunctionBegin;
4018   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4019   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4020   cnt  = 0;
4021   for (i=0, row=0; i<m; i++) {
4022     switch (sizes[i]) {
4023     case 1:
4024       x[row] = b[row]*bdiag[cnt++];row++;
4025       break;
4026     case 2:
4027       x1       = b[row]; x2 = b[row+1];
4028       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4029       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4030       x[row++] = tmp1;
4031       x[row++] = tmp2;
4032       cnt     += 4;
4033       break;
4034     case 3:
4035       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4036       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4037       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4038       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4039       x[row++] = tmp1;
4040       x[row++] = tmp2;
4041       x[row++] = tmp3;
4042       cnt     += 9;
4043       break;
4044     case 4:
4045       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4046       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4047       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4048       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4049       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4050       x[row++] = tmp1;
4051       x[row++] = tmp2;
4052       x[row++] = tmp3;
4053       x[row++] = tmp4;
4054       cnt     += 16;
4055       break;
4056     case 5:
4057       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4058       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4059       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4060       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4061       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4062       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4063       x[row++] = tmp1;
4064       x[row++] = tmp2;
4065       x[row++] = tmp3;
4066       x[row++] = tmp4;
4067       x[row++] = tmp5;
4068       cnt     += 25;
4069       break;
4070     default:
4071       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4072     }
4073   }
4074   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4075   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4076   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4077   PetscFunctionReturn(0);
4078 }
4079 
4080 /*
4081     samestructure indicates that the matrix has not changed its nonzero structure so we
4082     do not need to recompute the inodes
4083 */
4084 #undef __FUNCT__
4085 #define __FUNCT__ "MatSeqAIJCheckInode"
4086 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4087 {
4088   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4089   PetscErrorCode ierr;
4090   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4091   PetscBool      flag;
4092   const PetscInt *idx,*idy,*ii;
4093 
4094   PetscFunctionBegin;
4095   if (!a->inode.use) PetscFunctionReturn(0);
4096   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4097 
4098   m = A->rmap->n;
4099   if (a->inode.size) ns = a->inode.size;
4100   else {
4101     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4102   }
4103 
4104   i          = 0;
4105   node_count = 0;
4106   idx        = a->j;
4107   ii         = a->i;
4108   while (i < m) {                /* For each row */
4109     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4110     /* Limits the number of elements in a node to 'a->inode.limit' */
4111     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4112       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4113       if (nzy != nzx) break;
4114       idy += nzx;              /* Same nonzero pattern */
4115       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4116       if (!flag) break;
4117     }
4118     ns[node_count++] = blk_size;
4119     idx             += blk_size*nzx;
4120     i                = j;
4121   }
4122   /* If not enough inodes found,, do not use inode version of the routines */
4123   if (!m || node_count > .8*m) {
4124     ierr = PetscFree(ns);CHKERRQ(ierr);
4125 
4126     a->inode.node_count       = 0;
4127     a->inode.size             = NULL;
4128     a->inode.use              = PETSC_FALSE;
4129     A->ops->mult              = MatMult_SeqAIJ;
4130     A->ops->sor               = MatSOR_SeqAIJ;
4131     A->ops->multadd           = MatMultAdd_SeqAIJ;
4132     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4133     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4134     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4135     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4136     A->ops->coloringpatch     = 0;
4137     A->ops->multdiagonalblock = 0;
4138 
4139     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4140   } else {
4141     if (!A->factortype) {
4142       A->ops->mult              = MatMult_SeqAIJ_Inode;
4143       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4144       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4145       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4146       if (A->rmap->n == A->cmap->n) {
4147         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4148         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4149         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4150         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4151         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4152       }
4153     } else {
4154       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4155     }
4156     a->inode.node_count = node_count;
4157     a->inode.size       = ns;
4158     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4159   }
4160   a->inode.checked          = PETSC_TRUE;
4161   a->inode.mat_nonzerostate = A->nonzerostate;
4162   PetscFunctionReturn(0);
4163 }
4164 
4165 #undef __FUNCT__
4166 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4167 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4168 {
4169   Mat            B =*C;
4170   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4171   PetscErrorCode ierr;
4172   PetscInt       m=A->rmap->n;
4173 
4174   PetscFunctionBegin;
4175   c->inode.use       = a->inode.use;
4176   c->inode.limit     = a->inode.limit;
4177   c->inode.max_limit = a->inode.max_limit;
4178   if (a->inode.size) {
4179     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4180     c->inode.node_count = a->inode.node_count;
4181     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4182     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4183     if (!B->factortype) {
4184       B->ops->mult              = MatMult_SeqAIJ_Inode;
4185       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4186       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4187       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4188       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4189       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4190       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4191       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4192       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4193     } else {
4194       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4195     }
4196   } else {
4197     c->inode.size       = 0;
4198     c->inode.node_count = 0;
4199   }
4200   c->inode.ibdiagvalid = PETSC_FALSE;
4201   c->inode.ibdiag      = 0;
4202   c->inode.bdiag       = 0;
4203   PetscFunctionReturn(0);
4204 }
4205 
4206 #undef __FUNCT__
4207 #define __FUNCT__ "MatGetRow_FactoredLU"
4208 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4209 {
4210   PetscInt       k;
4211   const PetscInt *vi;
4212 
4213   PetscFunctionBegin;
4214   vi = aj + ai[row];
4215   for (k=0; k<nzl; k++) cols[k] = vi[k];
4216   vi        = aj + adiag[row];
4217   cols[nzl] = vi[0];
4218   vi        = aj + adiag[row+1]+1;
4219   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4220   PetscFunctionReturn(0);
4221 }
4222 /*
4223    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4224    Modified from MatSeqAIJCheckInode().
4225 
4226    Input Parameters:
4227 .  Mat A - ILU or LU matrix factor
4228 
4229 */
4230 #undef __FUNCT__
4231 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU"
4232 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4233 {
4234   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4235   PetscErrorCode ierr;
4236   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4237   PetscInt       *cols1,*cols2,*ns;
4238   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4239   PetscBool      flag;
4240 
4241   PetscFunctionBegin;
4242   if (!a->inode.use)    PetscFunctionReturn(0);
4243   if (a->inode.checked) PetscFunctionReturn(0);
4244 
4245   m = A->rmap->n;
4246   if (a->inode.size) ns = a->inode.size;
4247   else {
4248     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4249   }
4250 
4251   i          = 0;
4252   node_count = 0;
4253   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4254   while (i < m) {                /* For each row */
4255     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4256     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4257     nzx  = nzl1 + nzu1 + 1;
4258     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4259 
4260     /* Limits the number of elements in a node to 'a->inode.limit' */
4261     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4262       nzl2 = ai[j+1] - ai[j];
4263       nzu2 = adiag[j] - adiag[j+1] - 1;
4264       nzy  = nzl2 + nzu2 + 1;
4265       if (nzy != nzx) break;
4266       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4267       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4268       if (!flag) break;
4269     }
4270     ns[node_count++] = blk_size;
4271     i                = j;
4272   }
4273   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4274   /* If not enough inodes found,, do not use inode version of the routines */
4275   if (!m || node_count > .8*m) {
4276     ierr = PetscFree(ns);CHKERRQ(ierr);
4277 
4278     a->inode.node_count = 0;
4279     a->inode.size       = NULL;
4280     a->inode.use        = PETSC_FALSE;
4281 
4282     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4283   } else {
4284     A->ops->mult              = 0;
4285     A->ops->sor               = 0;
4286     A->ops->multadd           = 0;
4287     A->ops->getrowij          = 0;
4288     A->ops->restorerowij      = 0;
4289     A->ops->getcolumnij       = 0;
4290     A->ops->restorecolumnij   = 0;
4291     A->ops->coloringpatch     = 0;
4292     A->ops->multdiagonalblock = 0;
4293     a->inode.node_count       = node_count;
4294     a->inode.size             = ns;
4295 
4296     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4297   }
4298   a->inode.checked = PETSC_TRUE;
4299   PetscFunctionReturn(0);
4300 }
4301 
4302 #undef __FUNCT__
4303 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4304 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4305 {
4306   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4307 
4308   PetscFunctionBegin;
4309   a->inode.ibdiagvalid = PETSC_FALSE;
4310   PetscFunctionReturn(0);
4311 }
4312 
4313 /*
4314      This is really ugly. if inodes are used this replaces the
4315   permutations with ones that correspond to rows/cols of the matrix
4316   rather then inode blocks
4317 */
4318 #undef __FUNCT__
4319 #define __FUNCT__ "MatInodeAdjustForInodes"
4320 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4321 {
4322   PetscErrorCode ierr;
4323 
4324   PetscFunctionBegin;
4325   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4326   PetscFunctionReturn(0);
4327 }
4328 
4329 #undef __FUNCT__
4330 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4331 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4332 {
4333   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4334   PetscErrorCode ierr;
4335   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4336   const PetscInt *ridx,*cidx;
4337   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4338   PetscInt       nslim_col,*ns_col;
4339   IS             ris = *rperm,cis = *cperm;
4340 
4341   PetscFunctionBegin;
4342   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4343   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4344 
4345   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4346   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4347   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4348 
4349   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4350   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4351 
4352   /* Form the inode structure for the rows of permuted matric using inv perm*/
4353   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4354 
4355   /* Construct the permutations for rows*/
4356   for (i=0,row = 0; i<nslim_row; ++i) {
4357     indx      = ridx[i];
4358     start_val = tns[indx];
4359     end_val   = tns[indx + 1];
4360     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4361   }
4362 
4363   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4364   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4365 
4366   /* Construct permutations for columns */
4367   for (i=0,col=0; i<nslim_col; ++i) {
4368     indx      = cidx[i];
4369     start_val = tns[indx];
4370     end_val   = tns[indx + 1];
4371     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4372   }
4373 
4374   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4375   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4376   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4377   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4378 
4379   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4380   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4381 
4382   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4383   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4384   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4385   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4386   ierr = PetscFree(tns);CHKERRQ(ierr);
4387   PetscFunctionReturn(0);
4388 }
4389 
4390 #undef __FUNCT__
4391 #define __FUNCT__ "MatInodeGetInodeSizes"
4392 /*@C
4393    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4394 
4395    Not Collective
4396 
4397    Input Parameter:
4398 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4399 
4400    Output Parameter:
4401 +  node_count - no of inodes present in the matrix.
4402 .  sizes      - an array of size node_count,with sizes of each inode.
4403 -  limit      - the max size used to generate the inodes.
4404 
4405    Level: advanced
4406 
4407    Notes: This routine returns some internal storage information
4408    of the matrix, it is intended to be used by advanced users.
4409    It should be called after the matrix is assembled.
4410    The contents of the sizes[] array should not be changed.
4411    NULL may be passed for information not requested.
4412 
4413 .keywords: matrix, seqaij, get, inode
4414 
4415 .seealso: MatGetInfo()
4416 @*/
4417 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4418 {
4419   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4420 
4421   PetscFunctionBegin;
4422   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4423   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4424   if (f) {
4425     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4426   }
4427   PetscFunctionReturn(0);
4428 }
4429 
4430 #undef __FUNCT__
4431 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4432 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4433 {
4434   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4435 
4436   PetscFunctionBegin;
4437   if (node_count) *node_count = a->inode.node_count;
4438   if (sizes)      *sizes      = a->inode.size;
4439   if (limit)      *limit      = a->inode.limit;
4440   PetscFunctionReturn(0);
4441 }
4442