xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 8cc058d9cd56c1ccb3be12a47760ddfc446aaffc)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
67 
68   PetscFunctionBegin;
69   nslim_row = a->inode.node_count;
70   m         = A->rmap->n;
71   n         = A->cmap->n;
72   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
73 
74   /* Use the row_inode as column_inode */
75   nslim_col = nslim_row;
76   ns_col    = ns_row;
77 
78   /* allocate space for reformated inode structure */
79   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
80   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
81   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
82 
83   for (i1=0,col=0; i1<nslim_col; ++i1) {
84     nsz = ns_col[i1];
85     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
86   }
87   /* allocate space for row pointers */
88   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
89   *iia = ia;
90   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
91   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
92 
93   /* determine the number of columns in each row */
94   ia[0] = oshift;
95   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
96 
97     j    = aj + ai[row] + ishift;
98     jmax = aj + ai[row+1] + ishift;
99     col  = *j++ + ishift;
100     i2   = tvc[col];
101     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
102       ia[i1+1]++;
103       ia[i2+1]++;
104       i2++;                     /* Start col of next node */
105       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
106       i2 = tvc[col];
107     }
108     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
109   }
110 
111   /* shift ia[i] to point to next row */
112   for (i1=1; i1<nslim_row+1; i1++) {
113     row        = ia[i1-1];
114     ia[i1]    += row;
115     work[i1-1] = row - oshift;
116   }
117 
118   /* allocate space for column pointers */
119   nz   = ia[nslim_row] + (!ishift);
120   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
121   *jja = ja;
122 
123   /* loop over lower triangular part putting into ja */
124   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
125     j    = aj + ai[row] + ishift;
126     jmax = aj + ai[row+1] + ishift;
127     col  = *j++ + ishift;
128     i2   = tvc[col];
129     while (i2<i1 && j<jmax) {
130       ja[work[i2]++] = i1 + oshift;
131       ja[work[i1]++] = i2 + oshift;
132       ++i2;
133       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
134       i2 = tvc[col];
135     }
136     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
137 
138   }
139   ierr = PetscFree(work);CHKERRQ(ierr);
140   ierr = PetscFree(tns);CHKERRQ(ierr);
141   ierr = PetscFree(tvc);CHKERRQ(ierr);
142   PetscFunctionReturn(0);
143 }
144 
145 /*
146       This builds nonsymmetric version of nonzero structure,
147 */
148 #undef __FUNCT__
149 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
150 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
151 {
152   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
153   PetscErrorCode ierr;
154   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
155   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
156 
157   PetscFunctionBegin;
158   nslim_row = a->inode.node_count;
159   n         = A->cmap->n;
160 
161   /* Create The column_inode for this matrix */
162   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
163 
164   /* allocate space for reformated column_inode structure */
165   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
166   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
167   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
168 
169   for (i1=0,col=0; i1<nslim_col; ++i1) {
170     nsz = ns_col[i1];
171     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
172   }
173   /* allocate space for row pointers */
174   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
175   *iia = ia;
176   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
177   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
178 
179   /* determine the number of columns in each row */
180   ia[0] = oshift;
181   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
182     j   = aj + ai[row] + ishift;
183     col = *j++ + ishift;
184     i2  = tvc[col];
185     nz  = ai[row+1] - ai[row];
186     while (nz-- > 0) {           /* off-diagonal elemets */
187       ia[i1+1]++;
188       i2++;                     /* Start col of next node */
189       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) nz--;
190       if (nz > 0) i2 = tvc[col];
191     }
192   }
193 
194   /* shift ia[i] to point to next row */
195   for (i1=1; i1<nslim_row+1; i1++) {
196     row        = ia[i1-1];
197     ia[i1]    += row;
198     work[i1-1] = row - oshift;
199   }
200 
201   /* allocate space for column pointers */
202   nz   = ia[nslim_row] + (!ishift);
203   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
204   *jja = ja;
205 
206   /* loop over matrix putting into ja */
207   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
208     j   = aj + ai[row] + ishift;
209     col = *j++ + ishift;
210     i2  = tvc[col];
211     nz  = ai[row+1] - ai[row];
212     while (nz-- > 0) {
213       ja[work[i1]++] = i2 + oshift;
214       ++i2;
215       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) nz--;
216       if (nz > 0) i2 = tvc[col];
217     }
218   }
219   ierr = PetscFree(ns_col);CHKERRQ(ierr);
220   ierr = PetscFree(work);CHKERRQ(ierr);
221   ierr = PetscFree(tns);CHKERRQ(ierr);
222   ierr = PetscFree(tvc);CHKERRQ(ierr);
223   PetscFunctionReturn(0);
224 }
225 
226 #undef __FUNCT__
227 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
228 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
229 {
230   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
231   PetscErrorCode ierr;
232 
233   PetscFunctionBegin;
234   *n = a->inode.node_count;
235   if (!ia) PetscFunctionReturn(0);
236   if (!blockcompressed) {
237     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
238   } else if (symmetric) {
239     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
240   } else {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   }
243   PetscFunctionReturn(0);
244 }
245 
246 #undef __FUNCT__
247 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
248 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
249 {
250   PetscErrorCode ierr;
251 
252   PetscFunctionBegin;
253   if (!ia) PetscFunctionReturn(0);
254 
255   if (!blockcompressed) {
256     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
257   } else {
258     ierr = PetscFree(*ia);CHKERRQ(ierr);
259     ierr = PetscFree(*ja);CHKERRQ(ierr);
260   }
261   PetscFunctionReturn(0);
262 }
263 
264 /* ----------------------------------------------------------- */
265 
266 #undef __FUNCT__
267 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
268 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
269 {
270   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
271   PetscErrorCode ierr;
272   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
273   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
274 
275   PetscFunctionBegin;
276   nslim_row = a->inode.node_count;
277   n         = A->cmap->n;
278 
279   /* Create The column_inode for this matrix */
280   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
281 
282   /* allocate space for reformated column_inode structure */
283   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
284   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
285   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
286 
287   for (i1=0,col=0; i1<nslim_col; ++i1) {
288     nsz = ns_col[i1];
289     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
290   }
291   /* allocate space for column pointers */
292   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
293   *iia = ia;
294   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
295   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
296 
297   /* determine the number of columns in each row */
298   ia[0] = oshift;
299   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
300     j   = aj + ai[row] + ishift;
301     col = *j++ + ishift;
302     i2  = tvc[col];
303     nz  = ai[row+1] - ai[row];
304     while (nz-- > 0) {           /* off-diagonal elemets */
305       /* ia[i1+1]++; */
306       ia[i2+1]++;
307       i2++;
308       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) nz--;
309       if (nz > 0) i2 = tvc[col];
310     }
311   }
312 
313   /* shift ia[i] to point to next col */
314   for (i1=1; i1<nslim_col+1; i1++) {
315     col        = ia[i1-1];
316     ia[i1]    += col;
317     work[i1-1] = col - oshift;
318   }
319 
320   /* allocate space for column pointers */
321   nz   = ia[nslim_col] + (!ishift);
322   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
323   *jja = ja;
324 
325   /* loop over matrix putting into ja */
326   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
327     j   = aj + ai[row] + ishift;
328     col = *j++ + ishift;
329     i2  = tvc[col];
330     nz  = ai[row+1] - ai[row];
331     while (nz-- > 0) {
332       /* ja[work[i1]++] = i2 + oshift; */
333       ja[work[i2]++] = i1 + oshift;
334       i2++;
335       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) nz--;
336       if (nz > 0) i2 = tvc[col];
337     }
338   }
339   ierr = PetscFree(ns_col);CHKERRQ(ierr);
340   ierr = PetscFree(work);CHKERRQ(ierr);
341   ierr = PetscFree(tns);CHKERRQ(ierr);
342   ierr = PetscFree(tvc);CHKERRQ(ierr);
343   PetscFunctionReturn(0);
344 }
345 
346 #undef __FUNCT__
347 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
348 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
349 {
350   PetscErrorCode ierr;
351 
352   PetscFunctionBegin;
353   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
354   if (!ia) PetscFunctionReturn(0);
355 
356   if (!blockcompressed) {
357     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
358   } else if (symmetric) {
359     /* Since the indices are symmetric it does'nt matter */
360     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
361   } else {
362     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   }
364   PetscFunctionReturn(0);
365 }
366 
367 #undef __FUNCT__
368 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
369 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
370 {
371   PetscErrorCode ierr;
372 
373   PetscFunctionBegin;
374   if (!ia) PetscFunctionReturn(0);
375   if (!blockcompressed) {
376     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
377   } else {
378     ierr = PetscFree(*ia);CHKERRQ(ierr);
379     ierr = PetscFree(*ja);CHKERRQ(ierr);
380   }
381   PetscFunctionReturn(0);
382 }
383 
384 /* ----------------------------------------------------------- */
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
388 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
389 {
390   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
391   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
392   PetscScalar       *y;
393   const PetscScalar *x;
394   const MatScalar   *v1,*v2,*v3,*v4,*v5;
395   PetscErrorCode    ierr;
396   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
397 
398 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
399 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
400 #endif
401 
402   PetscFunctionBegin;
403   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
404   node_max = a->inode.node_count;
405   ns       = a->inode.size;     /* Node Size array */
406   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
407   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
408   idx      = a->j;
409   v1       = a->a;
410   ii       = a->i;
411 
412   for (i = 0,row = 0; i< node_max; ++i) {
413     nsz         = ns[i];
414     n           = ii[1] - ii[0];
415     nonzerorow += (n>0)*nsz;
416     ii         += nsz;
417     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
418     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
419     sz = n;                     /* No of non zeros in this row */
420                                 /* Switch on the size of Node */
421     switch (nsz) {               /* Each loop in 'case' is unrolled */
422     case 1:
423       sum1 = 0.;
424 
425       for (n = 0; n< sz-1; n+=2) {
426         i1    = idx[0];         /* The instructions are ordered to */
427         i2    = idx[1];         /* make the compiler's job easy */
428         idx  += 2;
429         tmp0  = x[i1];
430         tmp1  = x[i2];
431         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
432       }
433 
434       if (n == sz-1) {          /* Take care of the last nonzero  */
435         tmp0  = x[*idx++];
436         sum1 += *v1++ *tmp0;
437       }
438       y[row++]=sum1;
439       break;
440     case 2:
441       sum1 = 0.;
442       sum2 = 0.;
443       v2   = v1 + n;
444 
445       for (n = 0; n< sz-1; n+=2) {
446         i1    = idx[0];
447         i2    = idx[1];
448         idx  += 2;
449         tmp0  = x[i1];
450         tmp1  = x[i2];
451         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
452         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
453       }
454       if (n == sz-1) {
455         tmp0  = x[*idx++];
456         sum1 += *v1++ * tmp0;
457         sum2 += *v2++ * tmp0;
458       }
459       y[row++]=sum1;
460       y[row++]=sum2;
461       v1      =v2;              /* Since the next block to be processed starts there*/
462       idx    +=sz;
463       break;
464     case 3:
465       sum1 = 0.;
466       sum2 = 0.;
467       sum3 = 0.;
468       v2   = v1 + n;
469       v3   = v2 + n;
470 
471       for (n = 0; n< sz-1; n+=2) {
472         i1    = idx[0];
473         i2    = idx[1];
474         idx  += 2;
475         tmp0  = x[i1];
476         tmp1  = x[i2];
477         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
478         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
479         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
480       }
481       if (n == sz-1) {
482         tmp0  = x[*idx++];
483         sum1 += *v1++ * tmp0;
484         sum2 += *v2++ * tmp0;
485         sum3 += *v3++ * tmp0;
486       }
487       y[row++]=sum1;
488       y[row++]=sum2;
489       y[row++]=sum3;
490       v1      =v3;              /* Since the next block to be processed starts there*/
491       idx    +=2*sz;
492       break;
493     case 4:
494       sum1 = 0.;
495       sum2 = 0.;
496       sum3 = 0.;
497       sum4 = 0.;
498       v2   = v1 + n;
499       v3   = v2 + n;
500       v4   = v3 + n;
501 
502       for (n = 0; n< sz-1; n+=2) {
503         i1    = idx[0];
504         i2    = idx[1];
505         idx  += 2;
506         tmp0  = x[i1];
507         tmp1  = x[i2];
508         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
509         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
510         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
511         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
512       }
513       if (n == sz-1) {
514         tmp0  = x[*idx++];
515         sum1 += *v1++ * tmp0;
516         sum2 += *v2++ * tmp0;
517         sum3 += *v3++ * tmp0;
518         sum4 += *v4++ * tmp0;
519       }
520       y[row++]=sum1;
521       y[row++]=sum2;
522       y[row++]=sum3;
523       y[row++]=sum4;
524       v1      =v4;              /* Since the next block to be processed starts there*/
525       idx    +=3*sz;
526       break;
527     case 5:
528       sum1 = 0.;
529       sum2 = 0.;
530       sum3 = 0.;
531       sum4 = 0.;
532       sum5 = 0.;
533       v2   = v1 + n;
534       v3   = v2 + n;
535       v4   = v3 + n;
536       v5   = v4 + n;
537 
538       for (n = 0; n<sz-1; n+=2) {
539         i1    = idx[0];
540         i2    = idx[1];
541         idx  += 2;
542         tmp0  = x[i1];
543         tmp1  = x[i2];
544         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
545         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
546         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
547         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
548         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
549       }
550       if (n == sz-1) {
551         tmp0  = x[*idx++];
552         sum1 += *v1++ * tmp0;
553         sum2 += *v2++ * tmp0;
554         sum3 += *v3++ * tmp0;
555         sum4 += *v4++ * tmp0;
556         sum5 += *v5++ * tmp0;
557       }
558       y[row++]=sum1;
559       y[row++]=sum2;
560       y[row++]=sum3;
561       y[row++]=sum4;
562       y[row++]=sum5;
563       v1      =v5;       /* Since the next block to be processed starts there */
564       idx    +=4*sz;
565       break;
566     default:
567       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
568     }
569   }
570   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
571   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
572   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
573   PetscFunctionReturn(0);
574 }
575 /* ----------------------------------------------------------- */
576 /* Almost same code as the MatMult_SeqAIJ_Inode() */
577 #undef __FUNCT__
578 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
579 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
580 {
581   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
582   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
583   MatScalar      *v1,*v2,*v3,*v4,*v5;
584   PetscScalar    *x,*y,*z,*zt;
585   PetscErrorCode ierr;
586   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
587 
588   PetscFunctionBegin;
589   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
590   node_max = a->inode.node_count;
591   ns       = a->inode.size;     /* Node Size array */
592 
593   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
594   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
595   if (zz != yy) {
596     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
597   } else {
598     z = y;
599   }
600   zt = z;
601 
602   idx = a->j;
603   v1  = a->a;
604   ii  = a->i;
605 
606   for (i = 0,row = 0; i< node_max; ++i) {
607     nsz = ns[i];
608     n   = ii[1] - ii[0];
609     ii += nsz;
610     sz  = n;                    /* No of non zeros in this row */
611                                 /* Switch on the size of Node */
612     switch (nsz) {               /* Each loop in 'case' is unrolled */
613     case 1:
614       sum1 = *zt++;
615 
616       for (n = 0; n< sz-1; n+=2) {
617         i1    = idx[0];         /* The instructions are ordered to */
618         i2    = idx[1];         /* make the compiler's job easy */
619         idx  += 2;
620         tmp0  = x[i1];
621         tmp1  = x[i2];
622         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623       }
624 
625       if (n   == sz-1) {          /* Take care of the last nonzero  */
626         tmp0  = x[*idx++];
627         sum1 += *v1++ * tmp0;
628       }
629       y[row++]=sum1;
630       break;
631     case 2:
632       sum1 = *zt++;
633       sum2 = *zt++;
634       v2   = v1 + n;
635 
636       for (n = 0; n< sz-1; n+=2) {
637         i1    = idx[0];
638         i2    = idx[1];
639         idx  += 2;
640         tmp0  = x[i1];
641         tmp1  = x[i2];
642         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644       }
645       if (n   == sz-1) {
646         tmp0  = x[*idx++];
647         sum1 += *v1++ * tmp0;
648         sum2 += *v2++ * tmp0;
649       }
650       y[row++]=sum1;
651       y[row++]=sum2;
652       v1      =v2;              /* Since the next block to be processed starts there*/
653       idx    +=sz;
654       break;
655     case 3:
656       sum1 = *zt++;
657       sum2 = *zt++;
658       sum3 = *zt++;
659       v2   = v1 + n;
660       v3   = v2 + n;
661 
662       for (n = 0; n< sz-1; n+=2) {
663         i1    = idx[0];
664         i2    = idx[1];
665         idx  += 2;
666         tmp0  = x[i1];
667         tmp1  = x[i2];
668         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671       }
672       if (n == sz-1) {
673         tmp0  = x[*idx++];
674         sum1 += *v1++ * tmp0;
675         sum2 += *v2++ * tmp0;
676         sum3 += *v3++ * tmp0;
677       }
678       y[row++]=sum1;
679       y[row++]=sum2;
680       y[row++]=sum3;
681       v1      =v3;              /* Since the next block to be processed starts there*/
682       idx    +=2*sz;
683       break;
684     case 4:
685       sum1 = *zt++;
686       sum2 = *zt++;
687       sum3 = *zt++;
688       sum4 = *zt++;
689       v2   = v1 + n;
690       v3   = v2 + n;
691       v4   = v3 + n;
692 
693       for (n = 0; n< sz-1; n+=2) {
694         i1    = idx[0];
695         i2    = idx[1];
696         idx  += 2;
697         tmp0  = x[i1];
698         tmp1  = x[i2];
699         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703       }
704       if (n == sz-1) {
705         tmp0  = x[*idx++];
706         sum1 += *v1++ * tmp0;
707         sum2 += *v2++ * tmp0;
708         sum3 += *v3++ * tmp0;
709         sum4 += *v4++ * tmp0;
710       }
711       y[row++]=sum1;
712       y[row++]=sum2;
713       y[row++]=sum3;
714       y[row++]=sum4;
715       v1      =v4;              /* Since the next block to be processed starts there*/
716       idx    +=3*sz;
717       break;
718     case 5:
719       sum1 = *zt++;
720       sum2 = *zt++;
721       sum3 = *zt++;
722       sum4 = *zt++;
723       sum5 = *zt++;
724       v2   = v1 + n;
725       v3   = v2 + n;
726       v4   = v3 + n;
727       v5   = v4 + n;
728 
729       for (n = 0; n<sz-1; n+=2) {
730         i1    = idx[0];
731         i2    = idx[1];
732         idx  += 2;
733         tmp0  = x[i1];
734         tmp1  = x[i2];
735         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740       }
741       if (n   == sz-1) {
742         tmp0  = x[*idx++];
743         sum1 += *v1++ * tmp0;
744         sum2 += *v2++ * tmp0;
745         sum3 += *v3++ * tmp0;
746         sum4 += *v4++ * tmp0;
747         sum5 += *v5++ * tmp0;
748       }
749       y[row++]=sum1;
750       y[row++]=sum2;
751       y[row++]=sum3;
752       y[row++]=sum4;
753       y[row++]=sum5;
754       v1      =v5;       /* Since the next block to be processed starts there */
755       idx    +=4*sz;
756       break;
757     default:
758       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759     }
760   }
761   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
762   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
763   if (zz != yy) {
764     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
765   }
766   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
767   PetscFunctionReturn(0);
768 }
769 
770 /* ----------------------------------------------------------- */
771 #undef __FUNCT__
772 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
773 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
774 {
775   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
776   IS                iscol = a->col,isrow = a->row;
777   PetscErrorCode    ierr;
778   const PetscInt    *r,*c,*rout,*cout;
779   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
780   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
781   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
782   PetscScalar       sum1,sum2,sum3,sum4,sum5;
783   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
784   const PetscScalar *b;
785 
786   PetscFunctionBegin;
787   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
788   node_max = a->inode.node_count;
789   ns       = a->inode.size;     /* Node Size array */
790 
791   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
792   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
793   tmp  = a->solve_work;
794 
795   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
796   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
797 
798   /* forward solve the lower triangular */
799   tmps = tmp;
800   aa   = a_a;
801   aj   = a_j;
802   ad   = a->diag;
803 
804   for (i = 0,row = 0; i< node_max; ++i) {
805     nsz = ns[i];
806     aii = ai[row];
807     v1  = aa + aii;
808     vi  = aj + aii;
809     nz  = ad[row]- aii;
810     if (i < node_max-1) {
811       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
812       * but our indexing to determine it's size could. */
813       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
814       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
815       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
816       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
817     }
818 
819     switch (nsz) {               /* Each loop in 'case' is unrolled */
820     case 1:
821       sum1 = b[*r++];
822       for (j=0; j<nz-1; j+=2) {
823         i0    = vi[0];
824         i1    = vi[1];
825         vi   +=2;
826         tmp0  = tmps[i0];
827         tmp1  = tmps[i1];
828         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
829       }
830       if (j == nz-1) {
831         tmp0  = tmps[*vi++];
832         sum1 -= *v1++ *tmp0;
833       }
834       tmp[row++]=sum1;
835       break;
836     case 2:
837       sum1 = b[*r++];
838       sum2 = b[*r++];
839       v2   = aa + ai[row+1];
840 
841       for (j=0; j<nz-1; j+=2) {
842         i0    = vi[0];
843         i1    = vi[1];
844         vi   +=2;
845         tmp0  = tmps[i0];
846         tmp1  = tmps[i1];
847         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
848         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
849       }
850       if (j == nz-1) {
851         tmp0  = tmps[*vi++];
852         sum1 -= *v1++ *tmp0;
853         sum2 -= *v2++ *tmp0;
854       }
855       sum2     -= *v2++ *sum1;
856       tmp[row++]=sum1;
857       tmp[row++]=sum2;
858       break;
859     case 3:
860       sum1 = b[*r++];
861       sum2 = b[*r++];
862       sum3 = b[*r++];
863       v2   = aa + ai[row+1];
864       v3   = aa + ai[row+2];
865 
866       for (j=0; j<nz-1; j+=2) {
867         i0    = vi[0];
868         i1    = vi[1];
869         vi   +=2;
870         tmp0  = tmps[i0];
871         tmp1  = tmps[i1];
872         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
873         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
874         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
875       }
876       if (j == nz-1) {
877         tmp0  = tmps[*vi++];
878         sum1 -= *v1++ *tmp0;
879         sum2 -= *v2++ *tmp0;
880         sum3 -= *v3++ *tmp0;
881       }
882       sum2 -= *v2++ * sum1;
883       sum3 -= *v3++ * sum1;
884       sum3 -= *v3++ * sum2;
885 
886       tmp[row++]=sum1;
887       tmp[row++]=sum2;
888       tmp[row++]=sum3;
889       break;
890 
891     case 4:
892       sum1 = b[*r++];
893       sum2 = b[*r++];
894       sum3 = b[*r++];
895       sum4 = b[*r++];
896       v2   = aa + ai[row+1];
897       v3   = aa + ai[row+2];
898       v4   = aa + ai[row+3];
899 
900       for (j=0; j<nz-1; j+=2) {
901         i0    = vi[0];
902         i1    = vi[1];
903         vi   +=2;
904         tmp0  = tmps[i0];
905         tmp1  = tmps[i1];
906         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
907         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
908         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
909         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
910       }
911       if (j == nz-1) {
912         tmp0  = tmps[*vi++];
913         sum1 -= *v1++ *tmp0;
914         sum2 -= *v2++ *tmp0;
915         sum3 -= *v3++ *tmp0;
916         sum4 -= *v4++ *tmp0;
917       }
918       sum2 -= *v2++ * sum1;
919       sum3 -= *v3++ * sum1;
920       sum4 -= *v4++ * sum1;
921       sum3 -= *v3++ * sum2;
922       sum4 -= *v4++ * sum2;
923       sum4 -= *v4++ * sum3;
924 
925       tmp[row++]=sum1;
926       tmp[row++]=sum2;
927       tmp[row++]=sum3;
928       tmp[row++]=sum4;
929       break;
930     case 5:
931       sum1 = b[*r++];
932       sum2 = b[*r++];
933       sum3 = b[*r++];
934       sum4 = b[*r++];
935       sum5 = b[*r++];
936       v2   = aa + ai[row+1];
937       v3   = aa + ai[row+2];
938       v4   = aa + ai[row+3];
939       v5   = aa + ai[row+4];
940 
941       for (j=0; j<nz-1; j+=2) {
942         i0    = vi[0];
943         i1    = vi[1];
944         vi   +=2;
945         tmp0  = tmps[i0];
946         tmp1  = tmps[i1];
947         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
948         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
949         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
950         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
951         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
952       }
953       if (j == nz-1) {
954         tmp0  = tmps[*vi++];
955         sum1 -= *v1++ *tmp0;
956         sum2 -= *v2++ *tmp0;
957         sum3 -= *v3++ *tmp0;
958         sum4 -= *v4++ *tmp0;
959         sum5 -= *v5++ *tmp0;
960       }
961 
962       sum2 -= *v2++ * sum1;
963       sum3 -= *v3++ * sum1;
964       sum4 -= *v4++ * sum1;
965       sum5 -= *v5++ * sum1;
966       sum3 -= *v3++ * sum2;
967       sum4 -= *v4++ * sum2;
968       sum5 -= *v5++ * sum2;
969       sum4 -= *v4++ * sum3;
970       sum5 -= *v5++ * sum3;
971       sum5 -= *v5++ * sum4;
972 
973       tmp[row++]=sum1;
974       tmp[row++]=sum2;
975       tmp[row++]=sum3;
976       tmp[row++]=sum4;
977       tmp[row++]=sum5;
978       break;
979     default:
980       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
981     }
982   }
983   /* backward solve the upper triangular */
984   for (i=node_max -1,row = n-1; i>=0; i--) {
985     nsz = ns[i];
986     aii = ai[row+1] -1;
987     v1  = aa + aii;
988     vi  = aj + aii;
989     nz  = aii- ad[row];
990     switch (nsz) {               /* Each loop in 'case' is unrolled */
991     case 1:
992       sum1 = tmp[row];
993 
994       for (j=nz; j>1; j-=2) {
995         vi   -=2;
996         i0    = vi[2];
997         i1    = vi[1];
998         tmp0  = tmps[i0];
999         tmp1  = tmps[i1];
1000         v1   -= 2;
1001         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1002       }
1003       if (j==1) {
1004         tmp0  = tmps[*vi--];
1005         sum1 -= *v1-- * tmp0;
1006       }
1007       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1008       break;
1009     case 2:
1010       sum1 = tmp[row];
1011       sum2 = tmp[row -1];
1012       v2   = aa + ai[row]-1;
1013       for (j=nz; j>1; j-=2) {
1014         vi   -=2;
1015         i0    = vi[2];
1016         i1    = vi[1];
1017         tmp0  = tmps[i0];
1018         tmp1  = tmps[i1];
1019         v1   -= 2;
1020         v2   -= 2;
1021         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1022         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1023       }
1024       if (j==1) {
1025         tmp0  = tmps[*vi--];
1026         sum1 -= *v1-- * tmp0;
1027         sum2 -= *v2-- * tmp0;
1028       }
1029 
1030       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1031       sum2   -= *v2-- * tmp0;
1032       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1033       break;
1034     case 3:
1035       sum1 = tmp[row];
1036       sum2 = tmp[row -1];
1037       sum3 = tmp[row -2];
1038       v2   = aa + ai[row]-1;
1039       v3   = aa + ai[row -1]-1;
1040       for (j=nz; j>1; j-=2) {
1041         vi   -=2;
1042         i0    = vi[2];
1043         i1    = vi[1];
1044         tmp0  = tmps[i0];
1045         tmp1  = tmps[i1];
1046         v1   -= 2;
1047         v2   -= 2;
1048         v3   -= 2;
1049         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1050         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1051         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1052       }
1053       if (j==1) {
1054         tmp0  = tmps[*vi--];
1055         sum1 -= *v1-- * tmp0;
1056         sum2 -= *v2-- * tmp0;
1057         sum3 -= *v3-- * tmp0;
1058       }
1059       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1060       sum2   -= *v2-- * tmp0;
1061       sum3   -= *v3-- * tmp0;
1062       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1063       sum3   -= *v3-- * tmp0;
1064       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1065 
1066       break;
1067     case 4:
1068       sum1 = tmp[row];
1069       sum2 = tmp[row -1];
1070       sum3 = tmp[row -2];
1071       sum4 = tmp[row -3];
1072       v2   = aa + ai[row]-1;
1073       v3   = aa + ai[row -1]-1;
1074       v4   = aa + ai[row -2]-1;
1075 
1076       for (j=nz; j>1; j-=2) {
1077         vi   -=2;
1078         i0    = vi[2];
1079         i1    = vi[1];
1080         tmp0  = tmps[i0];
1081         tmp1  = tmps[i1];
1082         v1   -= 2;
1083         v2   -= 2;
1084         v3   -= 2;
1085         v4   -= 2;
1086         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1087         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1088         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1089         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1090       }
1091       if (j==1) {
1092         tmp0  = tmps[*vi--];
1093         sum1 -= *v1-- * tmp0;
1094         sum2 -= *v2-- * tmp0;
1095         sum3 -= *v3-- * tmp0;
1096         sum4 -= *v4-- * tmp0;
1097       }
1098 
1099       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1100       sum2   -= *v2-- * tmp0;
1101       sum3   -= *v3-- * tmp0;
1102       sum4   -= *v4-- * tmp0;
1103       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1104       sum3   -= *v3-- * tmp0;
1105       sum4   -= *v4-- * tmp0;
1106       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1107       sum4   -= *v4-- * tmp0;
1108       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1109       break;
1110     case 5:
1111       sum1 = tmp[row];
1112       sum2 = tmp[row -1];
1113       sum3 = tmp[row -2];
1114       sum4 = tmp[row -3];
1115       sum5 = tmp[row -4];
1116       v2   = aa + ai[row]-1;
1117       v3   = aa + ai[row -1]-1;
1118       v4   = aa + ai[row -2]-1;
1119       v5   = aa + ai[row -3]-1;
1120       for (j=nz; j>1; j-=2) {
1121         vi   -= 2;
1122         i0    = vi[2];
1123         i1    = vi[1];
1124         tmp0  = tmps[i0];
1125         tmp1  = tmps[i1];
1126         v1   -= 2;
1127         v2   -= 2;
1128         v3   -= 2;
1129         v4   -= 2;
1130         v5   -= 2;
1131         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1132         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1133         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1134         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1135         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1136       }
1137       if (j==1) {
1138         tmp0  = tmps[*vi--];
1139         sum1 -= *v1-- * tmp0;
1140         sum2 -= *v2-- * tmp0;
1141         sum3 -= *v3-- * tmp0;
1142         sum4 -= *v4-- * tmp0;
1143         sum5 -= *v5-- * tmp0;
1144       }
1145 
1146       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1147       sum2   -= *v2-- * tmp0;
1148       sum3   -= *v3-- * tmp0;
1149       sum4   -= *v4-- * tmp0;
1150       sum5   -= *v5-- * tmp0;
1151       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1152       sum3   -= *v3-- * tmp0;
1153       sum4   -= *v4-- * tmp0;
1154       sum5   -= *v5-- * tmp0;
1155       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1159       sum5   -= *v5-- * tmp0;
1160       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1161       break;
1162     default:
1163       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1164     }
1165   }
1166   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1167   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1168   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1171   PetscFunctionReturn(0);
1172 }
1173 
1174 #undef __FUNCT__
1175 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1176 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1177 {
1178   Mat             C     =B;
1179   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1180   IS              isrow = b->row,isicol = b->icol;
1181   PetscErrorCode  ierr;
1182   const PetscInt  *r,*ic,*ics;
1183   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1184   PetscInt        i,j,k,nz,nzL,row,*pj;
1185   const PetscInt  *ajtmp,*bjtmp;
1186   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1187   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1188   FactorShiftCtx  sctx;
1189   const PetscInt  *ddiag;
1190   PetscReal       rs;
1191   MatScalar       d;
1192   PetscInt        inod,nodesz,node_max,col;
1193   const PetscInt  *ns;
1194   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1195 
1196   PetscFunctionBegin;
1197   /* MatPivotSetUp(): initialize shift context sctx */
1198   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1199 
1200   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1201     ddiag          = a->diag;
1202     sctx.shift_top = info->zeropivot;
1203     for (i=0; i<n; i++) {
1204       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1205       d  = (aa)[ddiag[i]];
1206       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1207       v  = aa+ai[i];
1208       nz = ai[i+1] - ai[i];
1209       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1210       if (rs>sctx.shift_top) sctx.shift_top = rs;
1211     }
1212     sctx.shift_top *= 1.1;
1213     sctx.nshift_max = 5;
1214     sctx.shift_lo   = 0.;
1215     sctx.shift_hi   = 1.;
1216   }
1217 
1218   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1219   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1220 
1221   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1222   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1223   rtmp2 = rtmp1 + n;
1224   rtmp3 = rtmp2 + n;
1225   rtmp4 = rtmp3 + n;
1226   ics   = ic;
1227 
1228   node_max = a->inode.node_count;
1229   ns       = a->inode.size;
1230   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1231 
1232   /* If max inode size > 4, split it into two inodes.*/
1233   /* also map the inode sizes according to the ordering */
1234   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1235   for (i=0,j=0; i<node_max; ++i,++j) {
1236     if (ns[i] > 4) {
1237       tmp_vec1[j] = 4;
1238       ++j;
1239       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1240     } else {
1241       tmp_vec1[j] = ns[i];
1242     }
1243   }
1244   /* Use the correct node_max */
1245   node_max = j;
1246 
1247   /* Now reorder the inode info based on mat re-ordering info */
1248   /* First create a row -> inode_size_array_index map */
1249   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1250   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1251   for (i=0,row=0; i<node_max; i++) {
1252     nodesz = tmp_vec1[i];
1253     for (j=0; j<nodesz; j++,row++) {
1254       nsmap[row] = i;
1255     }
1256   }
1257   /* Using nsmap, create a reordered ns structure */
1258   for (i=0,j=0; i< node_max; i++) {
1259     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1260     tmp_vec2[i] = nodesz;
1261     j          += nodesz;
1262   }
1263   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1264   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1265 
1266   /* Now use the correct ns */
1267   ns = tmp_vec2;
1268 
1269   do {
1270     sctx.newshift = PETSC_FALSE;
1271     /* Now loop over each block-row, and do the factorization */
1272     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1273       nodesz = ns[inod];
1274 
1275       switch (nodesz) {
1276       case 1:
1277         /*----------*/
1278         /* zero rtmp1 */
1279         /* L part */
1280         nz    = bi[i+1] - bi[i];
1281         bjtmp = bj + bi[i];
1282         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1283 
1284         /* U part */
1285         nz    = bdiag[i]-bdiag[i+1];
1286         bjtmp = bj + bdiag[i+1]+1;
1287         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1288 
1289         /* load in initial (unfactored row) */
1290         nz    = ai[r[i]+1] - ai[r[i]];
1291         ajtmp = aj + ai[r[i]];
1292         v     = aa + ai[r[i]];
1293         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1294 
1295         /* ZeropivotApply() */
1296         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1297 
1298         /* elimination */
1299         bjtmp = bj + bi[i];
1300         row   = *bjtmp++;
1301         nzL   = bi[i+1] - bi[i];
1302         for (k=0; k < nzL; k++) {
1303           pc = rtmp1 + row;
1304           if (*pc != 0.0) {
1305             pv   = b->a + bdiag[row];
1306             mul1 = *pc * (*pv);
1307             *pc  = mul1;
1308             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1309             pv   = b->a + bdiag[row+1]+1;
1310             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1311             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1312             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1313           }
1314           row = *bjtmp++;
1315         }
1316 
1317         /* finished row so stick it into b->a */
1318         rs = 0.0;
1319         /* L part */
1320         pv = b->a + bi[i];
1321         pj = b->j + bi[i];
1322         nz = bi[i+1] - bi[i];
1323         for (j=0; j<nz; j++) {
1324           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1325         }
1326 
1327         /* U part */
1328         pv = b->a + bdiag[i+1]+1;
1329         pj = b->j + bdiag[i+1]+1;
1330         nz = bdiag[i] - bdiag[i+1]-1;
1331         for (j=0; j<nz; j++) {
1332           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1333         }
1334 
1335         /* Check zero pivot */
1336         sctx.rs = rs;
1337         sctx.pv = rtmp1[i];
1338         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1339         if (sctx.newshift) break;
1340 
1341         /* Mark diagonal and invert diagonal for simplier triangular solves */
1342         pv  = b->a + bdiag[i];
1343         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1344         break;
1345 
1346       case 2:
1347         /*----------*/
1348         /* zero rtmp1 and rtmp2 */
1349         /* L part */
1350         nz    = bi[i+1] - bi[i];
1351         bjtmp = bj + bi[i];
1352         for  (j=0; j<nz; j++) {
1353           col        = bjtmp[j];
1354           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1355         }
1356 
1357         /* U part */
1358         nz    = bdiag[i]-bdiag[i+1];
1359         bjtmp = bj + bdiag[i+1]+1;
1360         for  (j=0; j<nz; j++) {
1361           col        = bjtmp[j];
1362           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363         }
1364 
1365         /* load in initial (unfactored row) */
1366         nz    = ai[r[i]+1] - ai[r[i]];
1367         ajtmp = aj + ai[r[i]];
1368         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1369         for (j=0; j<nz; j++) {
1370           col        = ics[ajtmp[j]];
1371           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1372         }
1373         /* ZeropivotApply(): shift the diagonal of the matrix  */
1374         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1375 
1376         /* elimination */
1377         bjtmp = bj + bi[i];
1378         row   = *bjtmp++; /* pivot row */
1379         nzL   = bi[i+1] - bi[i];
1380         for (k=0; k < nzL; k++) {
1381           pc1 = rtmp1 + row;
1382           pc2 = rtmp2 + row;
1383           if (*pc1 != 0.0 || *pc2 != 0.0) {
1384             pv   = b->a + bdiag[row];
1385             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1386             *pc1 = mul1;       *pc2 = mul2;
1387 
1388             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1389             pv = b->a + bdiag[row+1]+1;
1390             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1391             for (j=0; j<nz; j++) {
1392               col         = pj[j];
1393               rtmp1[col] -= mul1 * pv[j];
1394               rtmp2[col] -= mul2 * pv[j];
1395             }
1396             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1397           }
1398           row = *bjtmp++;
1399         }
1400 
1401         /* finished row i; check zero pivot, then stick row i into b->a */
1402         rs = 0.0;
1403         /* L part */
1404         pc1 = b->a + bi[i];
1405         pj  = b->j + bi[i];
1406         nz  = bi[i+1] - bi[i];
1407         for (j=0; j<nz; j++) {
1408           col    = pj[j];
1409           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1410         }
1411         /* U part */
1412         pc1 = b->a + bdiag[i+1]+1;
1413         pj  = b->j + bdiag[i+1]+1;
1414         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1415         for (j=0; j<nz; j++) {
1416           col    = pj[j];
1417           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418         }
1419 
1420         sctx.rs = rs;
1421         sctx.pv = rtmp1[i];
1422         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1423         if (sctx.newshift) break;
1424         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1425         *pc1 = 1.0/sctx.pv;
1426 
1427         /* Now take care of diagonal 2x2 block. */
1428         pc2 = rtmp2 + i;
1429         if (*pc2 != 0.0) {
1430           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1431           *pc2 = mul1;          /* insert L entry */
1432           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1433           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1434           for (j=0; j<nz; j++) {
1435             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1436           }
1437           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1438         }
1439 
1440         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1441         rs = 0.0;
1442         /* L part */
1443         pc2 = b->a + bi[i+1];
1444         pj  = b->j + bi[i+1];
1445         nz  = bi[i+2] - bi[i+1];
1446         for (j=0; j<nz; j++) {
1447           col    = pj[j];
1448           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1449         }
1450         /* U part */
1451         pc2 = b->a + bdiag[i+2]+1;
1452         pj  = b->j + bdiag[i+2]+1;
1453         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1454         for (j=0; j<nz; j++) {
1455           col    = pj[j];
1456           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457         }
1458 
1459         sctx.rs = rs;
1460         sctx.pv = rtmp2[i+1];
1461         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1462         if (sctx.newshift) break;
1463         pc2  = b->a + bdiag[i+1];
1464         *pc2 = 1.0/sctx.pv;
1465         break;
1466 
1467       case 3:
1468         /*----------*/
1469         /* zero rtmp */
1470         /* L part */
1471         nz    = bi[i+1] - bi[i];
1472         bjtmp = bj + bi[i];
1473         for  (j=0; j<nz; j++) {
1474           col        = bjtmp[j];
1475           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1476         }
1477 
1478         /* U part */
1479         nz    = bdiag[i]-bdiag[i+1];
1480         bjtmp = bj + bdiag[i+1]+1;
1481         for  (j=0; j<nz; j++) {
1482           col        = bjtmp[j];
1483           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484         }
1485 
1486         /* load in initial (unfactored row) */
1487         nz    = ai[r[i]+1] - ai[r[i]];
1488         ajtmp = aj + ai[r[i]];
1489         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1490         for (j=0; j<nz; j++) {
1491           col        = ics[ajtmp[j]];
1492           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1493         }
1494         /* ZeropivotApply(): shift the diagonal of the matrix  */
1495         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1496 
1497         /* elimination */
1498         bjtmp = bj + bi[i];
1499         row   = *bjtmp++; /* pivot row */
1500         nzL   = bi[i+1] - bi[i];
1501         for (k=0; k < nzL; k++) {
1502           pc1 = rtmp1 + row;
1503           pc2 = rtmp2 + row;
1504           pc3 = rtmp3 + row;
1505           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1506             pv   = b->a + bdiag[row];
1507             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1508             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1509 
1510             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1511             pv = b->a + bdiag[row+1]+1;
1512             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1513             for (j=0; j<nz; j++) {
1514               col         = pj[j];
1515               rtmp1[col] -= mul1 * pv[j];
1516               rtmp2[col] -= mul2 * pv[j];
1517               rtmp3[col] -= mul3 * pv[j];
1518             }
1519             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1520           }
1521           row = *bjtmp++;
1522         }
1523 
1524         /* finished row i; check zero pivot, then stick row i into b->a */
1525         rs = 0.0;
1526         /* L part */
1527         pc1 = b->a + bi[i];
1528         pj  = b->j + bi[i];
1529         nz  = bi[i+1] - bi[i];
1530         for (j=0; j<nz; j++) {
1531           col    = pj[j];
1532           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1533         }
1534         /* U part */
1535         pc1 = b->a + bdiag[i+1]+1;
1536         pj  = b->j + bdiag[i+1]+1;
1537         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1538         for (j=0; j<nz; j++) {
1539           col    = pj[j];
1540           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541         }
1542 
1543         sctx.rs = rs;
1544         sctx.pv = rtmp1[i];
1545         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1546         if (sctx.newshift) break;
1547         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1548         *pc1 = 1.0/sctx.pv;
1549 
1550         /* Now take care of 1st column of diagonal 3x3 block. */
1551         pc2 = rtmp2 + i;
1552         pc3 = rtmp3 + i;
1553         if (*pc2 != 0.0 || *pc3 != 0.0) {
1554           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1555           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1556           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1557           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1558           for (j=0; j<nz; j++) {
1559             col         = pj[j];
1560             rtmp2[col] -= mul2 * rtmp1[col];
1561             rtmp3[col] -= mul3 * rtmp1[col];
1562           }
1563           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1564         }
1565 
1566         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1567         rs = 0.0;
1568         /* L part */
1569         pc2 = b->a + bi[i+1];
1570         pj  = b->j + bi[i+1];
1571         nz  = bi[i+2] - bi[i+1];
1572         for (j=0; j<nz; j++) {
1573           col    = pj[j];
1574           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1575         }
1576         /* U part */
1577         pc2 = b->a + bdiag[i+2]+1;
1578         pj  = b->j + bdiag[i+2]+1;
1579         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1580         for (j=0; j<nz; j++) {
1581           col    = pj[j];
1582           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1583         }
1584 
1585         sctx.rs = rs;
1586         sctx.pv = rtmp2[i+1];
1587         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1588         if (sctx.newshift) break;
1589         pc2  = b->a + bdiag[i+1];
1590         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1591 
1592         /* Now take care of 2nd column of diagonal 3x3 block. */
1593         pc3 = rtmp3 + i+1;
1594         if (*pc3 != 0.0) {
1595           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1596           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1597           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1598           for (j=0; j<nz; j++) {
1599             col         = pj[j];
1600             rtmp3[col] -= mul3 * rtmp2[col];
1601           }
1602           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1603         }
1604 
1605         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1606         rs = 0.0;
1607         /* L part */
1608         pc3 = b->a + bi[i+2];
1609         pj  = b->j + bi[i+2];
1610         nz  = bi[i+3] - bi[i+2];
1611         for (j=0; j<nz; j++) {
1612           col    = pj[j];
1613           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1614         }
1615         /* U part */
1616         pc3 = b->a + bdiag[i+3]+1;
1617         pj  = b->j + bdiag[i+3]+1;
1618         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1619         for (j=0; j<nz; j++) {
1620           col    = pj[j];
1621           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1622         }
1623 
1624         sctx.rs = rs;
1625         sctx.pv = rtmp3[i+2];
1626         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1627         if (sctx.newshift) break;
1628         pc3  = b->a + bdiag[i+2];
1629         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1630         break;
1631       case 4:
1632         /*----------*/
1633         /* zero rtmp */
1634         /* L part */
1635         nz    = bi[i+1] - bi[i];
1636         bjtmp = bj + bi[i];
1637         for  (j=0; j<nz; j++) {
1638           col        = bjtmp[j];
1639           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1640         }
1641 
1642         /* U part */
1643         nz    = bdiag[i]-bdiag[i+1];
1644         bjtmp = bj + bdiag[i+1]+1;
1645         for  (j=0; j<nz; j++) {
1646           col        = bjtmp[j];
1647           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1648         }
1649 
1650         /* load in initial (unfactored row) */
1651         nz    = ai[r[i]+1] - ai[r[i]];
1652         ajtmp = aj + ai[r[i]];
1653         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1654         for (j=0; j<nz; j++) {
1655           col        = ics[ajtmp[j]];
1656           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1657         }
1658         /* ZeropivotApply(): shift the diagonal of the matrix  */
1659         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1660 
1661         /* elimination */
1662         bjtmp = bj + bi[i];
1663         row   = *bjtmp++; /* pivot row */
1664         nzL   = bi[i+1] - bi[i];
1665         for (k=0; k < nzL; k++) {
1666           pc1 = rtmp1 + row;
1667           pc2 = rtmp2 + row;
1668           pc3 = rtmp3 + row;
1669           pc4 = rtmp4 + row;
1670           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1671             pv   = b->a + bdiag[row];
1672             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1673             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1674 
1675             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1676             pv = b->a + bdiag[row+1]+1;
1677             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1678             for (j=0; j<nz; j++) {
1679               col         = pj[j];
1680               rtmp1[col] -= mul1 * pv[j];
1681               rtmp2[col] -= mul2 * pv[j];
1682               rtmp3[col] -= mul3 * pv[j];
1683               rtmp4[col] -= mul4 * pv[j];
1684             }
1685             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1686           }
1687           row = *bjtmp++;
1688         }
1689 
1690         /* finished row i; check zero pivot, then stick row i into b->a */
1691         rs = 0.0;
1692         /* L part */
1693         pc1 = b->a + bi[i];
1694         pj  = b->j + bi[i];
1695         nz  = bi[i+1] - bi[i];
1696         for (j=0; j<nz; j++) {
1697           col    = pj[j];
1698           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1699         }
1700         /* U part */
1701         pc1 = b->a + bdiag[i+1]+1;
1702         pj  = b->j + bdiag[i+1]+1;
1703         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1704         for (j=0; j<nz; j++) {
1705           col    = pj[j];
1706           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1707         }
1708 
1709         sctx.rs = rs;
1710         sctx.pv = rtmp1[i];
1711         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1712         if (sctx.newshift) break;
1713         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1714         *pc1 = 1.0/sctx.pv;
1715 
1716         /* Now take care of 1st column of diagonal 4x4 block. */
1717         pc2 = rtmp2 + i;
1718         pc3 = rtmp3 + i;
1719         pc4 = rtmp4 + i;
1720         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1721           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1722           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1723           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1724           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1725           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1726           for (j=0; j<nz; j++) {
1727             col         = pj[j];
1728             rtmp2[col] -= mul2 * rtmp1[col];
1729             rtmp3[col] -= mul3 * rtmp1[col];
1730             rtmp4[col] -= mul4 * rtmp1[col];
1731           }
1732           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1733         }
1734 
1735         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1736         rs = 0.0;
1737         /* L part */
1738         pc2 = b->a + bi[i+1];
1739         pj  = b->j + bi[i+1];
1740         nz  = bi[i+2] - bi[i+1];
1741         for (j=0; j<nz; j++) {
1742           col    = pj[j];
1743           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1744         }
1745         /* U part */
1746         pc2 = b->a + bdiag[i+2]+1;
1747         pj  = b->j + bdiag[i+2]+1;
1748         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1749         for (j=0; j<nz; j++) {
1750           col    = pj[j];
1751           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1752         }
1753 
1754         sctx.rs = rs;
1755         sctx.pv = rtmp2[i+1];
1756         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1757         if (sctx.newshift) break;
1758         pc2  = b->a + bdiag[i+1];
1759         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1760 
1761         /* Now take care of 2nd column of diagonal 4x4 block. */
1762         pc3 = rtmp3 + i+1;
1763         pc4 = rtmp4 + i+1;
1764         if (*pc3 != 0.0 || *pc4 != 0.0) {
1765           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1766           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1767           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1768           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1769           for (j=0; j<nz; j++) {
1770             col         = pj[j];
1771             rtmp3[col] -= mul3 * rtmp2[col];
1772             rtmp4[col] -= mul4 * rtmp2[col];
1773           }
1774           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1775         }
1776 
1777         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1778         rs = 0.0;
1779         /* L part */
1780         pc3 = b->a + bi[i+2];
1781         pj  = b->j + bi[i+2];
1782         nz  = bi[i+3] - bi[i+2];
1783         for (j=0; j<nz; j++) {
1784           col    = pj[j];
1785           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1786         }
1787         /* U part */
1788         pc3 = b->a + bdiag[i+3]+1;
1789         pj  = b->j + bdiag[i+3]+1;
1790         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1791         for (j=0; j<nz; j++) {
1792           col    = pj[j];
1793           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1794         }
1795 
1796         sctx.rs = rs;
1797         sctx.pv = rtmp3[i+2];
1798         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1799         if (sctx.newshift) break;
1800         pc3  = b->a + bdiag[i+2];
1801         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1802 
1803         /* Now take care of 3rd column of diagonal 4x4 block. */
1804         pc4 = rtmp4 + i+2;
1805         if (*pc4 != 0.0) {
1806           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1807           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1808           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1809           for (j=0; j<nz; j++) {
1810             col         = pj[j];
1811             rtmp4[col] -= mul4 * rtmp3[col];
1812           }
1813           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1814         }
1815 
1816         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1817         rs = 0.0;
1818         /* L part */
1819         pc4 = b->a + bi[i+3];
1820         pj  = b->j + bi[i+3];
1821         nz  = bi[i+4] - bi[i+3];
1822         for (j=0; j<nz; j++) {
1823           col    = pj[j];
1824           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1825         }
1826         /* U part */
1827         pc4 = b->a + bdiag[i+4]+1;
1828         pj  = b->j + bdiag[i+4]+1;
1829         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1830         for (j=0; j<nz; j++) {
1831           col    = pj[j];
1832           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1833         }
1834 
1835         sctx.rs = rs;
1836         sctx.pv = rtmp4[i+3];
1837         ierr    = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1838         if (sctx.newshift) break;
1839         pc4  = b->a + bdiag[i+3];
1840         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1841         break;
1842 
1843       default:
1844         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1845       }
1846       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1847       i += nodesz;                 /* Update the row */
1848     }
1849 
1850     /* MatPivotRefine() */
1851     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1852       /*
1853        * if no shift in this attempt & shifting & started shifting & can refine,
1854        * then try lower shift
1855        */
1856       sctx.shift_hi       = sctx.shift_fraction;
1857       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1858       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1859       sctx.newshift       = PETSC_TRUE;
1860       sctx.nshift++;
1861     }
1862   } while (sctx.newshift);
1863 
1864   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1865   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1866   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1867   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1868 
1869   C->ops->solve             = MatSolve_SeqAIJ;
1870   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1871   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1872   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1873   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1874   C->assembled              = PETSC_TRUE;
1875   C->preallocated           = PETSC_TRUE;
1876 
1877   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1878 
1879   /* MatShiftView(A,info,&sctx) */
1880   if (sctx.nshift) {
1881     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1882       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1883     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1884       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1885     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1886       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1887     }
1888   }
1889   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1890   PetscFunctionReturn(0);
1891 }
1892 
1893 #undef __FUNCT__
1894 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1895 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1896 {
1897   Mat             C     = B;
1898   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1899   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1900   PetscErrorCode  ierr;
1901   const PetscInt  *r,*ic,*c,*ics;
1902   PetscInt        n   = A->rmap->n,*bi = b->i;
1903   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1904   PetscInt        i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1905   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1906   PetscScalar     mul1,mul2,mul3,tmp;
1907   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1908   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1909   PetscReal       rs=0.0;
1910   FactorShiftCtx  sctx;
1911 
1912   PetscFunctionBegin;
1913   sctx.shift_top      = 0;
1914   sctx.nshift_max     = 0;
1915   sctx.shift_lo       = 0;
1916   sctx.shift_hi       = 0;
1917   sctx.shift_fraction = 0;
1918 
1919   /* if both shift schemes are chosen by user, only use info->shiftpd */
1920   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1921     sctx.shift_top = 0;
1922     for (i=0; i<n; i++) {
1923       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1924       rs    = 0.0;
1925       ajtmp = aj + ai[i];
1926       rtmp1 = aa + ai[i];
1927       nz    = ai[i+1] - ai[i];
1928       for (j=0; j<nz; j++) {
1929         if (*ajtmp != i) {
1930           rs += PetscAbsScalar(*rtmp1++);
1931         } else {
1932           rs -= PetscRealPart(*rtmp1++);
1933         }
1934         ajtmp++;
1935       }
1936       if (rs>sctx.shift_top) sctx.shift_top = rs;
1937     }
1938     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1939     sctx.shift_top *= 1.1;
1940     sctx.nshift_max = 5;
1941     sctx.shift_lo   = 0.;
1942     sctx.shift_hi   = 1.;
1943   }
1944   sctx.shift_amount = 0;
1945   sctx.nshift       = 0;
1946 
1947   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1948   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1949   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1950   ierr   = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1951   ierr   = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1952   ics    = ic;
1953   rtmp22 = rtmp11 + n;
1954   rtmp33 = rtmp22 + n;
1955 
1956   node_max = a->inode.node_count;
1957   ns       = a->inode.size;
1958   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1959 
1960   /* If max inode size > 3, split it into two inodes.*/
1961   /* also map the inode sizes according to the ordering */
1962   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1963   for (i=0,j=0; i<node_max; ++i,++j) {
1964     if (ns[i]>3) {
1965       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1966       ++j;
1967       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1968     } else {
1969       tmp_vec1[j] = ns[i];
1970     }
1971   }
1972   /* Use the correct node_max */
1973   node_max = j;
1974 
1975   /* Now reorder the inode info based on mat re-ordering info */
1976   /* First create a row -> inode_size_array_index map */
1977   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1978   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1979   for (i=0,row=0; i<node_max; i++) {
1980     nodesz = tmp_vec1[i];
1981     for (j=0; j<nodesz; j++,row++) {
1982       nsmap[row] = i;
1983     }
1984   }
1985   /* Using nsmap, create a reordered ns structure */
1986   for (i=0,j=0; i< node_max; i++) {
1987     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1988     tmp_vec2[i] = nodesz;
1989     j          += nodesz;
1990   }
1991   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1992   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1993   /* Now use the correct ns */
1994   ns = tmp_vec2;
1995 
1996   do {
1997     sctx.newshift = PETSC_FALSE;
1998     /* Now loop over each block-row, and do the factorization */
1999     for (i=0,row=0; i<node_max; i++) {
2000       nodesz = ns[i];
2001       nz     = bi[row+1] - bi[row];
2002       bjtmp  = bj + bi[row];
2003 
2004       switch (nodesz) {
2005       case 1:
2006         for  (j=0; j<nz; j++) {
2007           idx         = bjtmp[j];
2008           rtmp11[idx] = 0.0;
2009         }
2010 
2011         /* load in initial (unfactored row) */
2012         idx    = r[row];
2013         nz_tmp = ai[idx+1] - ai[idx];
2014         ajtmp  = aj + ai[idx];
2015         v1     = aa + ai[idx];
2016 
2017         for (j=0; j<nz_tmp; j++) {
2018           idx         = ics[ajtmp[j]];
2019           rtmp11[idx] = v1[j];
2020         }
2021         rtmp11[ics[r[row]]] += sctx.shift_amount;
2022 
2023         prow = *bjtmp++;
2024         while (prow < row) {
2025           pc1 = rtmp11 + prow;
2026           if (*pc1 != 0.0) {
2027             pv     = ba + bd[prow];
2028             pj     = nbj + bd[prow];
2029             mul1   = *pc1 * *pv++;
2030             *pc1   = mul1;
2031             nz_tmp = bi[prow+1] - bd[prow] - 1;
2032             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2033             for (j=0; j<nz_tmp; j++) {
2034               tmp          = pv[j];
2035               idx          = pj[j];
2036               rtmp11[idx] -= mul1 * tmp;
2037             }
2038           }
2039           prow = *bjtmp++;
2040         }
2041         pj  = bj + bi[row];
2042         pc1 = ba + bi[row];
2043 
2044         sctx.pv     = rtmp11[row];
2045         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2046         rs          = 0.0;
2047         for (j=0; j<nz; j++) {
2048           idx    = pj[j];
2049           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2050           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2051         }
2052         sctx.rs = rs;
2053         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2054         if (sctx.newshift) goto endofwhile;
2055         break;
2056 
2057       case 2:
2058         for (j=0; j<nz; j++) {
2059           idx         = bjtmp[j];
2060           rtmp11[idx] = 0.0;
2061           rtmp22[idx] = 0.0;
2062         }
2063 
2064         /* load in initial (unfactored row) */
2065         idx    = r[row];
2066         nz_tmp = ai[idx+1] - ai[idx];
2067         ajtmp  = aj + ai[idx];
2068         v1     = aa + ai[idx];
2069         v2     = aa + ai[idx+1];
2070         for (j=0; j<nz_tmp; j++) {
2071           idx         = ics[ajtmp[j]];
2072           rtmp11[idx] = v1[j];
2073           rtmp22[idx] = v2[j];
2074         }
2075         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2076         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2077 
2078         prow = *bjtmp++;
2079         while (prow < row) {
2080           pc1 = rtmp11 + prow;
2081           pc2 = rtmp22 + prow;
2082           if (*pc1 != 0.0 || *pc2 != 0.0) {
2083             pv   = ba + bd[prow];
2084             pj   = nbj + bd[prow];
2085             mul1 = *pc1 * *pv;
2086             mul2 = *pc2 * *pv;
2087             ++pv;
2088             *pc1 = mul1;
2089             *pc2 = mul2;
2090 
2091             nz_tmp = bi[prow+1] - bd[prow] - 1;
2092             for (j=0; j<nz_tmp; j++) {
2093               tmp          = pv[j];
2094               idx          = pj[j];
2095               rtmp11[idx] -= mul1 * tmp;
2096               rtmp22[idx] -= mul2 * tmp;
2097             }
2098             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2099           }
2100           prow = *bjtmp++;
2101         }
2102 
2103         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2104         pc1 = rtmp11 + prow;
2105         pc2 = rtmp22 + prow;
2106 
2107         sctx.pv = *pc1;
2108         pj      = bj + bi[prow];
2109         rs      = 0.0;
2110         for (j=0; j<nz; j++) {
2111           idx = pj[j];
2112           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2113         }
2114         sctx.rs = rs;
2115         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2116         if (sctx.newshift) goto endofwhile;
2117 
2118         if (*pc2 != 0.0) {
2119           pj     = nbj + bd[prow];
2120           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2121           *pc2   = mul2;
2122           nz_tmp = bi[prow+1] - bd[prow] - 1;
2123           for (j=0; j<nz_tmp; j++) {
2124             idx          = pj[j];
2125             tmp          = rtmp11[idx];
2126             rtmp22[idx] -= mul2 * tmp;
2127           }
2128           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2129         }
2130 
2131         pj  = bj + bi[row];
2132         pc1 = ba + bi[row];
2133         pc2 = ba + bi[row+1];
2134 
2135         sctx.pv       = rtmp22[row+1];
2136         rs            = 0.0;
2137         rtmp11[row]   = 1.0/rtmp11[row];
2138         rtmp22[row+1] = 1.0/rtmp22[row+1];
2139         /* copy row entries from dense representation to sparse */
2140         for (j=0; j<nz; j++) {
2141           idx    = pj[j];
2142           pc1[j] = rtmp11[idx];
2143           pc2[j] = rtmp22[idx];
2144           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2145         }
2146         sctx.rs = rs;
2147         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2148         if (sctx.newshift) goto endofwhile;
2149         break;
2150 
2151       case 3:
2152         for  (j=0; j<nz; j++) {
2153           idx         = bjtmp[j];
2154           rtmp11[idx] = 0.0;
2155           rtmp22[idx] = 0.0;
2156           rtmp33[idx] = 0.0;
2157         }
2158         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2159         idx    = r[row];
2160         nz_tmp = ai[idx+1] - ai[idx];
2161         ajtmp  = aj + ai[idx];
2162         v1     = aa + ai[idx];
2163         v2     = aa + ai[idx+1];
2164         v3     = aa + ai[idx+2];
2165         for (j=0; j<nz_tmp; j++) {
2166           idx         = ics[ajtmp[j]];
2167           rtmp11[idx] = v1[j];
2168           rtmp22[idx] = v2[j];
2169           rtmp33[idx] = v3[j];
2170         }
2171         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2172         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2173         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2174 
2175         /* loop over all pivot row blocks above this row block */
2176         prow = *bjtmp++;
2177         while (prow < row) {
2178           pc1 = rtmp11 + prow;
2179           pc2 = rtmp22 + prow;
2180           pc3 = rtmp33 + prow;
2181           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2182             pv   = ba  + bd[prow];
2183             pj   = nbj + bd[prow];
2184             mul1 = *pc1 * *pv;
2185             mul2 = *pc2 * *pv;
2186             mul3 = *pc3 * *pv;
2187             ++pv;
2188             *pc1 = mul1;
2189             *pc2 = mul2;
2190             *pc3 = mul3;
2191 
2192             nz_tmp = bi[prow+1] - bd[prow] - 1;
2193             /* update this row based on pivot row */
2194             for (j=0; j<nz_tmp; j++) {
2195               tmp          = pv[j];
2196               idx          = pj[j];
2197               rtmp11[idx] -= mul1 * tmp;
2198               rtmp22[idx] -= mul2 * tmp;
2199               rtmp33[idx] -= mul3 * tmp;
2200             }
2201             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2202           }
2203           prow = *bjtmp++;
2204         }
2205 
2206         /* Now take care of diagonal 3x3 block in this set of rows */
2207         /* note: prow = row here */
2208         pc1 = rtmp11 + prow;
2209         pc2 = rtmp22 + prow;
2210         pc3 = rtmp33 + prow;
2211 
2212         sctx.pv = *pc1;
2213         pj      = bj + bi[prow];
2214         rs      = 0.0;
2215         for (j=0; j<nz; j++) {
2216           idx = pj[j];
2217           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2218         }
2219         sctx.rs = rs;
2220         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2221         if (sctx.newshift) goto endofwhile;
2222 
2223         if (*pc2 != 0.0 || *pc3 != 0.0) {
2224           mul2   = (*pc2)/(*pc1);
2225           mul3   = (*pc3)/(*pc1);
2226           *pc2   = mul2;
2227           *pc3   = mul3;
2228           nz_tmp = bi[prow+1] - bd[prow] - 1;
2229           pj     = nbj + bd[prow];
2230           for (j=0; j<nz_tmp; j++) {
2231             idx          = pj[j];
2232             tmp          = rtmp11[idx];
2233             rtmp22[idx] -= mul2 * tmp;
2234             rtmp33[idx] -= mul3 * tmp;
2235           }
2236           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2237         }
2238         ++prow;
2239 
2240         pc2     = rtmp22 + prow;
2241         pc3     = rtmp33 + prow;
2242         sctx.pv = *pc2;
2243         pj      = bj + bi[prow];
2244         rs      = 0.0;
2245         for (j=0; j<nz; j++) {
2246           idx = pj[j];
2247           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2248         }
2249         sctx.rs = rs;
2250         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2251         if (sctx.newshift) goto endofwhile;
2252 
2253         if (*pc3 != 0.0) {
2254           mul3   = (*pc3)/(*pc2);
2255           *pc3   = mul3;
2256           pj     = nbj + bd[prow];
2257           nz_tmp = bi[prow+1] - bd[prow] - 1;
2258           for (j=0; j<nz_tmp; j++) {
2259             idx          = pj[j];
2260             tmp          = rtmp22[idx];
2261             rtmp33[idx] -= mul3 * tmp;
2262           }
2263           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2264         }
2265 
2266         pj  = bj + bi[row];
2267         pc1 = ba + bi[row];
2268         pc2 = ba + bi[row+1];
2269         pc3 = ba + bi[row+2];
2270 
2271         sctx.pv       = rtmp33[row+2];
2272         rs            = 0.0;
2273         rtmp11[row]   = 1.0/rtmp11[row];
2274         rtmp22[row+1] = 1.0/rtmp22[row+1];
2275         rtmp33[row+2] = 1.0/rtmp33[row+2];
2276         /* copy row entries from dense representation to sparse */
2277         for (j=0; j<nz; j++) {
2278           idx    = pj[j];
2279           pc1[j] = rtmp11[idx];
2280           pc2[j] = rtmp22[idx];
2281           pc3[j] = rtmp33[idx];
2282           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2283         }
2284 
2285         sctx.rs = rs;
2286         ierr    = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2287         if (sctx.newshift) goto endofwhile;
2288         break;
2289 
2290       default:
2291         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2292       }
2293       row += nodesz;                 /* Update the row */
2294     }
2295 endofwhile:;
2296   } while (sctx.newshift);
2297   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2298   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2299   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2300   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2301   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2302 
2303   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2304   /* do not set solve add, since MatSolve_Inode + Add is faster */
2305   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2306   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2307   C->assembled              = PETSC_TRUE;
2308   C->preallocated           = PETSC_TRUE;
2309   if (sctx.nshift) {
2310     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2311       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2312     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2313       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2314     }
2315   }
2316   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2317   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2318   PetscFunctionReturn(0);
2319 }
2320 
2321 
2322 /* ----------------------------------------------------------- */
2323 #undef __FUNCT__
2324 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2325 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2326 {
2327   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2328   IS                iscol = a->col,isrow = a->row;
2329   PetscErrorCode    ierr;
2330   const PetscInt    *r,*c,*rout,*cout;
2331   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2332   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2333   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2334   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2335   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2336   const PetscScalar *b;
2337 
2338   PetscFunctionBegin;
2339   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2340   node_max = a->inode.node_count;
2341   ns       = a->inode.size;     /* Node Size array */
2342 
2343   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2344   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2345   tmp  = a->solve_work;
2346 
2347   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2348   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2349 
2350   /* forward solve the lower triangular */
2351   tmps = tmp;
2352   aa   = a_a;
2353   aj   = a_j;
2354   ad   = a->diag;
2355 
2356   for (i = 0,row = 0; i< node_max; ++i) {
2357     nsz = ns[i];
2358     aii = ai[row];
2359     v1  = aa + aii;
2360     vi  = aj + aii;
2361     nz  = ai[row+1]- ai[row];
2362 
2363     if (i < node_max-1) {
2364       /* Prefetch the indices for the next block */
2365       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2366       /* Prefetch the data for the next block */
2367       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2368     }
2369 
2370     switch (nsz) {               /* Each loop in 'case' is unrolled */
2371     case 1:
2372       sum1 = b[r[row]];
2373       for (j=0; j<nz-1; j+=2) {
2374         i0    = vi[j];
2375         i1    = vi[j+1];
2376         tmp0  = tmps[i0];
2377         tmp1  = tmps[i1];
2378         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2379       }
2380       if (j == nz-1) {
2381         tmp0  = tmps[vi[j]];
2382         sum1 -= v1[j]*tmp0;
2383       }
2384       tmp[row++]=sum1;
2385       break;
2386     case 2:
2387       sum1 = b[r[row]];
2388       sum2 = b[r[row+1]];
2389       v2   = aa + ai[row+1];
2390 
2391       for (j=0; j<nz-1; j+=2) {
2392         i0    = vi[j];
2393         i1    = vi[j+1];
2394         tmp0  = tmps[i0];
2395         tmp1  = tmps[i1];
2396         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2397         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2398       }
2399       if (j == nz-1) {
2400         tmp0  = tmps[vi[j]];
2401         sum1 -= v1[j] *tmp0;
2402         sum2 -= v2[j] *tmp0;
2403       }
2404       sum2     -= v2[nz] * sum1;
2405       tmp[row++]=sum1;
2406       tmp[row++]=sum2;
2407       break;
2408     case 3:
2409       sum1 = b[r[row]];
2410       sum2 = b[r[row+1]];
2411       sum3 = b[r[row+2]];
2412       v2   = aa + ai[row+1];
2413       v3   = aa + ai[row+2];
2414 
2415       for (j=0; j<nz-1; j+=2) {
2416         i0    = vi[j];
2417         i1    = vi[j+1];
2418         tmp0  = tmps[i0];
2419         tmp1  = tmps[i1];
2420         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2421         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2422         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2423       }
2424       if (j == nz-1) {
2425         tmp0  = tmps[vi[j]];
2426         sum1 -= v1[j] *tmp0;
2427         sum2 -= v2[j] *tmp0;
2428         sum3 -= v3[j] *tmp0;
2429       }
2430       sum2     -= v2[nz] * sum1;
2431       sum3     -= v3[nz] * sum1;
2432       sum3     -= v3[nz+1] * sum2;
2433       tmp[row++]=sum1;
2434       tmp[row++]=sum2;
2435       tmp[row++]=sum3;
2436       break;
2437 
2438     case 4:
2439       sum1 = b[r[row]];
2440       sum2 = b[r[row+1]];
2441       sum3 = b[r[row+2]];
2442       sum4 = b[r[row+3]];
2443       v2   = aa + ai[row+1];
2444       v3   = aa + ai[row+2];
2445       v4   = aa + ai[row+3];
2446 
2447       for (j=0; j<nz-1; j+=2) {
2448         i0    = vi[j];
2449         i1    = vi[j+1];
2450         tmp0  = tmps[i0];
2451         tmp1  = tmps[i1];
2452         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2453         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2454         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2455         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2456       }
2457       if (j == nz-1) {
2458         tmp0  = tmps[vi[j]];
2459         sum1 -= v1[j] *tmp0;
2460         sum2 -= v2[j] *tmp0;
2461         sum3 -= v3[j] *tmp0;
2462         sum4 -= v4[j] *tmp0;
2463       }
2464       sum2 -= v2[nz] * sum1;
2465       sum3 -= v3[nz] * sum1;
2466       sum4 -= v4[nz] * sum1;
2467       sum3 -= v3[nz+1] * sum2;
2468       sum4 -= v4[nz+1] * sum2;
2469       sum4 -= v4[nz+2] * sum3;
2470 
2471       tmp[row++]=sum1;
2472       tmp[row++]=sum2;
2473       tmp[row++]=sum3;
2474       tmp[row++]=sum4;
2475       break;
2476     case 5:
2477       sum1 = b[r[row]];
2478       sum2 = b[r[row+1]];
2479       sum3 = b[r[row+2]];
2480       sum4 = b[r[row+3]];
2481       sum5 = b[r[row+4]];
2482       v2   = aa + ai[row+1];
2483       v3   = aa + ai[row+2];
2484       v4   = aa + ai[row+3];
2485       v5   = aa + ai[row+4];
2486 
2487       for (j=0; j<nz-1; j+=2) {
2488         i0    = vi[j];
2489         i1    = vi[j+1];
2490         tmp0  = tmps[i0];
2491         tmp1  = tmps[i1];
2492         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2493         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2494         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2495         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2496         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2497       }
2498       if (j == nz-1) {
2499         tmp0  = tmps[vi[j]];
2500         sum1 -= v1[j] *tmp0;
2501         sum2 -= v2[j] *tmp0;
2502         sum3 -= v3[j] *tmp0;
2503         sum4 -= v4[j] *tmp0;
2504         sum5 -= v5[j] *tmp0;
2505       }
2506 
2507       sum2 -= v2[nz] * sum1;
2508       sum3 -= v3[nz] * sum1;
2509       sum4 -= v4[nz] * sum1;
2510       sum5 -= v5[nz] * sum1;
2511       sum3 -= v3[nz+1] * sum2;
2512       sum4 -= v4[nz+1] * sum2;
2513       sum5 -= v5[nz+1] * sum2;
2514       sum4 -= v4[nz+2] * sum3;
2515       sum5 -= v5[nz+2] * sum3;
2516       sum5 -= v5[nz+3] * sum4;
2517 
2518       tmp[row++]=sum1;
2519       tmp[row++]=sum2;
2520       tmp[row++]=sum3;
2521       tmp[row++]=sum4;
2522       tmp[row++]=sum5;
2523       break;
2524     default:
2525       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2526     }
2527   }
2528   /* backward solve the upper triangular */
2529   for (i=node_max -1,row = n-1; i>=0; i--) {
2530     nsz = ns[i];
2531     aii = ad[row+1] + 1;
2532     v1  = aa + aii;
2533     vi  = aj + aii;
2534     nz  = ad[row]- ad[row+1] - 1;
2535 
2536     if (i > 0) {
2537       /* Prefetch the indices for the next block */
2538       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2539       /* Prefetch the data for the next block */
2540       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2541     }
2542 
2543     switch (nsz) {               /* Each loop in 'case' is unrolled */
2544     case 1:
2545       sum1 = tmp[row];
2546 
2547       for (j=0; j<nz-1; j+=2) {
2548         i0    = vi[j];
2549         i1    = vi[j+1];
2550         tmp0  = tmps[i0];
2551         tmp1  = tmps[i1];
2552         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2553       }
2554       if (j == nz-1) {
2555         tmp0  = tmps[vi[j]];
2556         sum1 -= v1[j]*tmp0;
2557       }
2558       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2559       break;
2560     case 2:
2561       sum1 = tmp[row];
2562       sum2 = tmp[row-1];
2563       v2   = aa + ad[row] + 1;
2564       for (j=0; j<nz-1; j+=2) {
2565         i0    = vi[j];
2566         i1    = vi[j+1];
2567         tmp0  = tmps[i0];
2568         tmp1  = tmps[i1];
2569         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2570         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2571       }
2572       if (j == nz-1) {
2573         tmp0  = tmps[vi[j]];
2574         sum1 -= v1[j]* tmp0;
2575         sum2 -= v2[j+1]* tmp0;
2576       }
2577 
2578       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2579       sum2     -= v2[0] * tmp0;
2580       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2581       break;
2582     case 3:
2583       sum1 = tmp[row];
2584       sum2 = tmp[row -1];
2585       sum3 = tmp[row -2];
2586       v2   = aa + ad[row] + 1;
2587       v3   = aa + ad[row -1] + 1;
2588       for (j=0; j<nz-1; j+=2) {
2589         i0    = vi[j];
2590         i1    = vi[j+1];
2591         tmp0  = tmps[i0];
2592         tmp1  = tmps[i1];
2593         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2594         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2595         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2596       }
2597       if (j== nz-1) {
2598         tmp0  = tmps[vi[j]];
2599         sum1 -= v1[j] * tmp0;
2600         sum2 -= v2[j+1] * tmp0;
2601         sum3 -= v3[j+2] * tmp0;
2602       }
2603       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604       sum2     -= v2[0]* tmp0;
2605       sum3     -= v3[1] * tmp0;
2606       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2607       sum3     -= v3[0]* tmp0;
2608       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2609 
2610       break;
2611     case 4:
2612       sum1 = tmp[row];
2613       sum2 = tmp[row -1];
2614       sum3 = tmp[row -2];
2615       sum4 = tmp[row -3];
2616       v2   = aa + ad[row]+1;
2617       v3   = aa + ad[row -1]+1;
2618       v4   = aa + ad[row -2]+1;
2619 
2620       for (j=0; j<nz-1; j+=2) {
2621         i0    = vi[j];
2622         i1    = vi[j+1];
2623         tmp0  = tmps[i0];
2624         tmp1  = tmps[i1];
2625         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2626         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629       }
2630       if (j== nz-1) {
2631         tmp0  = tmps[vi[j]];
2632         sum1 -= v1[j] * tmp0;
2633         sum2 -= v2[j+1] * tmp0;
2634         sum3 -= v3[j+2] * tmp0;
2635         sum4 -= v4[j+3] * tmp0;
2636       }
2637 
2638       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2639       sum2     -= v2[0] * tmp0;
2640       sum3     -= v3[1] * tmp0;
2641       sum4     -= v4[2] * tmp0;
2642       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2643       sum3     -= v3[0] * tmp0;
2644       sum4     -= v4[1] * tmp0;
2645       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2646       sum4     -= v4[0] * tmp0;
2647       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2648       break;
2649     case 5:
2650       sum1 = tmp[row];
2651       sum2 = tmp[row -1];
2652       sum3 = tmp[row -2];
2653       sum4 = tmp[row -3];
2654       sum5 = tmp[row -4];
2655       v2   = aa + ad[row]+1;
2656       v3   = aa + ad[row -1]+1;
2657       v4   = aa + ad[row -2]+1;
2658       v5   = aa + ad[row -3]+1;
2659       for (j=0; j<nz-1; j+=2) {
2660         i0    = vi[j];
2661         i1    = vi[j+1];
2662         tmp0  = tmps[i0];
2663         tmp1  = tmps[i1];
2664         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2665         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2666         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2667         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2668         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2669       }
2670       if (j==nz-1) {
2671         tmp0  = tmps[vi[j]];
2672         sum1 -= v1[j] * tmp0;
2673         sum2 -= v2[j+1] * tmp0;
2674         sum3 -= v3[j+2] * tmp0;
2675         sum4 -= v4[j+3] * tmp0;
2676         sum5 -= v5[j+4] * tmp0;
2677       }
2678 
2679       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2680       sum2     -= v2[0] * tmp0;
2681       sum3     -= v3[1] * tmp0;
2682       sum4     -= v4[2] * tmp0;
2683       sum5     -= v5[3] * tmp0;
2684       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2685       sum3     -= v3[0] * tmp0;
2686       sum4     -= v4[1] * tmp0;
2687       sum5     -= v5[2] * tmp0;
2688       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2689       sum4     -= v4[0] * tmp0;
2690       sum5     -= v5[1] * tmp0;
2691       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2692       sum5     -= v5[0] * tmp0;
2693       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2694       break;
2695     default:
2696       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2697     }
2698   }
2699   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2700   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2701   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2702   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2703   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2704   PetscFunctionReturn(0);
2705 }
2706 
2707 
2708 /*
2709      Makes a longer coloring[] array and calls the usual code with that
2710 */
2711 #undef __FUNCT__
2712 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2713 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2714 {
2715   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2716   PetscErrorCode  ierr;
2717   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2718   PetscInt        *colorused,i;
2719   ISColoringValue *newcolor;
2720 
2721   PetscFunctionBegin;
2722   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2723   /* loop over inodes, marking a color for each column*/
2724   row = 0;
2725   for (i=0; i<m; i++) {
2726     for (j=0; j<ns[i]; j++) {
2727       newcolor[row++] = coloring[i] + j*ncolors;
2728     }
2729   }
2730 
2731   /* eliminate unneeded colors */
2732   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2733   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2734   for (i=0; i<n; i++) {
2735     colorused[newcolor[i]] = 1;
2736   }
2737 
2738   for (i=1; i<5*ncolors; i++) {
2739     colorused[i] += colorused[i-1];
2740   }
2741   ncolors = colorused[5*ncolors-1];
2742   for (i=0; i<n; i++) {
2743     newcolor[i] = colorused[newcolor[i]]-1;
2744   }
2745   ierr = PetscFree(colorused);CHKERRQ(ierr);
2746   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2747   ierr = PetscFree(coloring);CHKERRQ(ierr);
2748   PetscFunctionReturn(0);
2749 }
2750 
2751 #include <../src/mat/blockinvert.h>
2752 
2753 #undef __FUNCT__
2754 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2755 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2756 {
2757   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2758   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2759   MatScalar         *ibdiag,*bdiag,work[25],*t;
2760   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2761   const MatScalar   *v = a->a,*v1,*v2,*v3,*v4,*v5;
2762   const PetscScalar *xb, *b;
2763   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2764   PetscErrorCode    ierr;
2765   PetscInt          n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2766   PetscInt          *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2767 
2768   PetscFunctionBegin;
2769   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2770   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2771 
2772   if (!a->inode.ibdiagvalid) {
2773     if (!a->inode.ibdiag) {
2774       /* calculate space needed for diagonal blocks */
2775       for (i=0; i<m; i++) {
2776         cnt += sizes[i]*sizes[i];
2777       }
2778       a->inode.bdiagsize = cnt;
2779 
2780       ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2781     }
2782 
2783     /* copy over the diagonal blocks and invert them */
2784     ibdiag = a->inode.ibdiag;
2785     bdiag  = a->inode.bdiag;
2786     cnt    = 0;
2787     for (i=0, row = 0; i<m; i++) {
2788       for (j=0; j<sizes[i]; j++) {
2789         for (k=0; k<sizes[i]; k++) {
2790           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2791         }
2792       }
2793       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2794 
2795       switch (sizes[i]) {
2796       case 1:
2797         /* Create matrix data structure */
2798         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2799         ibdiag[cnt] = 1.0/ibdiag[cnt];
2800         break;
2801       case 2:
2802         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2803         break;
2804       case 3:
2805         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2806         break;
2807       case 4:
2808         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2809         break;
2810       case 5:
2811         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2812         break;
2813       default:
2814         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2815       }
2816       cnt += sizes[i]*sizes[i];
2817       row += sizes[i];
2818     }
2819     a->inode.ibdiagvalid = PETSC_TRUE;
2820   }
2821   ibdiag = a->inode.ibdiag;
2822   bdiag  = a->inode.bdiag;
2823   t      = a->inode.ssor_work;
2824 
2825   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2826   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2827   CHKMEMQ;
2828   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2829   if (flag & SOR_ZERO_INITIAL_GUESS) {
2830     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2831 
2832       for (i=0, row=0; i<m; i++) {
2833         sz  = diag[row] - ii[row];
2834         v1  = a->a + ii[row];
2835         idx = a->j + ii[row];
2836 
2837         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2838         switch (sizes[i]) {
2839         case 1:
2840 
2841           sum1 = b[row];
2842           for (n = 0; n<sz-1; n+=2) {
2843             i1    = idx[0];
2844             i2    = idx[1];
2845             idx  += 2;
2846             tmp0  = x[i1];
2847             tmp1  = x[i2];
2848             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2849           }
2850 
2851           if (n == sz-1) {
2852             tmp0  = x[*idx];
2853             sum1 -= *v1 * tmp0;
2854           }
2855           t[row]   = sum1;
2856           x[row++] = sum1*(*ibdiag++);
2857           break;
2858         case 2:
2859           v2   = a->a + ii[row+1];
2860           sum1 = b[row];
2861           sum2 = b[row+1];
2862           for (n = 0; n<sz-1; n+=2) {
2863             i1    = idx[0];
2864             i2    = idx[1];
2865             idx  += 2;
2866             tmp0  = x[i1];
2867             tmp1  = x[i2];
2868             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2869             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2870           }
2871 
2872           if (n == sz-1) {
2873             tmp0  = x[*idx];
2874             sum1 -= v1[0] * tmp0;
2875             sum2 -= v2[0] * tmp0;
2876           }
2877           t[row]   = sum1;
2878           t[row+1] = sum2;
2879           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2880           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2881           ibdiag  += 4;
2882           break;
2883         case 3:
2884           v2   = a->a + ii[row+1];
2885           v3   = a->a + ii[row+2];
2886           sum1 = b[row];
2887           sum2 = b[row+1];
2888           sum3 = b[row+2];
2889           for (n = 0; n<sz-1; n+=2) {
2890             i1    = idx[0];
2891             i2    = idx[1];
2892             idx  += 2;
2893             tmp0  = x[i1];
2894             tmp1  = x[i2];
2895             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2896             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2897             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2898           }
2899 
2900           if (n == sz-1) {
2901             tmp0  = x[*idx];
2902             sum1 -= v1[0] * tmp0;
2903             sum2 -= v2[0] * tmp0;
2904             sum3 -= v3[0] * tmp0;
2905           }
2906           t[row]   = sum1;
2907           t[row+1] = sum2;
2908           t[row+2] = sum3;
2909           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2910           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2911           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2912           ibdiag  += 9;
2913           break;
2914         case 4:
2915           v2   = a->a + ii[row+1];
2916           v3   = a->a + ii[row+2];
2917           v4   = a->a + ii[row+3];
2918           sum1 = b[row];
2919           sum2 = b[row+1];
2920           sum3 = b[row+2];
2921           sum4 = b[row+3];
2922           for (n = 0; n<sz-1; n+=2) {
2923             i1    = idx[0];
2924             i2    = idx[1];
2925             idx  += 2;
2926             tmp0  = x[i1];
2927             tmp1  = x[i2];
2928             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2929             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2930             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2931             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2932           }
2933 
2934           if (n == sz-1) {
2935             tmp0  = x[*idx];
2936             sum1 -= v1[0] * tmp0;
2937             sum2 -= v2[0] * tmp0;
2938             sum3 -= v3[0] * tmp0;
2939             sum4 -= v4[0] * tmp0;
2940           }
2941           t[row]   = sum1;
2942           t[row+1] = sum2;
2943           t[row+2] = sum3;
2944           t[row+3] = sum4;
2945           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2946           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2947           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2948           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2949           ibdiag  += 16;
2950           break;
2951         case 5:
2952           v2   = a->a + ii[row+1];
2953           v3   = a->a + ii[row+2];
2954           v4   = a->a + ii[row+3];
2955           v5   = a->a + ii[row+4];
2956           sum1 = b[row];
2957           sum2 = b[row+1];
2958           sum3 = b[row+2];
2959           sum4 = b[row+3];
2960           sum5 = b[row+4];
2961           for (n = 0; n<sz-1; n+=2) {
2962             i1    = idx[0];
2963             i2    = idx[1];
2964             idx  += 2;
2965             tmp0  = x[i1];
2966             tmp1  = x[i2];
2967             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2968             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2969             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2970             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2971             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2972           }
2973 
2974           if (n == sz-1) {
2975             tmp0  = x[*idx];
2976             sum1 -= v1[0] * tmp0;
2977             sum2 -= v2[0] * tmp0;
2978             sum3 -= v3[0] * tmp0;
2979             sum4 -= v4[0] * tmp0;
2980             sum5 -= v5[0] * tmp0;
2981           }
2982           t[row]   = sum1;
2983           t[row+1] = sum2;
2984           t[row+2] = sum3;
2985           t[row+3] = sum4;
2986           t[row+4] = sum5;
2987           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2988           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2989           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2990           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2991           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2992           ibdiag  += 25;
2993           break;
2994         default:
2995           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2996         }
2997       }
2998 
2999       xb   = t;
3000       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3001     } else xb = b;
3002     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3003 
3004       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3005       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3006         ibdiag -= sizes[i]*sizes[i];
3007         sz      = ii[row+1] - diag[row] - 1;
3008         v1      = a->a + diag[row] + 1;
3009         idx     = a->j + diag[row] + 1;
3010 
3011         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3012         switch (sizes[i]) {
3013         case 1:
3014 
3015           sum1 = xb[row];
3016           for (n = 0; n<sz-1; n+=2) {
3017             i1    = idx[0];
3018             i2    = idx[1];
3019             idx  += 2;
3020             tmp0  = x[i1];
3021             tmp1  = x[i2];
3022             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3023           }
3024 
3025           if (n == sz-1) {
3026             tmp0  = x[*idx];
3027             sum1 -= *v1*tmp0;
3028           }
3029           x[row--] = sum1*(*ibdiag);
3030           break;
3031 
3032         case 2:
3033 
3034           sum1 = xb[row];
3035           sum2 = xb[row-1];
3036           /* note that sum1 is associated with the second of the two rows */
3037           v2 = a->a + diag[row-1] + 2;
3038           for (n = 0; n<sz-1; n+=2) {
3039             i1    = idx[0];
3040             i2    = idx[1];
3041             idx  += 2;
3042             tmp0  = x[i1];
3043             tmp1  = x[i2];
3044             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3045             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3046           }
3047 
3048           if (n == sz-1) {
3049             tmp0  = x[*idx];
3050             sum1 -= *v1*tmp0;
3051             sum2 -= *v2*tmp0;
3052           }
3053           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3054           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3055           break;
3056         case 3:
3057 
3058           sum1 = xb[row];
3059           sum2 = xb[row-1];
3060           sum3 = xb[row-2];
3061           v2   = a->a + diag[row-1] + 2;
3062           v3   = a->a + diag[row-2] + 3;
3063           for (n = 0; n<sz-1; n+=2) {
3064             i1    = idx[0];
3065             i2    = idx[1];
3066             idx  += 2;
3067             tmp0  = x[i1];
3068             tmp1  = x[i2];
3069             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3070             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3071             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3072           }
3073 
3074           if (n == sz-1) {
3075             tmp0  = x[*idx];
3076             sum1 -= *v1*tmp0;
3077             sum2 -= *v2*tmp0;
3078             sum3 -= *v3*tmp0;
3079           }
3080           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3081           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3082           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3083           break;
3084         case 4:
3085 
3086           sum1 = xb[row];
3087           sum2 = xb[row-1];
3088           sum3 = xb[row-2];
3089           sum4 = xb[row-3];
3090           v2   = a->a + diag[row-1] + 2;
3091           v3   = a->a + diag[row-2] + 3;
3092           v4   = a->a + diag[row-3] + 4;
3093           for (n = 0; n<sz-1; n+=2) {
3094             i1    = idx[0];
3095             i2    = idx[1];
3096             idx  += 2;
3097             tmp0  = x[i1];
3098             tmp1  = x[i2];
3099             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3100             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3101             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3102             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3103           }
3104 
3105           if (n == sz-1) {
3106             tmp0  = x[*idx];
3107             sum1 -= *v1*tmp0;
3108             sum2 -= *v2*tmp0;
3109             sum3 -= *v3*tmp0;
3110             sum4 -= *v4*tmp0;
3111           }
3112           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3113           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3114           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3115           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3116           break;
3117         case 5:
3118 
3119           sum1 = xb[row];
3120           sum2 = xb[row-1];
3121           sum3 = xb[row-2];
3122           sum4 = xb[row-3];
3123           sum5 = xb[row-4];
3124           v2   = a->a + diag[row-1] + 2;
3125           v3   = a->a + diag[row-2] + 3;
3126           v4   = a->a + diag[row-3] + 4;
3127           v5   = a->a + diag[row-4] + 5;
3128           for (n = 0; n<sz-1; n+=2) {
3129             i1    = idx[0];
3130             i2    = idx[1];
3131             idx  += 2;
3132             tmp0  = x[i1];
3133             tmp1  = x[i2];
3134             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3135             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3136             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3137             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3138             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3139           }
3140 
3141           if (n == sz-1) {
3142             tmp0  = x[*idx];
3143             sum1 -= *v1*tmp0;
3144             sum2 -= *v2*tmp0;
3145             sum3 -= *v3*tmp0;
3146             sum4 -= *v4*tmp0;
3147             sum5 -= *v5*tmp0;
3148           }
3149           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3150           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3151           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3152           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3153           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3154           break;
3155         default:
3156           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3157         }
3158       }
3159 
3160       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3161     }
3162     its--;
3163   }
3164   while (its--) {
3165 
3166     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3167       ibdiag = a->inode.ibdiag;
3168 
3169       for (i=0, row=0; i<m; i++) {
3170         sz  = ii[row + 1] - ii[row];
3171         v1  = a->a + ii[row];
3172         idx = a->j + ii[row];
3173 
3174         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3175         switch (sizes[i]) {
3176         case 1:
3177 
3178           sum1 = b[row];
3179           for (n = 0; n<sz-1; n+=2) {
3180             i1    = idx[0];
3181             i2    = idx[1];
3182             idx  += 2;
3183             tmp0  = x[i1];
3184             tmp1  = x[i2];
3185             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3186           }
3187 
3188           if (n == sz-1) {
3189             tmp0  = x[*idx];
3190             sum1 -= *v1 * tmp0;
3191           }
3192 
3193           /* in MatSOR_SeqAIJ this line would be
3194            *
3195            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3196            *
3197            * but omega == 1, so this becomes
3198            *
3199            * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++);
3200            *
3201            * but bdiag and ibdiag cancel each other, so we can change this
3202            * to adding sum1*(*ibdiag++).  We can skip bdiag for the larger
3203            * block sizes as well
3204            */
3205           x[row++] += sum1*(*ibdiag++);
3206           break;
3207         case 2:
3208           v2   = a->a + ii[row+1];
3209           sum1 = b[row];
3210           sum2 = b[row+1];
3211           for (n = 0; n<sz-1; n+=2) {
3212             i1    = idx[0];
3213             i2    = idx[1];
3214             idx  += 2;
3215             tmp0  = x[i1];
3216             tmp1  = x[i2];
3217             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3218             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3219           }
3220 
3221           if (n == sz-1) {
3222             tmp0  = x[*idx];
3223             sum1 -= v1[0] * tmp0;
3224             sum2 -= v2[0] * tmp0;
3225           }
3226           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2];
3227           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3];
3228           ibdiag   += 4;
3229           break;
3230         case 3:
3231           v2   = a->a + ii[row+1];
3232           v3   = a->a + ii[row+2];
3233           sum1 = b[row];
3234           sum2 = b[row+1];
3235           sum3 = b[row+2];
3236           for (n = 0; n<sz-1; n+=2) {
3237             i1    = idx[0];
3238             i2    = idx[1];
3239             idx  += 2;
3240             tmp0  = x[i1];
3241             tmp1  = x[i2];
3242             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3243             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3244             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3245           }
3246 
3247           if (n == sz-1) {
3248             tmp0  = x[*idx];
3249             sum1 -= v1[0] * tmp0;
3250             sum2 -= v2[0] * tmp0;
3251             sum3 -= v3[0] * tmp0;
3252           }
3253           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3254           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3255           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3256           ibdiag   += 9;
3257           break;
3258         case 4:
3259           v2   = a->a + ii[row+1];
3260           v3   = a->a + ii[row+2];
3261           v4   = a->a + ii[row+3];
3262           sum1 = b[row];
3263           sum2 = b[row+1];
3264           sum3 = b[row+2];
3265           sum4 = b[row+3];
3266           for (n = 0; n<sz-1; n+=2) {
3267             i1    = idx[0];
3268             i2    = idx[1];
3269             idx  += 2;
3270             tmp0  = x[i1];
3271             tmp1  = x[i2];
3272             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3273             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3274             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3275             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3276           }
3277 
3278           if (n == sz-1) {
3279             tmp0  = x[*idx];
3280             sum1 -= v1[0] * tmp0;
3281             sum2 -= v2[0] * tmp0;
3282             sum3 -= v3[0] * tmp0;
3283             sum4 -= v4[0] * tmp0;
3284           }
3285           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3286           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3287           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3288           x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3289           ibdiag   += 16;
3290           break;
3291         case 5:
3292           v2   = a->a + ii[row+1];
3293           v3   = a->a + ii[row+2];
3294           v4   = a->a + ii[row+3];
3295           v5   = a->a + ii[row+4];
3296           sum1 = b[row];
3297           sum2 = b[row+1];
3298           sum3 = b[row+2];
3299           sum4 = b[row+3];
3300           sum5 = b[row+4];
3301           for (n = 0; n<sz-1; n+=2) {
3302             i1    = idx[0];
3303             i2    = idx[1];
3304             idx  += 2;
3305             tmp0  = x[i1];
3306             tmp1  = x[i2];
3307             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3308             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3309             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3310             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3311             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3312           }
3313 
3314           if (n == sz-1) {
3315             tmp0  = x[*idx];
3316             sum1 -= v1[0] * tmp0;
3317             sum2 -= v2[0] * tmp0;
3318             sum3 -= v3[0] * tmp0;
3319             sum4 -= v4[0] * tmp0;
3320             sum5 -= v5[0] * tmp0;
3321           }
3322           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3323           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3324           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3325           x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3326           x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3327           ibdiag   += 25;
3328           break;
3329         default:
3330           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3331         }
3332       }
3333 
3334       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3335     }
3336     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3337 
3338       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3339       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3340         ibdiag -= sizes[i]*sizes[i];
3341         sz      = ii[row+1] - ii[row];
3342         v1      = a->a + ii[row];
3343         idx     = a->j + ii[row];
3344 
3345         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3346         switch (sizes[i]) {
3347         case 1:
3348 
3349           sum1 = b[row];
3350           for (n = 0; n<sz-1; n+=2) {
3351             i1    = idx[0];
3352             i2    = idx[1];
3353             idx  += 2;
3354             tmp0  = x[i1];
3355             tmp1  = x[i2];
3356             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3357           }
3358 
3359           if (n == sz-1) {
3360             tmp0  = x[*idx];
3361             sum1 -= *v1*tmp0;
3362           }
3363           x[row--] += sum1*(*ibdiag);
3364           break;
3365 
3366         case 2:
3367 
3368           sum1 = b[row];
3369           sum2 = b[row-1];
3370           /* note that sum1 is associated with the second of the two rows */
3371           v2 = a->a + ii[row - 1];
3372           for (n = 0; n<sz-1; n+=2) {
3373             i1    = idx[0];
3374             i2    = idx[1];
3375             idx  += 2;
3376             tmp0  = x[i1];
3377             tmp1  = x[i2];
3378             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3379             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3380           }
3381 
3382           if (n == sz-1) {
3383             tmp0  = x[*idx];
3384             sum1 -= *v1*tmp0;
3385             sum2 -= *v2*tmp0;
3386           }
3387           x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3388           x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3389           break;
3390         case 3:
3391 
3392           sum1 = b[row];
3393           sum2 = b[row-1];
3394           sum3 = b[row-2];
3395           v2   = a->a + ii[row-1];
3396           v3   = a->a + ii[row-2];
3397           for (n = 0; n<sz-1; n+=2) {
3398             i1    = idx[0];
3399             i2    = idx[1];
3400             idx  += 2;
3401             tmp0  = x[i1];
3402             tmp1  = x[i2];
3403             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3404             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3405             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3406           }
3407 
3408           if (n == sz-1) {
3409             tmp0  = x[*idx];
3410             sum1 -= *v1*tmp0;
3411             sum2 -= *v2*tmp0;
3412             sum3 -= *v3*tmp0;
3413           }
3414           x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3415           x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3416           x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3417           break;
3418         case 4:
3419 
3420           sum1 = b[row];
3421           sum2 = b[row-1];
3422           sum3 = b[row-2];
3423           sum4 = b[row-3];
3424           v2   = a->a + ii[row-1];
3425           v3   = a->a + ii[row-2];
3426           v4   = a->a + ii[row-3];
3427           for (n = 0; n<sz-1; n+=2) {
3428             i1    = idx[0];
3429             i2    = idx[1];
3430             idx  += 2;
3431             tmp0  = x[i1];
3432             tmp1  = x[i2];
3433             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3434             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3435             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3436             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3437           }
3438 
3439           if (n == sz-1) {
3440             tmp0  = x[*idx];
3441             sum1 -= *v1*tmp0;
3442             sum2 -= *v2*tmp0;
3443             sum3 -= *v3*tmp0;
3444             sum4 -= *v4*tmp0;
3445           }
3446           x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3447           x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3448           x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3449           x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3450           break;
3451         case 5:
3452 
3453           sum1 = b[row];
3454           sum2 = b[row-1];
3455           sum3 = b[row-2];
3456           sum4 = b[row-3];
3457           sum5 = b[row-4];
3458           v2   = a->a + ii[row-1];
3459           v3   = a->a + ii[row-2];
3460           v4   = a->a + ii[row-3];
3461           v5   = a->a + ii[row-4];
3462           for (n = 0; n<sz-1; n+=2) {
3463             i1    = idx[0];
3464             i2    = idx[1];
3465             idx  += 2;
3466             tmp0  = x[i1];
3467             tmp1  = x[i2];
3468             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3469             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3470             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3471             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3472             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3473           }
3474 
3475           if (n == sz-1) {
3476             tmp0  = x[*idx];
3477             sum1 -= *v1*tmp0;
3478             sum2 -= *v2*tmp0;
3479             sum3 -= *v3*tmp0;
3480             sum4 -= *v4*tmp0;
3481             sum5 -= *v5*tmp0;
3482           }
3483           x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3484           x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3485           x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3486           x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3487           x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3488           break;
3489         default:
3490           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3491         }
3492       }
3493 
3494       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3495     }
3496   }
3497   if (flag & SOR_EISENSTAT) {
3498     /*
3499           Apply  (U + D)^-1  where D is now the block diagonal
3500     */
3501     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3502     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3503       ibdiag -= sizes[i]*sizes[i];
3504       sz      = ii[row+1] - diag[row] - 1;
3505       v1      = a->a + diag[row] + 1;
3506       idx     = a->j + diag[row] + 1;
3507       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3508       switch (sizes[i]) {
3509       case 1:
3510 
3511         sum1 = b[row];
3512         for (n = 0; n<sz-1; n+=2) {
3513           i1    = idx[0];
3514           i2    = idx[1];
3515           idx  += 2;
3516           tmp0  = x[i1];
3517           tmp1  = x[i2];
3518           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3519         }
3520 
3521         if (n == sz-1) {
3522           tmp0  = x[*idx];
3523           sum1 -= *v1*tmp0;
3524         }
3525         x[row] = sum1*(*ibdiag);row--;
3526         break;
3527 
3528       case 2:
3529 
3530         sum1 = b[row];
3531         sum2 = b[row-1];
3532         /* note that sum1 is associated with the second of the two rows */
3533         v2 = a->a + diag[row-1] + 2;
3534         for (n = 0; n<sz-1; n+=2) {
3535           i1    = idx[0];
3536           i2    = idx[1];
3537           idx  += 2;
3538           tmp0  = x[i1];
3539           tmp1  = x[i2];
3540           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3541           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3542         }
3543 
3544         if (n == sz-1) {
3545           tmp0  = x[*idx];
3546           sum1 -= *v1*tmp0;
3547           sum2 -= *v2*tmp0;
3548         }
3549         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3550         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3551         row     -= 2;
3552         break;
3553       case 3:
3554 
3555         sum1 = b[row];
3556         sum2 = b[row-1];
3557         sum3 = b[row-2];
3558         v2   = a->a + diag[row-1] + 2;
3559         v3   = a->a + diag[row-2] + 3;
3560         for (n = 0; n<sz-1; n+=2) {
3561           i1    = idx[0];
3562           i2    = idx[1];
3563           idx  += 2;
3564           tmp0  = x[i1];
3565           tmp1  = x[i2];
3566           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3567           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3568           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3569         }
3570 
3571         if (n == sz-1) {
3572           tmp0  = x[*idx];
3573           sum1 -= *v1*tmp0;
3574           sum2 -= *v2*tmp0;
3575           sum3 -= *v3*tmp0;
3576         }
3577         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3578         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3579         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3580         row     -= 3;
3581         break;
3582       case 4:
3583 
3584         sum1 = b[row];
3585         sum2 = b[row-1];
3586         sum3 = b[row-2];
3587         sum4 = b[row-3];
3588         v2   = a->a + diag[row-1] + 2;
3589         v3   = a->a + diag[row-2] + 3;
3590         v4   = a->a + diag[row-3] + 4;
3591         for (n = 0; n<sz-1; n+=2) {
3592           i1    = idx[0];
3593           i2    = idx[1];
3594           idx  += 2;
3595           tmp0  = x[i1];
3596           tmp1  = x[i2];
3597           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3598           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3599           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3600           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3601         }
3602 
3603         if (n == sz-1) {
3604           tmp0  = x[*idx];
3605           sum1 -= *v1*tmp0;
3606           sum2 -= *v2*tmp0;
3607           sum3 -= *v3*tmp0;
3608           sum4 -= *v4*tmp0;
3609         }
3610         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3611         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3612         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3613         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3614         row     -= 4;
3615         break;
3616       case 5:
3617 
3618         sum1 = b[row];
3619         sum2 = b[row-1];
3620         sum3 = b[row-2];
3621         sum4 = b[row-3];
3622         sum5 = b[row-4];
3623         v2   = a->a + diag[row-1] + 2;
3624         v3   = a->a + diag[row-2] + 3;
3625         v4   = a->a + diag[row-3] + 4;
3626         v5   = a->a + diag[row-4] + 5;
3627         for (n = 0; n<sz-1; n+=2) {
3628           i1    = idx[0];
3629           i2    = idx[1];
3630           idx  += 2;
3631           tmp0  = x[i1];
3632           tmp1  = x[i2];
3633           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3634           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3635           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3636           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3637           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3638         }
3639 
3640         if (n == sz-1) {
3641           tmp0  = x[*idx];
3642           sum1 -= *v1*tmp0;
3643           sum2 -= *v2*tmp0;
3644           sum3 -= *v3*tmp0;
3645           sum4 -= *v4*tmp0;
3646           sum5 -= *v5*tmp0;
3647         }
3648         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3649         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3650         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3651         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3652         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3653         row     -= 5;
3654         break;
3655       default:
3656         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3657       }
3658     }
3659     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3660 
3661     /*
3662            t = b - D x    where D is the block diagonal
3663     */
3664     cnt = 0;
3665     for (i=0, row=0; i<m; i++) {
3666       switch (sizes[i]) {
3667       case 1:
3668         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3669         break;
3670       case 2:
3671         x1       = x[row]; x2 = x[row+1];
3672         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3673         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3674         t[row]   = b[row] - tmp1;
3675         t[row+1] = b[row+1] - tmp2; row += 2;
3676         cnt     += 4;
3677         break;
3678       case 3:
3679         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3680         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3681         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3682         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3683         t[row]   = b[row] - tmp1;
3684         t[row+1] = b[row+1] - tmp2;
3685         t[row+2] = b[row+2] - tmp3; row += 3;
3686         cnt     += 9;
3687         break;
3688       case 4:
3689         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3690         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3691         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3692         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3693         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3694         t[row]   = b[row] - tmp1;
3695         t[row+1] = b[row+1] - tmp2;
3696         t[row+2] = b[row+2] - tmp3;
3697         t[row+3] = b[row+3] - tmp4; row += 4;
3698         cnt     += 16;
3699         break;
3700       case 5:
3701         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3702         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3703         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3704         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3705         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3706         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3707         t[row]   = b[row] - tmp1;
3708         t[row+1] = b[row+1] - tmp2;
3709         t[row+2] = b[row+2] - tmp3;
3710         t[row+3] = b[row+3] - tmp4;
3711         t[row+4] = b[row+4] - tmp5;row += 5;
3712         cnt     += 25;
3713         break;
3714       default:
3715         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3716       }
3717     }
3718     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3719 
3720 
3721 
3722     /*
3723           Apply (L + D)^-1 where D is the block diagonal
3724     */
3725     for (i=0, row=0; i<m; i++) {
3726       sz  = diag[row] - ii[row];
3727       v1  = a->a + ii[row];
3728       idx = a->j + ii[row];
3729       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3730       switch (sizes[i]) {
3731       case 1:
3732 
3733         sum1 = t[row];
3734         for (n = 0; n<sz-1; n+=2) {
3735           i1    = idx[0];
3736           i2    = idx[1];
3737           idx  += 2;
3738           tmp0  = t[i1];
3739           tmp1  = t[i2];
3740           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3741         }
3742 
3743         if (n == sz-1) {
3744           tmp0  = t[*idx];
3745           sum1 -= *v1 * tmp0;
3746         }
3747         x[row] += t[row] = sum1*(*ibdiag++); row++;
3748         break;
3749       case 2:
3750         v2   = a->a + ii[row+1];
3751         sum1 = t[row];
3752         sum2 = t[row+1];
3753         for (n = 0; n<sz-1; n+=2) {
3754           i1    = idx[0];
3755           i2    = idx[1];
3756           idx  += 2;
3757           tmp0  = t[i1];
3758           tmp1  = t[i2];
3759           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3760           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3761         }
3762 
3763         if (n == sz-1) {
3764           tmp0  = t[*idx];
3765           sum1 -= v1[0] * tmp0;
3766           sum2 -= v2[0] * tmp0;
3767         }
3768         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3769         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3770         ibdiag   += 4; row += 2;
3771         break;
3772       case 3:
3773         v2   = a->a + ii[row+1];
3774         v3   = a->a + ii[row+2];
3775         sum1 = t[row];
3776         sum2 = t[row+1];
3777         sum3 = t[row+2];
3778         for (n = 0; n<sz-1; n+=2) {
3779           i1    = idx[0];
3780           i2    = idx[1];
3781           idx  += 2;
3782           tmp0  = t[i1];
3783           tmp1  = t[i2];
3784           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3785           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3786           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3787         }
3788 
3789         if (n == sz-1) {
3790           tmp0  = t[*idx];
3791           sum1 -= v1[0] * tmp0;
3792           sum2 -= v2[0] * tmp0;
3793           sum3 -= v3[0] * tmp0;
3794         }
3795         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3796         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3797         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3798         ibdiag   += 9; row += 3;
3799         break;
3800       case 4:
3801         v2   = a->a + ii[row+1];
3802         v3   = a->a + ii[row+2];
3803         v4   = a->a + ii[row+3];
3804         sum1 = t[row];
3805         sum2 = t[row+1];
3806         sum3 = t[row+2];
3807         sum4 = t[row+3];
3808         for (n = 0; n<sz-1; n+=2) {
3809           i1    = idx[0];
3810           i2    = idx[1];
3811           idx  += 2;
3812           tmp0  = t[i1];
3813           tmp1  = t[i2];
3814           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3815           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3816           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3817           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3818         }
3819 
3820         if (n == sz-1) {
3821           tmp0  = t[*idx];
3822           sum1 -= v1[0] * tmp0;
3823           sum2 -= v2[0] * tmp0;
3824           sum3 -= v3[0] * tmp0;
3825           sum4 -= v4[0] * tmp0;
3826         }
3827         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3828         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3829         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3830         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3831         ibdiag   += 16; row += 4;
3832         break;
3833       case 5:
3834         v2   = a->a + ii[row+1];
3835         v3   = a->a + ii[row+2];
3836         v4   = a->a + ii[row+3];
3837         v5   = a->a + ii[row+4];
3838         sum1 = t[row];
3839         sum2 = t[row+1];
3840         sum3 = t[row+2];
3841         sum4 = t[row+3];
3842         sum5 = t[row+4];
3843         for (n = 0; n<sz-1; n+=2) {
3844           i1    = idx[0];
3845           i2    = idx[1];
3846           idx  += 2;
3847           tmp0  = t[i1];
3848           tmp1  = t[i2];
3849           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3850           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3851           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3852           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3853           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3854         }
3855 
3856         if (n == sz-1) {
3857           tmp0  = t[*idx];
3858           sum1 -= v1[0] * tmp0;
3859           sum2 -= v2[0] * tmp0;
3860           sum3 -= v3[0] * tmp0;
3861           sum4 -= v4[0] * tmp0;
3862           sum5 -= v5[0] * tmp0;
3863         }
3864         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3865         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3866         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3867         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3868         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3869         ibdiag   += 25; row += 5;
3870         break;
3871       default:
3872         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3873       }
3874     }
3875     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3876   }
3877   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3878   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3879   CHKMEMQ;  PetscFunctionReturn(0);
3880 }
3881 
3882 #undef __FUNCT__
3883 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3884 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3885 {
3886   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3887   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3888   const MatScalar   *bdiag = a->inode.bdiag;
3889   const PetscScalar *b;
3890   PetscErrorCode    ierr;
3891   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3892   const PetscInt    *sizes = a->inode.size;
3893 
3894   PetscFunctionBegin;
3895   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3896   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3897   cnt  = 0;
3898   for (i=0, row=0; i<m; i++) {
3899     switch (sizes[i]) {
3900     case 1:
3901       x[row] = b[row]*bdiag[cnt++];row++;
3902       break;
3903     case 2:
3904       x1       = b[row]; x2 = b[row+1];
3905       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3906       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3907       x[row++] = tmp1;
3908       x[row++] = tmp2;
3909       cnt     += 4;
3910       break;
3911     case 3:
3912       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3913       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3914       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3915       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3916       x[row++] = tmp1;
3917       x[row++] = tmp2;
3918       x[row++] = tmp3;
3919       cnt     += 9;
3920       break;
3921     case 4:
3922       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3923       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3924       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3925       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3926       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3927       x[row++] = tmp1;
3928       x[row++] = tmp2;
3929       x[row++] = tmp3;
3930       x[row++] = tmp4;
3931       cnt     += 16;
3932       break;
3933     case 5:
3934       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3935       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3936       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3937       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3938       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3939       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3940       x[row++] = tmp1;
3941       x[row++] = tmp2;
3942       x[row++] = tmp3;
3943       x[row++] = tmp4;
3944       x[row++] = tmp5;
3945       cnt     += 25;
3946       break;
3947     default:
3948       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3949     }
3950   }
3951   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3952   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3953   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3954   PetscFunctionReturn(0);
3955 }
3956 
3957 /*
3958     samestructure indicates that the matrix has not changed its nonzero structure so we
3959     do not need to recompute the inodes
3960 */
3961 #undef __FUNCT__
3962 #define __FUNCT__ "Mat_CheckInode"
3963 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure)
3964 {
3965   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3966   PetscErrorCode ierr;
3967   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3968   PetscBool      flag;
3969 
3970   PetscFunctionBegin;
3971   if (!a->inode.use) PetscFunctionReturn(0);
3972   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3973 
3974 
3975   m = A->rmap->n;
3976   if (a->inode.size) ns = a->inode.size;
3977   else {
3978     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
3979   }
3980 
3981   i          = 0;
3982   node_count = 0;
3983   idx        = a->j;
3984   ii         = a->i;
3985   while (i < m) {                /* For each row */
3986     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3987     /* Limits the number of elements in a node to 'a->inode.limit' */
3988     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3989       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
3990       if (nzy != nzx) break;
3991       idy += nzx;              /* Same nonzero pattern */
3992       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3993       if (!flag) break;
3994     }
3995     ns[node_count++] = blk_size;
3996     idx             += blk_size*nzx;
3997     i                = j;
3998   }
3999   /* If not enough inodes found,, do not use inode version of the routines */
4000   if (!m || node_count > .8*m) {
4001     ierr = PetscFree(ns);CHKERRQ(ierr);
4002 
4003     a->inode.node_count       = 0;
4004     a->inode.size             = NULL;
4005     a->inode.use              = PETSC_FALSE;
4006     A->ops->mult              = MatMult_SeqAIJ;
4007     A->ops->sor               = MatSOR_SeqAIJ;
4008     A->ops->multadd           = MatMultAdd_SeqAIJ;
4009     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4010     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4011     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4012     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4013     A->ops->coloringpatch     = 0;
4014     A->ops->multdiagonalblock = 0;
4015 
4016     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4017   } else {
4018     if (!A->factortype) {
4019       A->ops->mult              = MatMult_SeqAIJ_Inode;
4020       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4021       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4022       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4023       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4024       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4025       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4026       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4027       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4028     } else {
4029       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4030     }
4031     a->inode.node_count = node_count;
4032     a->inode.size       = ns;
4033     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4034   }
4035   a->inode.checked = PETSC_TRUE;
4036   PetscFunctionReturn(0);
4037 }
4038 
4039 #undef __FUNCT__
4040 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4041 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4042 {
4043   Mat            B =*C;
4044   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4045   PetscErrorCode ierr;
4046   PetscInt       m=A->rmap->n;
4047 
4048   PetscFunctionBegin;
4049   c->inode.use       = a->inode.use;
4050   c->inode.limit     = a->inode.limit;
4051   c->inode.max_limit = a->inode.max_limit;
4052   if (a->inode.size) {
4053     ierr                = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr);
4054     c->inode.node_count = a->inode.node_count;
4055     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4056     /* note the table of functions below should match that in Mat_CheckInode() */
4057     if (!B->factortype) {
4058       B->ops->mult              = MatMult_SeqAIJ_Inode;
4059       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4060       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4061       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4062       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4063       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4064       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4065       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4066       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4067     } else {
4068       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4069     }
4070   } else {
4071     c->inode.size       = 0;
4072     c->inode.node_count = 0;
4073   }
4074   c->inode.ibdiagvalid = PETSC_FALSE;
4075   c->inode.ibdiag      = 0;
4076   c->inode.bdiag       = 0;
4077   PetscFunctionReturn(0);
4078 }
4079 
4080 #undef __FUNCT__
4081 #define __FUNCT__ "MatGetRow_FactoredLU"
4082 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
4083 {
4084   PetscInt k, *vi;
4085 
4086   PetscFunctionBegin;
4087   vi = aj + ai[row];
4088   for (k=0; k<nzl; k++) cols[k] = vi[k];
4089   vi        = aj + adiag[row];
4090   cols[nzl] = vi[0];
4091   vi        = aj + adiag[row+1]+1;
4092   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4093   PetscFunctionReturn(0);
4094 }
4095 /*
4096    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4097    Modified from Mat_CheckInode().
4098 
4099    Input Parameters:
4100 +  Mat A - ILU or LU matrix factor
4101 -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
4102     do not need to recompute the inodes
4103 */
4104 #undef __FUNCT__
4105 #define __FUNCT__ "Mat_CheckInode_FactorLU"
4106 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure)
4107 {
4108   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4109   PetscErrorCode ierr;
4110   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4111   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
4112   PetscBool      flag;
4113 
4114   PetscFunctionBegin;
4115   if (!a->inode.use)                     PetscFunctionReturn(0);
4116   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
4117 
4118   m = A->rmap->n;
4119   if (a->inode.size) ns = a->inode.size;
4120   else {
4121     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
4122   }
4123 
4124   i          = 0;
4125   node_count = 0;
4126   while (i < m) {                /* For each row */
4127     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4128     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4129     nzx  = nzl1 + nzu1 + 1;
4130     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
4131     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4132 
4133     /* Limits the number of elements in a node to 'a->inode.limit' */
4134     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4135       nzl2 = ai[j+1] - ai[j];
4136       nzu2 = adiag[j] - adiag[j+1] - 1;
4137       nzy  = nzl2 + nzu2 + 1;
4138       if (nzy != nzx) break;
4139       ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
4140       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4141       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4142       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
4143       ierr = PetscFree(cols2);CHKERRQ(ierr);
4144     }
4145     ns[node_count++] = blk_size;
4146     ierr             = PetscFree(cols1);CHKERRQ(ierr);
4147     i                = j;
4148   }
4149   /* If not enough inodes found,, do not use inode version of the routines */
4150   if (!m || node_count > .8*m) {
4151     ierr = PetscFree(ns);CHKERRQ(ierr);
4152 
4153     a->inode.node_count = 0;
4154     a->inode.size       = NULL;
4155     a->inode.use        = PETSC_FALSE;
4156 
4157     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4158   } else {
4159     A->ops->mult              = 0;
4160     A->ops->sor               = 0;
4161     A->ops->multadd           = 0;
4162     A->ops->getrowij          = 0;
4163     A->ops->restorerowij      = 0;
4164     A->ops->getcolumnij       = 0;
4165     A->ops->restorecolumnij   = 0;
4166     A->ops->coloringpatch     = 0;
4167     A->ops->multdiagonalblock = 0;
4168     A->ops->solve             = MatSolve_SeqAIJ_Inode;
4169     a->inode.node_count       = node_count;
4170     a->inode.size             = ns;
4171 
4172     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4173   }
4174   a->inode.checked = PETSC_TRUE;
4175   PetscFunctionReturn(0);
4176 }
4177 
4178 #undef __FUNCT__
4179 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4180 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4181 {
4182   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4183 
4184   PetscFunctionBegin;
4185   a->inode.ibdiagvalid = PETSC_FALSE;
4186   PetscFunctionReturn(0);
4187 }
4188 
4189 /*
4190      This is really ugly. if inodes are used this replaces the
4191   permutations with ones that correspond to rows/cols of the matrix
4192   rather then inode blocks
4193 */
4194 #undef __FUNCT__
4195 #define __FUNCT__ "MatInodeAdjustForInodes"
4196 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4197 {
4198   PetscErrorCode ierr;
4199 
4200   PetscFunctionBegin;
4201   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4202   PetscFunctionReturn(0);
4203 }
4204 
4205 #undef __FUNCT__
4206 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4207 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4208 {
4209   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4210   PetscErrorCode ierr;
4211   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4212   const PetscInt *ridx,*cidx;
4213   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4214   PetscInt       nslim_col,*ns_col;
4215   IS             ris = *rperm,cis = *cperm;
4216 
4217   PetscFunctionBegin;
4218   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4219   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4220 
4221   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4222   ierr = PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
4223   ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
4224 
4225   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4226   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4227 
4228   /* Form the inode structure for the rows of permuted matric using inv perm*/
4229   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4230 
4231   /* Construct the permutations for rows*/
4232   for (i=0,row = 0; i<nslim_row; ++i) {
4233     indx      = ridx[i];
4234     start_val = tns[indx];
4235     end_val   = tns[indx + 1];
4236     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4237   }
4238 
4239   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4240   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4241 
4242   /* Construct permutations for columns */
4243   for (i=0,col=0; i<nslim_col; ++i) {
4244     indx      = cidx[i];
4245     start_val = tns[indx];
4246     end_val   = tns[indx + 1];
4247     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4248   }
4249 
4250   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4251   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4252   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4253   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4254 
4255   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4256   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4257 
4258   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4259   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4260   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4261   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4262   ierr = PetscFree(tns);CHKERRQ(ierr);
4263   PetscFunctionReturn(0);
4264 }
4265 
4266 #undef __FUNCT__
4267 #define __FUNCT__ "MatInodeGetInodeSizes"
4268 /*@C
4269    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4270 
4271    Not Collective
4272 
4273    Input Parameter:
4274 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4275 
4276    Output Parameter:
4277 +  node_count - no of inodes present in the matrix.
4278 .  sizes      - an array of size node_count,with sizes of each inode.
4279 -  limit      - the max size used to generate the inodes.
4280 
4281    Level: advanced
4282 
4283    Notes: This routine returns some internal storage information
4284    of the matrix, it is intended to be used by advanced users.
4285    It should be called after the matrix is assembled.
4286    The contents of the sizes[] array should not be changed.
4287    NULL may be passed for information not requested.
4288 
4289 .keywords: matrix, seqaij, get, inode
4290 
4291 .seealso: MatGetInfo()
4292 @*/
4293 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4294 {
4295   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4296 
4297   PetscFunctionBegin;
4298   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4299   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
4300   if (f) {
4301     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4302   }
4303   PetscFunctionReturn(0);
4304 }
4305 
4306 #undef __FUNCT__
4307 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4308 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4309 {
4310   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4311 
4312   PetscFunctionBegin;
4313   if (node_count) *node_count = a->inode.node_count;
4314   if (sizes)      *sizes      = a->inode.size;
4315   if (limit)      *limit      = a->inode.limit;
4316   PetscFunctionReturn(0);
4317 }
4318