xref: /petsc/src/mat/impls/aij/seq/inode.c (revision cc85fe4ded5189db5e5e073ce90ef04de0003fdb)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
67 
68   PetscFunctionBegin;
69   nslim_row = a->inode.node_count;
70   m         = A->rmap->n;
71   n         = A->cmap->n;
72   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
73 
74   /* Use the row_inode as column_inode */
75   nslim_col = nslim_row;
76   ns_col    = ns_row;
77 
78   /* allocate space for reformated inode structure */
79   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
80   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
81   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
82 
83   for (i1=0,col=0; i1<nslim_col; ++i1) {
84     nsz = ns_col[i1];
85     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
86   }
87   /* allocate space for row pointers */
88   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
89   *iia = ia;
90   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
91   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
92 
93   /* determine the number of columns in each row */
94   ia[0] = oshift;
95   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
96 
97     j    = aj + ai[row] + ishift;
98     jmax = aj + ai[row+1] + ishift;
99     col  = *j++ + ishift;
100     i2   = tvc[col];
101     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
102       ia[i1+1]++;
103       ia[i2+1]++;
104       i2++;                     /* Start col of next node */
105       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
106       i2 = tvc[col];
107     }
108     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
109   }
110 
111   /* shift ia[i] to point to next row */
112   for (i1=1; i1<nslim_row+1; i1++) {
113     row        = ia[i1-1];
114     ia[i1]    += row;
115     work[i1-1] = row - oshift;
116   }
117 
118   /* allocate space for column pointers */
119   nz   = ia[nslim_row] + (!ishift);
120   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
121   *jja = ja;
122 
123   /* loop over lower triangular part putting into ja */
124   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
125     j    = aj + ai[row] + ishift;
126     jmax = aj + ai[row+1] + ishift;
127     col  = *j++ + ishift;
128     i2   = tvc[col];
129     while (i2<i1 && j<jmax) {
130       ja[work[i2]++] = i1 + oshift;
131       ja[work[i1]++] = i2 + oshift;
132       ++i2;
133       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
134       i2 = tvc[col];
135     }
136     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
137 
138   }
139   ierr = PetscFree(work);CHKERRQ(ierr);
140   ierr = PetscFree(tns);CHKERRQ(ierr);
141   ierr = PetscFree(tvc);CHKERRQ(ierr);
142   PetscFunctionReturn(0);
143 }
144 
145 /*
146       This builds nonsymmetric version of nonzero structure,
147 */
148 #undef __FUNCT__
149 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
150 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
151 {
152   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
153   PetscErrorCode ierr;
154   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
155   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
156 
157   PetscFunctionBegin;
158   nslim_row = a->inode.node_count;
159   n         = A->cmap->n;
160 
161   /* Create The column_inode for this matrix */
162   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
163 
164   /* allocate space for reformated column_inode structure */
165   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
166   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
167   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
168 
169   for (i1=0,col=0; i1<nslim_col; ++i1) {
170     nsz = ns_col[i1];
171     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
172   }
173   /* allocate space for row pointers */
174   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
175   *iia = ia;
176   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
177   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
178 
179   /* determine the number of columns in each row */
180   ia[0] = oshift;
181   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
182     j   = aj + ai[row] + ishift;
183     col = *j++ + ishift;
184     i2  = tvc[col];
185     nz  = ai[row+1] - ai[row];
186     while (nz-- > 0) {           /* off-diagonal elemets */
187       ia[i1+1]++;
188       i2++;                     /* Start col of next node */
189       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
190       if (nz > 0) i2 = tvc[col];
191     }
192   }
193 
194   /* shift ia[i] to point to next row */
195   for (i1=1; i1<nslim_row+1; i1++) {
196     row        = ia[i1-1];
197     ia[i1]    += row;
198     work[i1-1] = row - oshift;
199   }
200 
201   /* allocate space for column pointers */
202   nz   = ia[nslim_row] + (!ishift);
203   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
204   *jja = ja;
205 
206   /* loop over matrix putting into ja */
207   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
208     j   = aj + ai[row] + ishift;
209     col = *j++ + ishift;
210     i2  = tvc[col];
211     nz  = ai[row+1] - ai[row];
212     while (nz-- > 0) {
213       ja[work[i1]++] = i2 + oshift;
214       ++i2;
215       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
216       if (nz > 0) i2 = tvc[col];
217     }
218   }
219   ierr = PetscFree(ns_col);CHKERRQ(ierr);
220   ierr = PetscFree(work);CHKERRQ(ierr);
221   ierr = PetscFree(tns);CHKERRQ(ierr);
222   ierr = PetscFree(tvc);CHKERRQ(ierr);
223   PetscFunctionReturn(0);
224 }
225 
226 #undef __FUNCT__
227 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
228 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
229 {
230   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
231   PetscErrorCode ierr;
232 
233   PetscFunctionBegin;
234   *n = a->inode.node_count;
235   if (!ia) PetscFunctionReturn(0);
236   if (!blockcompressed) {
237     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
238   } else if (symmetric) {
239     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
240   } else {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   }
243   PetscFunctionReturn(0);
244 }
245 
246 #undef __FUNCT__
247 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
248 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
249 {
250   PetscErrorCode ierr;
251 
252   PetscFunctionBegin;
253   if (!ia) PetscFunctionReturn(0);
254 
255   if (!blockcompressed) {
256     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
257   } else {
258     ierr = PetscFree(*ia);CHKERRQ(ierr);
259     ierr = PetscFree(*ja);CHKERRQ(ierr);
260   }
261   PetscFunctionReturn(0);
262 }
263 
264 /* ----------------------------------------------------------- */
265 
266 #undef __FUNCT__
267 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
268 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
269 {
270   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
271   PetscErrorCode ierr;
272   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
273   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
274 
275   PetscFunctionBegin;
276   nslim_row = a->inode.node_count;
277   n         = A->cmap->n;
278 
279   /* Create The column_inode for this matrix */
280   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
281 
282   /* allocate space for reformated column_inode structure */
283   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
284   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
285   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
286 
287   for (i1=0,col=0; i1<nslim_col; ++i1) {
288     nsz = ns_col[i1];
289     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
290   }
291   /* allocate space for column pointers */
292   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
293   *iia = ia;
294   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
295   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
296 
297   /* determine the number of columns in each row */
298   ia[0] = oshift;
299   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
300     j   = aj + ai[row] + ishift;
301     col = *j++ + ishift;
302     i2  = tvc[col];
303     nz  = ai[row+1] - ai[row];
304     while (nz-- > 0) {           /* off-diagonal elemets */
305       /* ia[i1+1]++; */
306       ia[i2+1]++;
307       i2++;
308       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
309       if (nz > 0) i2 = tvc[col];
310     }
311   }
312 
313   /* shift ia[i] to point to next col */
314   for (i1=1; i1<nslim_col+1; i1++) {
315     col        = ia[i1-1];
316     ia[i1]    += col;
317     work[i1-1] = col - oshift;
318   }
319 
320   /* allocate space for column pointers */
321   nz   = ia[nslim_col] + (!ishift);
322   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
323   *jja = ja;
324 
325   /* loop over matrix putting into ja */
326   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
327     j   = aj + ai[row] + ishift;
328     col = *j++ + ishift;
329     i2  = tvc[col];
330     nz  = ai[row+1] - ai[row];
331     while (nz-- > 0) {
332       /* ja[work[i1]++] = i2 + oshift; */
333       ja[work[i2]++] = i1 + oshift;
334       i2++;
335       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
336       if (nz > 0) i2 = tvc[col];
337     }
338   }
339   ierr = PetscFree(ns_col);CHKERRQ(ierr);
340   ierr = PetscFree(work);CHKERRQ(ierr);
341   ierr = PetscFree(tns);CHKERRQ(ierr);
342   ierr = PetscFree(tvc);CHKERRQ(ierr);
343   PetscFunctionReturn(0);
344 }
345 
346 #undef __FUNCT__
347 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
348 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
349 {
350   PetscErrorCode ierr;
351 
352   PetscFunctionBegin;
353   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
354   if (!ia) PetscFunctionReturn(0);
355 
356   if (!blockcompressed) {
357     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
358   } else if (symmetric) {
359     /* Since the indices are symmetric it does'nt matter */
360     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
361   } else {
362     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   }
364   PetscFunctionReturn(0);
365 }
366 
367 #undef __FUNCT__
368 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
369 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
370 {
371   PetscErrorCode ierr;
372 
373   PetscFunctionBegin;
374   if (!ia) PetscFunctionReturn(0);
375   if (!blockcompressed) {
376     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
377   } else {
378     ierr = PetscFree(*ia);CHKERRQ(ierr);
379     ierr = PetscFree(*ja);CHKERRQ(ierr);
380   }
381   PetscFunctionReturn(0);
382 }
383 
384 /* ----------------------------------------------------------- */
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
388 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
389 {
390   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
391   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
392   PetscScalar       *y;
393   const PetscScalar *x;
394   const MatScalar   *v1,*v2,*v3,*v4,*v5;
395   PetscErrorCode    ierr;
396   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
397 
398 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
399 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
400 #endif
401 
402   PetscFunctionBegin;
403   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
404   node_max = a->inode.node_count;
405   ns       = a->inode.size;     /* Node Size array */
406   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
407   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
408   idx      = a->j;
409   v1       = a->a;
410   ii       = a->i;
411 
412   for (i = 0,row = 0; i< node_max; ++i) {
413     nsz         = ns[i];
414     n           = ii[1] - ii[0];
415     nonzerorow += (n>0)*nsz;
416     ii         += nsz;
417     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
418     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
419     sz = n;                     /* No of non zeros in this row */
420                                 /* Switch on the size of Node */
421     switch (nsz) {               /* Each loop in 'case' is unrolled */
422     case 1:
423       sum1 = 0.;
424 
425       for (n = 0; n< sz-1; n+=2) {
426         i1    = idx[0];         /* The instructions are ordered to */
427         i2    = idx[1];         /* make the compiler's job easy */
428         idx  += 2;
429         tmp0  = x[i1];
430         tmp1  = x[i2];
431         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
432       }
433 
434       if (n == sz-1) {          /* Take care of the last nonzero  */
435         tmp0  = x[*idx++];
436         sum1 += *v1++ *tmp0;
437       }
438       y[row++]=sum1;
439       break;
440     case 2:
441       sum1 = 0.;
442       sum2 = 0.;
443       v2   = v1 + n;
444 
445       for (n = 0; n< sz-1; n+=2) {
446         i1    = idx[0];
447         i2    = idx[1];
448         idx  += 2;
449         tmp0  = x[i1];
450         tmp1  = x[i2];
451         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
452         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
453       }
454       if (n == sz-1) {
455         tmp0  = x[*idx++];
456         sum1 += *v1++ * tmp0;
457         sum2 += *v2++ * tmp0;
458       }
459       y[row++]=sum1;
460       y[row++]=sum2;
461       v1      =v2;              /* Since the next block to be processed starts there*/
462       idx    +=sz;
463       break;
464     case 3:
465       sum1 = 0.;
466       sum2 = 0.;
467       sum3 = 0.;
468       v2   = v1 + n;
469       v3   = v2 + n;
470 
471       for (n = 0; n< sz-1; n+=2) {
472         i1    = idx[0];
473         i2    = idx[1];
474         idx  += 2;
475         tmp0  = x[i1];
476         tmp1  = x[i2];
477         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
478         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
479         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
480       }
481       if (n == sz-1) {
482         tmp0  = x[*idx++];
483         sum1 += *v1++ * tmp0;
484         sum2 += *v2++ * tmp0;
485         sum3 += *v3++ * tmp0;
486       }
487       y[row++]=sum1;
488       y[row++]=sum2;
489       y[row++]=sum3;
490       v1      =v3;              /* Since the next block to be processed starts there*/
491       idx    +=2*sz;
492       break;
493     case 4:
494       sum1 = 0.;
495       sum2 = 0.;
496       sum3 = 0.;
497       sum4 = 0.;
498       v2   = v1 + n;
499       v3   = v2 + n;
500       v4   = v3 + n;
501 
502       for (n = 0; n< sz-1; n+=2) {
503         i1    = idx[0];
504         i2    = idx[1];
505         idx  += 2;
506         tmp0  = x[i1];
507         tmp1  = x[i2];
508         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
509         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
510         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
511         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
512       }
513       if (n == sz-1) {
514         tmp0  = x[*idx++];
515         sum1 += *v1++ * tmp0;
516         sum2 += *v2++ * tmp0;
517         sum3 += *v3++ * tmp0;
518         sum4 += *v4++ * tmp0;
519       }
520       y[row++]=sum1;
521       y[row++]=sum2;
522       y[row++]=sum3;
523       y[row++]=sum4;
524       v1      =v4;              /* Since the next block to be processed starts there*/
525       idx    +=3*sz;
526       break;
527     case 5:
528       sum1 = 0.;
529       sum2 = 0.;
530       sum3 = 0.;
531       sum4 = 0.;
532       sum5 = 0.;
533       v2   = v1 + n;
534       v3   = v2 + n;
535       v4   = v3 + n;
536       v5   = v4 + n;
537 
538       for (n = 0; n<sz-1; n+=2) {
539         i1    = idx[0];
540         i2    = idx[1];
541         idx  += 2;
542         tmp0  = x[i1];
543         tmp1  = x[i2];
544         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
545         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
546         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
547         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
548         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
549       }
550       if (n == sz-1) {
551         tmp0  = x[*idx++];
552         sum1 += *v1++ * tmp0;
553         sum2 += *v2++ * tmp0;
554         sum3 += *v3++ * tmp0;
555         sum4 += *v4++ * tmp0;
556         sum5 += *v5++ * tmp0;
557       }
558       y[row++]=sum1;
559       y[row++]=sum2;
560       y[row++]=sum3;
561       y[row++]=sum4;
562       y[row++]=sum5;
563       v1      =v5;       /* Since the next block to be processed starts there */
564       idx    +=4*sz;
565       break;
566     default:
567       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
568     }
569   }
570   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
571   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
572   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
573   PetscFunctionReturn(0);
574 }
575 /* ----------------------------------------------------------- */
576 /* Almost same code as the MatMult_SeqAIJ_Inode() */
577 #undef __FUNCT__
578 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
579 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
580 {
581   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
582   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
583   MatScalar      *v1,*v2,*v3,*v4,*v5;
584   PetscScalar    *x,*y,*z,*zt;
585   PetscErrorCode ierr;
586   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
587 
588   PetscFunctionBegin;
589   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
590   node_max = a->inode.node_count;
591   ns       = a->inode.size;     /* Node Size array */
592 
593   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
594   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
595   if (zz != yy) {
596     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
597   } else {
598     z = y;
599   }
600   zt = z;
601 
602   idx = a->j;
603   v1  = a->a;
604   ii  = a->i;
605 
606   for (i = 0,row = 0; i< node_max; ++i) {
607     nsz = ns[i];
608     n   = ii[1] - ii[0];
609     ii += nsz;
610     sz  = n;                    /* No of non zeros in this row */
611                                 /* Switch on the size of Node */
612     switch (nsz) {               /* Each loop in 'case' is unrolled */
613     case 1:
614       sum1 = *zt++;
615 
616       for (n = 0; n< sz-1; n+=2) {
617         i1    = idx[0];         /* The instructions are ordered to */
618         i2    = idx[1];         /* make the compiler's job easy */
619         idx  += 2;
620         tmp0  = x[i1];
621         tmp1  = x[i2];
622         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623       }
624 
625       if (n   == sz-1) {          /* Take care of the last nonzero  */
626         tmp0  = x[*idx++];
627         sum1 += *v1++ * tmp0;
628       }
629       y[row++]=sum1;
630       break;
631     case 2:
632       sum1 = *zt++;
633       sum2 = *zt++;
634       v2   = v1 + n;
635 
636       for (n = 0; n< sz-1; n+=2) {
637         i1    = idx[0];
638         i2    = idx[1];
639         idx  += 2;
640         tmp0  = x[i1];
641         tmp1  = x[i2];
642         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644       }
645       if (n   == sz-1) {
646         tmp0  = x[*idx++];
647         sum1 += *v1++ * tmp0;
648         sum2 += *v2++ * tmp0;
649       }
650       y[row++]=sum1;
651       y[row++]=sum2;
652       v1      =v2;              /* Since the next block to be processed starts there*/
653       idx    +=sz;
654       break;
655     case 3:
656       sum1 = *zt++;
657       sum2 = *zt++;
658       sum3 = *zt++;
659       v2   = v1 + n;
660       v3   = v2 + n;
661 
662       for (n = 0; n< sz-1; n+=2) {
663         i1    = idx[0];
664         i2    = idx[1];
665         idx  += 2;
666         tmp0  = x[i1];
667         tmp1  = x[i2];
668         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671       }
672       if (n == sz-1) {
673         tmp0  = x[*idx++];
674         sum1 += *v1++ * tmp0;
675         sum2 += *v2++ * tmp0;
676         sum3 += *v3++ * tmp0;
677       }
678       y[row++]=sum1;
679       y[row++]=sum2;
680       y[row++]=sum3;
681       v1      =v3;              /* Since the next block to be processed starts there*/
682       idx    +=2*sz;
683       break;
684     case 4:
685       sum1 = *zt++;
686       sum2 = *zt++;
687       sum3 = *zt++;
688       sum4 = *zt++;
689       v2   = v1 + n;
690       v3   = v2 + n;
691       v4   = v3 + n;
692 
693       for (n = 0; n< sz-1; n+=2) {
694         i1    = idx[0];
695         i2    = idx[1];
696         idx  += 2;
697         tmp0  = x[i1];
698         tmp1  = x[i2];
699         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703       }
704       if (n == sz-1) {
705         tmp0  = x[*idx++];
706         sum1 += *v1++ * tmp0;
707         sum2 += *v2++ * tmp0;
708         sum3 += *v3++ * tmp0;
709         sum4 += *v4++ * tmp0;
710       }
711       y[row++]=sum1;
712       y[row++]=sum2;
713       y[row++]=sum3;
714       y[row++]=sum4;
715       v1      =v4;              /* Since the next block to be processed starts there*/
716       idx    +=3*sz;
717       break;
718     case 5:
719       sum1 = *zt++;
720       sum2 = *zt++;
721       sum3 = *zt++;
722       sum4 = *zt++;
723       sum5 = *zt++;
724       v2   = v1 + n;
725       v3   = v2 + n;
726       v4   = v3 + n;
727       v5   = v4 + n;
728 
729       for (n = 0; n<sz-1; n+=2) {
730         i1    = idx[0];
731         i2    = idx[1];
732         idx  += 2;
733         tmp0  = x[i1];
734         tmp1  = x[i2];
735         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740       }
741       if (n   == sz-1) {
742         tmp0  = x[*idx++];
743         sum1 += *v1++ * tmp0;
744         sum2 += *v2++ * tmp0;
745         sum3 += *v3++ * tmp0;
746         sum4 += *v4++ * tmp0;
747         sum5 += *v5++ * tmp0;
748       }
749       y[row++]=sum1;
750       y[row++]=sum2;
751       y[row++]=sum3;
752       y[row++]=sum4;
753       y[row++]=sum5;
754       v1      =v5;       /* Since the next block to be processed starts there */
755       idx    +=4*sz;
756       break;
757     default:
758       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759     }
760   }
761   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
762   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
763   if (zz != yy) {
764     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
765   }
766   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
767   PetscFunctionReturn(0);
768 }
769 
770 /* ----------------------------------------------------------- */
771 #undef __FUNCT__
772 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
773 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
774 {
775   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
776   IS                iscol = a->col,isrow = a->row;
777   PetscErrorCode    ierr;
778   const PetscInt    *r,*c,*rout,*cout;
779   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
780   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
781   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
782   PetscScalar       sum1,sum2,sum3,sum4,sum5;
783   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
784   const PetscScalar *b;
785 
786   PetscFunctionBegin;
787   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
788   node_max = a->inode.node_count;
789   ns       = a->inode.size;     /* Node Size array */
790 
791   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
792   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
793   tmp  = a->solve_work;
794 
795   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
796   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
797 
798   /* forward solve the lower triangular */
799   tmps = tmp;
800   aa   = a_a;
801   aj   = a_j;
802   ad   = a->diag;
803 
804   for (i = 0,row = 0; i< node_max; ++i) {
805     nsz = ns[i];
806     aii = ai[row];
807     v1  = aa + aii;
808     vi  = aj + aii;
809     nz  = ad[row]- aii;
810     if (i < node_max-1) {
811       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
812       * but our indexing to determine it's size could. */
813       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
814       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
815       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
816       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
817     }
818 
819     switch (nsz) {               /* Each loop in 'case' is unrolled */
820     case 1:
821       sum1 = b[*r++];
822       for (j=0; j<nz-1; j+=2) {
823         i0    = vi[0];
824         i1    = vi[1];
825         vi   +=2;
826         tmp0  = tmps[i0];
827         tmp1  = tmps[i1];
828         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
829       }
830       if (j == nz-1) {
831         tmp0  = tmps[*vi++];
832         sum1 -= *v1++ *tmp0;
833       }
834       tmp[row++]=sum1;
835       break;
836     case 2:
837       sum1 = b[*r++];
838       sum2 = b[*r++];
839       v2   = aa + ai[row+1];
840 
841       for (j=0; j<nz-1; j+=2) {
842         i0    = vi[0];
843         i1    = vi[1];
844         vi   +=2;
845         tmp0  = tmps[i0];
846         tmp1  = tmps[i1];
847         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
848         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
849       }
850       if (j == nz-1) {
851         tmp0  = tmps[*vi++];
852         sum1 -= *v1++ *tmp0;
853         sum2 -= *v2++ *tmp0;
854       }
855       sum2     -= *v2++ *sum1;
856       tmp[row++]=sum1;
857       tmp[row++]=sum2;
858       break;
859     case 3:
860       sum1 = b[*r++];
861       sum2 = b[*r++];
862       sum3 = b[*r++];
863       v2   = aa + ai[row+1];
864       v3   = aa + ai[row+2];
865 
866       for (j=0; j<nz-1; j+=2) {
867         i0    = vi[0];
868         i1    = vi[1];
869         vi   +=2;
870         tmp0  = tmps[i0];
871         tmp1  = tmps[i1];
872         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
873         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
874         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
875       }
876       if (j == nz-1) {
877         tmp0  = tmps[*vi++];
878         sum1 -= *v1++ *tmp0;
879         sum2 -= *v2++ *tmp0;
880         sum3 -= *v3++ *tmp0;
881       }
882       sum2 -= *v2++ * sum1;
883       sum3 -= *v3++ * sum1;
884       sum3 -= *v3++ * sum2;
885 
886       tmp[row++]=sum1;
887       tmp[row++]=sum2;
888       tmp[row++]=sum3;
889       break;
890 
891     case 4:
892       sum1 = b[*r++];
893       sum2 = b[*r++];
894       sum3 = b[*r++];
895       sum4 = b[*r++];
896       v2   = aa + ai[row+1];
897       v3   = aa + ai[row+2];
898       v4   = aa + ai[row+3];
899 
900       for (j=0; j<nz-1; j+=2) {
901         i0    = vi[0];
902         i1    = vi[1];
903         vi   +=2;
904         tmp0  = tmps[i0];
905         tmp1  = tmps[i1];
906         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
907         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
908         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
909         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
910       }
911       if (j == nz-1) {
912         tmp0  = tmps[*vi++];
913         sum1 -= *v1++ *tmp0;
914         sum2 -= *v2++ *tmp0;
915         sum3 -= *v3++ *tmp0;
916         sum4 -= *v4++ *tmp0;
917       }
918       sum2 -= *v2++ * sum1;
919       sum3 -= *v3++ * sum1;
920       sum4 -= *v4++ * sum1;
921       sum3 -= *v3++ * sum2;
922       sum4 -= *v4++ * sum2;
923       sum4 -= *v4++ * sum3;
924 
925       tmp[row++]=sum1;
926       tmp[row++]=sum2;
927       tmp[row++]=sum3;
928       tmp[row++]=sum4;
929       break;
930     case 5:
931       sum1 = b[*r++];
932       sum2 = b[*r++];
933       sum3 = b[*r++];
934       sum4 = b[*r++];
935       sum5 = b[*r++];
936       v2   = aa + ai[row+1];
937       v3   = aa + ai[row+2];
938       v4   = aa + ai[row+3];
939       v5   = aa + ai[row+4];
940 
941       for (j=0; j<nz-1; j+=2) {
942         i0    = vi[0];
943         i1    = vi[1];
944         vi   +=2;
945         tmp0  = tmps[i0];
946         tmp1  = tmps[i1];
947         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
948         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
949         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
950         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
951         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
952       }
953       if (j == nz-1) {
954         tmp0  = tmps[*vi++];
955         sum1 -= *v1++ *tmp0;
956         sum2 -= *v2++ *tmp0;
957         sum3 -= *v3++ *tmp0;
958         sum4 -= *v4++ *tmp0;
959         sum5 -= *v5++ *tmp0;
960       }
961 
962       sum2 -= *v2++ * sum1;
963       sum3 -= *v3++ * sum1;
964       sum4 -= *v4++ * sum1;
965       sum5 -= *v5++ * sum1;
966       sum3 -= *v3++ * sum2;
967       sum4 -= *v4++ * sum2;
968       sum5 -= *v5++ * sum2;
969       sum4 -= *v4++ * sum3;
970       sum5 -= *v5++ * sum3;
971       sum5 -= *v5++ * sum4;
972 
973       tmp[row++]=sum1;
974       tmp[row++]=sum2;
975       tmp[row++]=sum3;
976       tmp[row++]=sum4;
977       tmp[row++]=sum5;
978       break;
979     default:
980       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
981     }
982   }
983   /* backward solve the upper triangular */
984   for (i=node_max -1,row = n-1; i>=0; i--) {
985     nsz = ns[i];
986     aii = ai[row+1] -1;
987     v1  = aa + aii;
988     vi  = aj + aii;
989     nz  = aii- ad[row];
990     switch (nsz) {               /* Each loop in 'case' is unrolled */
991     case 1:
992       sum1 = tmp[row];
993 
994       for (j=nz; j>1; j-=2) {
995         vi   -=2;
996         i0    = vi[2];
997         i1    = vi[1];
998         tmp0  = tmps[i0];
999         tmp1  = tmps[i1];
1000         v1   -= 2;
1001         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1002       }
1003       if (j==1) {
1004         tmp0  = tmps[*vi--];
1005         sum1 -= *v1-- * tmp0;
1006       }
1007       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1008       break;
1009     case 2:
1010       sum1 = tmp[row];
1011       sum2 = tmp[row -1];
1012       v2   = aa + ai[row]-1;
1013       for (j=nz; j>1; j-=2) {
1014         vi   -=2;
1015         i0    = vi[2];
1016         i1    = vi[1];
1017         tmp0  = tmps[i0];
1018         tmp1  = tmps[i1];
1019         v1   -= 2;
1020         v2   -= 2;
1021         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1022         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1023       }
1024       if (j==1) {
1025         tmp0  = tmps[*vi--];
1026         sum1 -= *v1-- * tmp0;
1027         sum2 -= *v2-- * tmp0;
1028       }
1029 
1030       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1031       sum2   -= *v2-- * tmp0;
1032       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1033       break;
1034     case 3:
1035       sum1 = tmp[row];
1036       sum2 = tmp[row -1];
1037       sum3 = tmp[row -2];
1038       v2   = aa + ai[row]-1;
1039       v3   = aa + ai[row -1]-1;
1040       for (j=nz; j>1; j-=2) {
1041         vi   -=2;
1042         i0    = vi[2];
1043         i1    = vi[1];
1044         tmp0  = tmps[i0];
1045         tmp1  = tmps[i1];
1046         v1   -= 2;
1047         v2   -= 2;
1048         v3   -= 2;
1049         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1050         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1051         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1052       }
1053       if (j==1) {
1054         tmp0  = tmps[*vi--];
1055         sum1 -= *v1-- * tmp0;
1056         sum2 -= *v2-- * tmp0;
1057         sum3 -= *v3-- * tmp0;
1058       }
1059       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1060       sum2   -= *v2-- * tmp0;
1061       sum3   -= *v3-- * tmp0;
1062       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1063       sum3   -= *v3-- * tmp0;
1064       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1065 
1066       break;
1067     case 4:
1068       sum1 = tmp[row];
1069       sum2 = tmp[row -1];
1070       sum3 = tmp[row -2];
1071       sum4 = tmp[row -3];
1072       v2   = aa + ai[row]-1;
1073       v3   = aa + ai[row -1]-1;
1074       v4   = aa + ai[row -2]-1;
1075 
1076       for (j=nz; j>1; j-=2) {
1077         vi   -=2;
1078         i0    = vi[2];
1079         i1    = vi[1];
1080         tmp0  = tmps[i0];
1081         tmp1  = tmps[i1];
1082         v1   -= 2;
1083         v2   -= 2;
1084         v3   -= 2;
1085         v4   -= 2;
1086         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1087         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1088         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1089         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1090       }
1091       if (j==1) {
1092         tmp0  = tmps[*vi--];
1093         sum1 -= *v1-- * tmp0;
1094         sum2 -= *v2-- * tmp0;
1095         sum3 -= *v3-- * tmp0;
1096         sum4 -= *v4-- * tmp0;
1097       }
1098 
1099       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1100       sum2   -= *v2-- * tmp0;
1101       sum3   -= *v3-- * tmp0;
1102       sum4   -= *v4-- * tmp0;
1103       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1104       sum3   -= *v3-- * tmp0;
1105       sum4   -= *v4-- * tmp0;
1106       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1107       sum4   -= *v4-- * tmp0;
1108       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1109       break;
1110     case 5:
1111       sum1 = tmp[row];
1112       sum2 = tmp[row -1];
1113       sum3 = tmp[row -2];
1114       sum4 = tmp[row -3];
1115       sum5 = tmp[row -4];
1116       v2   = aa + ai[row]-1;
1117       v3   = aa + ai[row -1]-1;
1118       v4   = aa + ai[row -2]-1;
1119       v5   = aa + ai[row -3]-1;
1120       for (j=nz; j>1; j-=2) {
1121         vi   -= 2;
1122         i0    = vi[2];
1123         i1    = vi[1];
1124         tmp0  = tmps[i0];
1125         tmp1  = tmps[i1];
1126         v1   -= 2;
1127         v2   -= 2;
1128         v3   -= 2;
1129         v4   -= 2;
1130         v5   -= 2;
1131         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1132         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1133         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1134         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1135         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1136       }
1137       if (j==1) {
1138         tmp0  = tmps[*vi--];
1139         sum1 -= *v1-- * tmp0;
1140         sum2 -= *v2-- * tmp0;
1141         sum3 -= *v3-- * tmp0;
1142         sum4 -= *v4-- * tmp0;
1143         sum5 -= *v5-- * tmp0;
1144       }
1145 
1146       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1147       sum2   -= *v2-- * tmp0;
1148       sum3   -= *v3-- * tmp0;
1149       sum4   -= *v4-- * tmp0;
1150       sum5   -= *v5-- * tmp0;
1151       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1152       sum3   -= *v3-- * tmp0;
1153       sum4   -= *v4-- * tmp0;
1154       sum5   -= *v5-- * tmp0;
1155       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1159       sum5   -= *v5-- * tmp0;
1160       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1161       break;
1162     default:
1163       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1164     }
1165   }
1166   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1167   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1168   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1171   PetscFunctionReturn(0);
1172 }
1173 
1174 #undef __FUNCT__
1175 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1176 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1177 {
1178   Mat             C     =B;
1179   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1180   IS              isrow = b->row,isicol = b->icol;
1181   PetscErrorCode  ierr;
1182   const PetscInt  *r,*ic,*ics;
1183   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1184   PetscInt        i,j,k,nz,nzL,row,*pj;
1185   const PetscInt  *ajtmp,*bjtmp;
1186   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1187   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1188   FactorShiftCtx  sctx;
1189   const PetscInt  *ddiag;
1190   PetscReal       rs;
1191   MatScalar       d;
1192   PetscInt        inod,nodesz,node_max,col;
1193   const PetscInt  *ns;
1194   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1195 
1196   PetscFunctionBegin;
1197   /* MatPivotSetUp(): initialize shift context sctx */
1198   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1199 
1200   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1201     ddiag          = a->diag;
1202     sctx.shift_top = info->zeropivot;
1203     for (i=0; i<n; i++) {
1204       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1205       d  = (aa)[ddiag[i]];
1206       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1207       v  = aa+ai[i];
1208       nz = ai[i+1] - ai[i];
1209       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1210       if (rs>sctx.shift_top) sctx.shift_top = rs;
1211     }
1212     sctx.shift_top *= 1.1;
1213     sctx.nshift_max = 5;
1214     sctx.shift_lo   = 0.;
1215     sctx.shift_hi   = 1.;
1216   }
1217 
1218   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1219   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1220 
1221   ierr  = PetscMalloc4(n,PetscScalar,&rtmp1,n,PetscScalar,&rtmp2,n,PetscScalar,&rtmp3,n,PetscScalar,&rtmp4);CHKERRQ(ierr);
1222   ierr  = PetscMemzero(rtmp1,n*sizeof(PetscScalar));CHKERRQ(ierr);
1223   ierr  = PetscMemzero(rtmp2,n*sizeof(PetscScalar));CHKERRQ(ierr);
1224   ierr  = PetscMemzero(rtmp3,n*sizeof(PetscScalar));CHKERRQ(ierr);
1225   ierr  = PetscMemzero(rtmp4,n*sizeof(PetscScalar));CHKERRQ(ierr);
1226   ics   = ic;
1227 
1228   node_max = a->inode.node_count;
1229   ns       = a->inode.size;
1230   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1231 
1232   /* If max inode size > 4, split it into two inodes.*/
1233   /* also map the inode sizes according to the ordering */
1234   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1235   for (i=0,j=0; i<node_max; ++i,++j) {
1236     if (ns[i] > 4) {
1237       tmp_vec1[j] = 4;
1238       ++j;
1239       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1240     } else {
1241       tmp_vec1[j] = ns[i];
1242     }
1243   }
1244   /* Use the correct node_max */
1245   node_max = j;
1246 
1247   /* Now reorder the inode info based on mat re-ordering info */
1248   /* First create a row -> inode_size_array_index map */
1249   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1250   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1251   for (i=0,row=0; i<node_max; i++) {
1252     nodesz = tmp_vec1[i];
1253     for (j=0; j<nodesz; j++,row++) {
1254       nsmap[row] = i;
1255     }
1256   }
1257   /* Using nsmap, create a reordered ns structure */
1258   for (i=0,j=0; i< node_max; i++) {
1259     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1260     tmp_vec2[i] = nodesz;
1261     j          += nodesz;
1262   }
1263   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1264   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1265 
1266   /* Now use the correct ns */
1267   ns = tmp_vec2;
1268 
1269   do {
1270     sctx.newshift = PETSC_FALSE;
1271     /* Now loop over each block-row, and do the factorization */
1272     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1273       nodesz = ns[inod];
1274 
1275       switch (nodesz) {
1276       case 1:
1277         /*----------*/
1278         /* zero rtmp1 */
1279         /* L part */
1280         nz    = bi[i+1] - bi[i];
1281         bjtmp = bj + bi[i];
1282         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1283 
1284         /* U part */
1285         nz    = bdiag[i]-bdiag[i+1];
1286         bjtmp = bj + bdiag[i+1]+1;
1287         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1288 
1289         /* load in initial (unfactored row) */
1290         nz    = ai[r[i]+1] - ai[r[i]];
1291         ajtmp = aj + ai[r[i]];
1292         v     = aa + ai[r[i]];
1293         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1294 
1295         /* ZeropivotApply() */
1296         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1297 
1298         /* elimination */
1299         bjtmp = bj + bi[i];
1300         row   = *bjtmp++;
1301         nzL   = bi[i+1] - bi[i];
1302         for (k=0; k < nzL; k++) {
1303           pc = rtmp1 + row;
1304           if (*pc != 0.0) {
1305             pv   = b->a + bdiag[row];
1306             mul1 = *pc * (*pv);
1307             *pc  = mul1;
1308             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1309             pv   = b->a + bdiag[row+1]+1;
1310             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1311             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1312             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1313           }
1314           row = *bjtmp++;
1315         }
1316 
1317         /* finished row so stick it into b->a */
1318         rs = 0.0;
1319         /* L part */
1320         pv = b->a + bi[i];
1321         pj = b->j + bi[i];
1322         nz = bi[i+1] - bi[i];
1323         for (j=0; j<nz; j++) {
1324           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1325         }
1326 
1327         /* U part */
1328         pv = b->a + bdiag[i+1]+1;
1329         pj = b->j + bdiag[i+1]+1;
1330         nz = bdiag[i] - bdiag[i+1]-1;
1331         for (j=0; j<nz; j++) {
1332           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1333         }
1334 
1335         /* Check zero pivot */
1336         sctx.rs = rs;
1337         sctx.pv = rtmp1[i];
1338         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1339         if (sctx.newshift) break;
1340 
1341         /* Mark diagonal and invert diagonal for simplier triangular solves */
1342         pv  = b->a + bdiag[i];
1343         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1344         break;
1345 
1346       case 2:
1347         /*----------*/
1348         /* zero rtmp1 and rtmp2 */
1349         /* L part */
1350         nz    = bi[i+1] - bi[i];
1351         bjtmp = bj + bi[i];
1352         for  (j=0; j<nz; j++) {
1353           col        = bjtmp[j];
1354           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1355         }
1356 
1357         /* U part */
1358         nz    = bdiag[i]-bdiag[i+1];
1359         bjtmp = bj + bdiag[i+1]+1;
1360         for  (j=0; j<nz; j++) {
1361           col        = bjtmp[j];
1362           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363         }
1364 
1365         /* load in initial (unfactored row) */
1366         nz    = ai[r[i]+1] - ai[r[i]];
1367         ajtmp = aj + ai[r[i]];
1368         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1369         for (j=0; j<nz; j++) {
1370           col        = ics[ajtmp[j]];
1371           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1372         }
1373         /* ZeropivotApply(): shift the diagonal of the matrix  */
1374         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1375 
1376         /* elimination */
1377         bjtmp = bj + bi[i];
1378         row   = *bjtmp++; /* pivot row */
1379         nzL   = bi[i+1] - bi[i];
1380         for (k=0; k < nzL; k++) {
1381           pc1 = rtmp1 + row;
1382           pc2 = rtmp2 + row;
1383           if (*pc1 != 0.0 || *pc2 != 0.0) {
1384             pv   = b->a + bdiag[row];
1385             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1386             *pc1 = mul1;       *pc2 = mul2;
1387 
1388             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1389             pv = b->a + bdiag[row+1]+1;
1390             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1391             for (j=0; j<nz; j++) {
1392               col         = pj[j];
1393               rtmp1[col] -= mul1 * pv[j];
1394               rtmp2[col] -= mul2 * pv[j];
1395             }
1396             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1397           }
1398           row = *bjtmp++;
1399         }
1400 
1401         /* finished row i; check zero pivot, then stick row i into b->a */
1402         rs = 0.0;
1403         /* L part */
1404         pc1 = b->a + bi[i];
1405         pj  = b->j + bi[i];
1406         nz  = bi[i+1] - bi[i];
1407         for (j=0; j<nz; j++) {
1408           col    = pj[j];
1409           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1410         }
1411         /* U part */
1412         pc1 = b->a + bdiag[i+1]+1;
1413         pj  = b->j + bdiag[i+1]+1;
1414         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1415         for (j=0; j<nz; j++) {
1416           col    = pj[j];
1417           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418         }
1419 
1420         sctx.rs = rs;
1421         sctx.pv = rtmp1[i];
1422         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1423         if (sctx.newshift) break;
1424         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1425         *pc1 = 1.0/sctx.pv;
1426 
1427         /* Now take care of diagonal 2x2 block. */
1428         pc2 = rtmp2 + i;
1429         if (*pc2 != 0.0) {
1430           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1431           *pc2 = mul1;          /* insert L entry */
1432           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1433           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1434           for (j=0; j<nz; j++) {
1435             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1436           }
1437           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1438         }
1439 
1440         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1441         rs = 0.0;
1442         /* L part */
1443         pc2 = b->a + bi[i+1];
1444         pj  = b->j + bi[i+1];
1445         nz  = bi[i+2] - bi[i+1];
1446         for (j=0; j<nz; j++) {
1447           col    = pj[j];
1448           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1449         }
1450         /* U part */
1451         pc2 = b->a + bdiag[i+2]+1;
1452         pj  = b->j + bdiag[i+2]+1;
1453         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1454         for (j=0; j<nz; j++) {
1455           col    = pj[j];
1456           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457         }
1458 
1459         sctx.rs = rs;
1460         sctx.pv = rtmp2[i+1];
1461         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1462         if (sctx.newshift) break;
1463         pc2  = b->a + bdiag[i+1];
1464         *pc2 = 1.0/sctx.pv;
1465         break;
1466 
1467       case 3:
1468         /*----------*/
1469         /* zero rtmp */
1470         /* L part */
1471         nz    = bi[i+1] - bi[i];
1472         bjtmp = bj + bi[i];
1473         for  (j=0; j<nz; j++) {
1474           col        = bjtmp[j];
1475           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1476         }
1477 
1478         /* U part */
1479         nz    = bdiag[i]-bdiag[i+1];
1480         bjtmp = bj + bdiag[i+1]+1;
1481         for  (j=0; j<nz; j++) {
1482           col        = bjtmp[j];
1483           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484         }
1485 
1486         /* load in initial (unfactored row) */
1487         nz    = ai[r[i]+1] - ai[r[i]];
1488         ajtmp = aj + ai[r[i]];
1489         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1490         for (j=0; j<nz; j++) {
1491           col        = ics[ajtmp[j]];
1492           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1493         }
1494         /* ZeropivotApply(): shift the diagonal of the matrix  */
1495         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1496 
1497         /* elimination */
1498         bjtmp = bj + bi[i];
1499         row   = *bjtmp++; /* pivot row */
1500         nzL   = bi[i+1] - bi[i];
1501         for (k=0; k < nzL; k++) {
1502           pc1 = rtmp1 + row;
1503           pc2 = rtmp2 + row;
1504           pc3 = rtmp3 + row;
1505           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1506             pv   = b->a + bdiag[row];
1507             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1508             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1509 
1510             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1511             pv = b->a + bdiag[row+1]+1;
1512             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1513             for (j=0; j<nz; j++) {
1514               col         = pj[j];
1515               rtmp1[col] -= mul1 * pv[j];
1516               rtmp2[col] -= mul2 * pv[j];
1517               rtmp3[col] -= mul3 * pv[j];
1518             }
1519             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1520           }
1521           row = *bjtmp++;
1522         }
1523 
1524         /* finished row i; check zero pivot, then stick row i into b->a */
1525         rs = 0.0;
1526         /* L part */
1527         pc1 = b->a + bi[i];
1528         pj  = b->j + bi[i];
1529         nz  = bi[i+1] - bi[i];
1530         for (j=0; j<nz; j++) {
1531           col    = pj[j];
1532           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1533         }
1534         /* U part */
1535         pc1 = b->a + bdiag[i+1]+1;
1536         pj  = b->j + bdiag[i+1]+1;
1537         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1538         for (j=0; j<nz; j++) {
1539           col    = pj[j];
1540           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541         }
1542 
1543         sctx.rs = rs;
1544         sctx.pv = rtmp1[i];
1545         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1546         if (sctx.newshift) break;
1547         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1548         *pc1 = 1.0/sctx.pv;
1549 
1550         /* Now take care of 1st column of diagonal 3x3 block. */
1551         pc2 = rtmp2 + i;
1552         pc3 = rtmp3 + i;
1553         if (*pc2 != 0.0 || *pc3 != 0.0) {
1554           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1555           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1556           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1557           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1558           for (j=0; j<nz; j++) {
1559             col         = pj[j];
1560             rtmp2[col] -= mul2 * rtmp1[col];
1561             rtmp3[col] -= mul3 * rtmp1[col];
1562           }
1563           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1564         }
1565 
1566         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1567         rs = 0.0;
1568         /* L part */
1569         pc2 = b->a + bi[i+1];
1570         pj  = b->j + bi[i+1];
1571         nz  = bi[i+2] - bi[i+1];
1572         for (j=0; j<nz; j++) {
1573           col    = pj[j];
1574           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1575         }
1576         /* U part */
1577         pc2 = b->a + bdiag[i+2]+1;
1578         pj  = b->j + bdiag[i+2]+1;
1579         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1580         for (j=0; j<nz; j++) {
1581           col    = pj[j];
1582           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1583         }
1584 
1585         sctx.rs = rs;
1586         sctx.pv = rtmp2[i+1];
1587         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1588         if (sctx.newshift) break;
1589         pc2  = b->a + bdiag[i+1];
1590         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1591 
1592         /* Now take care of 2nd column of diagonal 3x3 block. */
1593         pc3 = rtmp3 + i+1;
1594         if (*pc3 != 0.0) {
1595           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1596           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1597           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1598           for (j=0; j<nz; j++) {
1599             col         = pj[j];
1600             rtmp3[col] -= mul3 * rtmp2[col];
1601           }
1602           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1603         }
1604 
1605         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1606         rs = 0.0;
1607         /* L part */
1608         pc3 = b->a + bi[i+2];
1609         pj  = b->j + bi[i+2];
1610         nz  = bi[i+3] - bi[i+2];
1611         for (j=0; j<nz; j++) {
1612           col    = pj[j];
1613           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1614         }
1615         /* U part */
1616         pc3 = b->a + bdiag[i+3]+1;
1617         pj  = b->j + bdiag[i+3]+1;
1618         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1619         for (j=0; j<nz; j++) {
1620           col    = pj[j];
1621           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1622         }
1623 
1624         sctx.rs = rs;
1625         sctx.pv = rtmp3[i+2];
1626         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1627         if (sctx.newshift) break;
1628         pc3  = b->a + bdiag[i+2];
1629         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1630         break;
1631       case 4:
1632         /*----------*/
1633         /* zero rtmp */
1634         /* L part */
1635         nz    = bi[i+1] - bi[i];
1636         bjtmp = bj + bi[i];
1637         for  (j=0; j<nz; j++) {
1638           col        = bjtmp[j];
1639           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1640         }
1641 
1642         /* U part */
1643         nz    = bdiag[i]-bdiag[i+1];
1644         bjtmp = bj + bdiag[i+1]+1;
1645         for  (j=0; j<nz; j++) {
1646           col        = bjtmp[j];
1647           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1648         }
1649 
1650         /* load in initial (unfactored row) */
1651         nz    = ai[r[i]+1] - ai[r[i]];
1652         ajtmp = aj + ai[r[i]];
1653         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1654         for (j=0; j<nz; j++) {
1655           col        = ics[ajtmp[j]];
1656           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1657         }
1658         /* ZeropivotApply(): shift the diagonal of the matrix  */
1659         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1660 
1661         /* elimination */
1662         bjtmp = bj + bi[i];
1663         row   = *bjtmp++; /* pivot row */
1664         nzL   = bi[i+1] - bi[i];
1665         for (k=0; k < nzL; k++) {
1666           pc1 = rtmp1 + row;
1667           pc2 = rtmp2 + row;
1668           pc3 = rtmp3 + row;
1669           pc4 = rtmp4 + row;
1670           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1671             pv   = b->a + bdiag[row];
1672             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1673             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1674 
1675             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1676             pv = b->a + bdiag[row+1]+1;
1677             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1678             for (j=0; j<nz; j++) {
1679               col         = pj[j];
1680               rtmp1[col] -= mul1 * pv[j];
1681               rtmp2[col] -= mul2 * pv[j];
1682               rtmp3[col] -= mul3 * pv[j];
1683               rtmp4[col] -= mul4 * pv[j];
1684             }
1685             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1686           }
1687           row = *bjtmp++;
1688         }
1689 
1690         /* finished row i; check zero pivot, then stick row i into b->a */
1691         rs = 0.0;
1692         /* L part */
1693         pc1 = b->a + bi[i];
1694         pj  = b->j + bi[i];
1695         nz  = bi[i+1] - bi[i];
1696         for (j=0; j<nz; j++) {
1697           col    = pj[j];
1698           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1699         }
1700         /* U part */
1701         pc1 = b->a + bdiag[i+1]+1;
1702         pj  = b->j + bdiag[i+1]+1;
1703         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1704         for (j=0; j<nz; j++) {
1705           col    = pj[j];
1706           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1707         }
1708 
1709         sctx.rs = rs;
1710         sctx.pv = rtmp1[i];
1711         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1712         if (sctx.newshift) break;
1713         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1714         *pc1 = 1.0/sctx.pv;
1715 
1716         /* Now take care of 1st column of diagonal 4x4 block. */
1717         pc2 = rtmp2 + i;
1718         pc3 = rtmp3 + i;
1719         pc4 = rtmp4 + i;
1720         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1721           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1722           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1723           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1724           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1725           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1726           for (j=0; j<nz; j++) {
1727             col         = pj[j];
1728             rtmp2[col] -= mul2 * rtmp1[col];
1729             rtmp3[col] -= mul3 * rtmp1[col];
1730             rtmp4[col] -= mul4 * rtmp1[col];
1731           }
1732           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1733         }
1734 
1735         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1736         rs = 0.0;
1737         /* L part */
1738         pc2 = b->a + bi[i+1];
1739         pj  = b->j + bi[i+1];
1740         nz  = bi[i+2] - bi[i+1];
1741         for (j=0; j<nz; j++) {
1742           col    = pj[j];
1743           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1744         }
1745         /* U part */
1746         pc2 = b->a + bdiag[i+2]+1;
1747         pj  = b->j + bdiag[i+2]+1;
1748         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1749         for (j=0; j<nz; j++) {
1750           col    = pj[j];
1751           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1752         }
1753 
1754         sctx.rs = rs;
1755         sctx.pv = rtmp2[i+1];
1756         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1757         if (sctx.newshift) break;
1758         pc2  = b->a + bdiag[i+1];
1759         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1760 
1761         /* Now take care of 2nd column of diagonal 4x4 block. */
1762         pc3 = rtmp3 + i+1;
1763         pc4 = rtmp4 + i+1;
1764         if (*pc3 != 0.0 || *pc4 != 0.0) {
1765           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1766           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1767           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1768           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1769           for (j=0; j<nz; j++) {
1770             col         = pj[j];
1771             rtmp3[col] -= mul3 * rtmp2[col];
1772             rtmp4[col] -= mul4 * rtmp2[col];
1773           }
1774           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1775         }
1776 
1777         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1778         rs = 0.0;
1779         /* L part */
1780         pc3 = b->a + bi[i+2];
1781         pj  = b->j + bi[i+2];
1782         nz  = bi[i+3] - bi[i+2];
1783         for (j=0; j<nz; j++) {
1784           col    = pj[j];
1785           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1786         }
1787         /* U part */
1788         pc3 = b->a + bdiag[i+3]+1;
1789         pj  = b->j + bdiag[i+3]+1;
1790         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1791         for (j=0; j<nz; j++) {
1792           col    = pj[j];
1793           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1794         }
1795 
1796         sctx.rs = rs;
1797         sctx.pv = rtmp3[i+2];
1798         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1799         if (sctx.newshift) break;
1800         pc3  = b->a + bdiag[i+2];
1801         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1802 
1803         /* Now take care of 3rd column of diagonal 4x4 block. */
1804         pc4 = rtmp4 + i+2;
1805         if (*pc4 != 0.0) {
1806           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1807           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1808           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1809           for (j=0; j<nz; j++) {
1810             col         = pj[j];
1811             rtmp4[col] -= mul4 * rtmp3[col];
1812           }
1813           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1814         }
1815 
1816         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1817         rs = 0.0;
1818         /* L part */
1819         pc4 = b->a + bi[i+3];
1820         pj  = b->j + bi[i+3];
1821         nz  = bi[i+4] - bi[i+3];
1822         for (j=0; j<nz; j++) {
1823           col    = pj[j];
1824           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1825         }
1826         /* U part */
1827         pc4 = b->a + bdiag[i+4]+1;
1828         pj  = b->j + bdiag[i+4]+1;
1829         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1830         for (j=0; j<nz; j++) {
1831           col    = pj[j];
1832           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1833         }
1834 
1835         sctx.rs = rs;
1836         sctx.pv = rtmp4[i+3];
1837         ierr    = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1838         if (sctx.newshift) break;
1839         pc4  = b->a + bdiag[i+3];
1840         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1841         break;
1842 
1843       default:
1844         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1845       }
1846       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1847       i += nodesz;                 /* Update the row */
1848     }
1849 
1850     /* MatPivotRefine() */
1851     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1852       /*
1853        * if no shift in this attempt & shifting & started shifting & can refine,
1854        * then try lower shift
1855        */
1856       sctx.shift_hi       = sctx.shift_fraction;
1857       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1858       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1859       sctx.newshift       = PETSC_TRUE;
1860       sctx.nshift++;
1861     }
1862   } while (sctx.newshift);
1863 
1864   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1865   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1866   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1867   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1868 
1869   C->ops->solve             = MatSolve_SeqAIJ;
1870   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1871   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1872   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1873   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1874   C->assembled              = PETSC_TRUE;
1875   C->preallocated           = PETSC_TRUE;
1876 
1877   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1878 
1879   /* MatShiftView(A,info,&sctx) */
1880   if (sctx.nshift) {
1881     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1882       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1883     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1884       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1885     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1886       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1887     }
1888   }
1889   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1890   PetscFunctionReturn(0);
1891 }
1892 
1893 #undef __FUNCT__
1894 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1895 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1896 {
1897   Mat             C     = B;
1898   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1899   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1900   PetscErrorCode  ierr;
1901   const PetscInt  *r,*ic,*c,*ics;
1902   PetscInt        n   = A->rmap->n,*bi = b->i;
1903   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1904   PetscInt        i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1905   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1906   PetscScalar     mul1,mul2,mul3,tmp;
1907   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1908   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1909   PetscReal       rs=0.0;
1910   FactorShiftCtx  sctx;
1911 
1912   PetscFunctionBegin;
1913   sctx.shift_top      = 0;
1914   sctx.nshift_max     = 0;
1915   sctx.shift_lo       = 0;
1916   sctx.shift_hi       = 0;
1917   sctx.shift_fraction = 0;
1918 
1919   /* if both shift schemes are chosen by user, only use info->shiftpd */
1920   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1921     sctx.shift_top = 0;
1922     for (i=0; i<n; i++) {
1923       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1924       rs    = 0.0;
1925       ajtmp = aj + ai[i];
1926       rtmp1 = aa + ai[i];
1927       nz    = ai[i+1] - ai[i];
1928       for (j=0; j<nz; j++) {
1929         if (*ajtmp != i) {
1930           rs += PetscAbsScalar(*rtmp1++);
1931         } else {
1932           rs -= PetscRealPart(*rtmp1++);
1933         }
1934         ajtmp++;
1935       }
1936       if (rs>sctx.shift_top) sctx.shift_top = rs;
1937     }
1938     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1939     sctx.shift_top *= 1.1;
1940     sctx.nshift_max = 5;
1941     sctx.shift_lo   = 0.;
1942     sctx.shift_hi   = 1.;
1943   }
1944   sctx.shift_amount = 0;
1945   sctx.nshift       = 0;
1946 
1947   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1948   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1949   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1950   ierr   = PetscMalloc3(n,PetscScalar,&rtmp11,n,PetscScalar,&rtmp22,n,PetscScalar,&rtmp33);CHKERRQ(ierr);
1951   ierr   = PetscMemzero(rtmp11,n*sizeof(PetscScalar));CHKERRQ(ierr);
1952   ierr   = PetscMemzero(rtmp22,n*sizeof(PetscScalar));CHKERRQ(ierr);
1953   ierr   = PetscMemzero(rtmp33,n*sizeof(PetscScalar));CHKERRQ(ierr);
1954   ics    = ic;
1955 
1956   node_max = a->inode.node_count;
1957   ns       = a->inode.size;
1958   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1959 
1960   /* If max inode size > 3, split it into two inodes.*/
1961   /* also map the inode sizes according to the ordering */
1962   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1963   for (i=0,j=0; i<node_max; ++i,++j) {
1964     if (ns[i]>3) {
1965       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1966       ++j;
1967       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1968     } else {
1969       tmp_vec1[j] = ns[i];
1970     }
1971   }
1972   /* Use the correct node_max */
1973   node_max = j;
1974 
1975   /* Now reorder the inode info based on mat re-ordering info */
1976   /* First create a row -> inode_size_array_index map */
1977   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1978   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1979   for (i=0,row=0; i<node_max; i++) {
1980     nodesz = tmp_vec1[i];
1981     for (j=0; j<nodesz; j++,row++) {
1982       nsmap[row] = i;
1983     }
1984   }
1985   /* Using nsmap, create a reordered ns structure */
1986   for (i=0,j=0; i< node_max; i++) {
1987     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1988     tmp_vec2[i] = nodesz;
1989     j          += nodesz;
1990   }
1991   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1992   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1993   /* Now use the correct ns */
1994   ns = tmp_vec2;
1995 
1996   do {
1997     sctx.newshift = PETSC_FALSE;
1998     /* Now loop over each block-row, and do the factorization */
1999     for (i=0,row=0; i<node_max; i++) {
2000       nodesz = ns[i];
2001       nz     = bi[row+1] - bi[row];
2002       bjtmp  = bj + bi[row];
2003 
2004       switch (nodesz) {
2005       case 1:
2006         for  (j=0; j<nz; j++) {
2007           idx         = bjtmp[j];
2008           rtmp11[idx] = 0.0;
2009         }
2010 
2011         /* load in initial (unfactored row) */
2012         idx    = r[row];
2013         nz_tmp = ai[idx+1] - ai[idx];
2014         ajtmp  = aj + ai[idx];
2015         v1     = aa + ai[idx];
2016 
2017         for (j=0; j<nz_tmp; j++) {
2018           idx         = ics[ajtmp[j]];
2019           rtmp11[idx] = v1[j];
2020         }
2021         rtmp11[ics[r[row]]] += sctx.shift_amount;
2022 
2023         prow = *bjtmp++;
2024         while (prow < row) {
2025           pc1 = rtmp11 + prow;
2026           if (*pc1 != 0.0) {
2027             pv     = ba + bd[prow];
2028             pj     = nbj + bd[prow];
2029             mul1   = *pc1 * *pv++;
2030             *pc1   = mul1;
2031             nz_tmp = bi[prow+1] - bd[prow] - 1;
2032             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2033             for (j=0; j<nz_tmp; j++) {
2034               tmp          = pv[j];
2035               idx          = pj[j];
2036               rtmp11[idx] -= mul1 * tmp;
2037             }
2038           }
2039           prow = *bjtmp++;
2040         }
2041         pj  = bj + bi[row];
2042         pc1 = ba + bi[row];
2043 
2044         sctx.pv     = rtmp11[row];
2045         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2046         rs          = 0.0;
2047         for (j=0; j<nz; j++) {
2048           idx    = pj[j];
2049           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2050           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2051         }
2052         sctx.rs = rs;
2053         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2054         if (sctx.newshift) goto endofwhile;
2055         break;
2056 
2057       case 2:
2058         for (j=0; j<nz; j++) {
2059           idx         = bjtmp[j];
2060           rtmp11[idx] = 0.0;
2061           rtmp22[idx] = 0.0;
2062         }
2063 
2064         /* load in initial (unfactored row) */
2065         idx    = r[row];
2066         nz_tmp = ai[idx+1] - ai[idx];
2067         ajtmp  = aj + ai[idx];
2068         v1     = aa + ai[idx];
2069         v2     = aa + ai[idx+1];
2070         for (j=0; j<nz_tmp; j++) {
2071           idx         = ics[ajtmp[j]];
2072           rtmp11[idx] = v1[j];
2073           rtmp22[idx] = v2[j];
2074         }
2075         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2076         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2077 
2078         prow = *bjtmp++;
2079         while (prow < row) {
2080           pc1 = rtmp11 + prow;
2081           pc2 = rtmp22 + prow;
2082           if (*pc1 != 0.0 || *pc2 != 0.0) {
2083             pv   = ba + bd[prow];
2084             pj   = nbj + bd[prow];
2085             mul1 = *pc1 * *pv;
2086             mul2 = *pc2 * *pv;
2087             ++pv;
2088             *pc1 = mul1;
2089             *pc2 = mul2;
2090 
2091             nz_tmp = bi[prow+1] - bd[prow] - 1;
2092             for (j=0; j<nz_tmp; j++) {
2093               tmp          = pv[j];
2094               idx          = pj[j];
2095               rtmp11[idx] -= mul1 * tmp;
2096               rtmp22[idx] -= mul2 * tmp;
2097             }
2098             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2099           }
2100           prow = *bjtmp++;
2101         }
2102 
2103         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2104         pc1 = rtmp11 + prow;
2105         pc2 = rtmp22 + prow;
2106 
2107         sctx.pv = *pc1;
2108         pj      = bj + bi[prow];
2109         rs      = 0.0;
2110         for (j=0; j<nz; j++) {
2111           idx = pj[j];
2112           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2113         }
2114         sctx.rs = rs;
2115         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2116         if (sctx.newshift) goto endofwhile;
2117 
2118         if (*pc2 != 0.0) {
2119           pj     = nbj + bd[prow];
2120           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2121           *pc2   = mul2;
2122           nz_tmp = bi[prow+1] - bd[prow] - 1;
2123           for (j=0; j<nz_tmp; j++) {
2124             idx          = pj[j];
2125             tmp          = rtmp11[idx];
2126             rtmp22[idx] -= mul2 * tmp;
2127           }
2128           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2129         }
2130 
2131         pj  = bj + bi[row];
2132         pc1 = ba + bi[row];
2133         pc2 = ba + bi[row+1];
2134 
2135         sctx.pv       = rtmp22[row+1];
2136         rs            = 0.0;
2137         rtmp11[row]   = 1.0/rtmp11[row];
2138         rtmp22[row+1] = 1.0/rtmp22[row+1];
2139         /* copy row entries from dense representation to sparse */
2140         for (j=0; j<nz; j++) {
2141           idx    = pj[j];
2142           pc1[j] = rtmp11[idx];
2143           pc2[j] = rtmp22[idx];
2144           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2145         }
2146         sctx.rs = rs;
2147         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2148         if (sctx.newshift) goto endofwhile;
2149         break;
2150 
2151       case 3:
2152         for  (j=0; j<nz; j++) {
2153           idx         = bjtmp[j];
2154           rtmp11[idx] = 0.0;
2155           rtmp22[idx] = 0.0;
2156           rtmp33[idx] = 0.0;
2157         }
2158         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2159         idx    = r[row];
2160         nz_tmp = ai[idx+1] - ai[idx];
2161         ajtmp  = aj + ai[idx];
2162         v1     = aa + ai[idx];
2163         v2     = aa + ai[idx+1];
2164         v3     = aa + ai[idx+2];
2165         for (j=0; j<nz_tmp; j++) {
2166           idx         = ics[ajtmp[j]];
2167           rtmp11[idx] = v1[j];
2168           rtmp22[idx] = v2[j];
2169           rtmp33[idx] = v3[j];
2170         }
2171         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2172         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2173         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2174 
2175         /* loop over all pivot row blocks above this row block */
2176         prow = *bjtmp++;
2177         while (prow < row) {
2178           pc1 = rtmp11 + prow;
2179           pc2 = rtmp22 + prow;
2180           pc3 = rtmp33 + prow;
2181           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2182             pv   = ba  + bd[prow];
2183             pj   = nbj + bd[prow];
2184             mul1 = *pc1 * *pv;
2185             mul2 = *pc2 * *pv;
2186             mul3 = *pc3 * *pv;
2187             ++pv;
2188             *pc1 = mul1;
2189             *pc2 = mul2;
2190             *pc3 = mul3;
2191 
2192             nz_tmp = bi[prow+1] - bd[prow] - 1;
2193             /* update this row based on pivot row */
2194             for (j=0; j<nz_tmp; j++) {
2195               tmp          = pv[j];
2196               idx          = pj[j];
2197               rtmp11[idx] -= mul1 * tmp;
2198               rtmp22[idx] -= mul2 * tmp;
2199               rtmp33[idx] -= mul3 * tmp;
2200             }
2201             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2202           }
2203           prow = *bjtmp++;
2204         }
2205 
2206         /* Now take care of diagonal 3x3 block in this set of rows */
2207         /* note: prow = row here */
2208         pc1 = rtmp11 + prow;
2209         pc2 = rtmp22 + prow;
2210         pc3 = rtmp33 + prow;
2211 
2212         sctx.pv = *pc1;
2213         pj      = bj + bi[prow];
2214         rs      = 0.0;
2215         for (j=0; j<nz; j++) {
2216           idx = pj[j];
2217           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2218         }
2219         sctx.rs = rs;
2220         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2221         if (sctx.newshift) goto endofwhile;
2222 
2223         if (*pc2 != 0.0 || *pc3 != 0.0) {
2224           mul2   = (*pc2)/(*pc1);
2225           mul3   = (*pc3)/(*pc1);
2226           *pc2   = mul2;
2227           *pc3   = mul3;
2228           nz_tmp = bi[prow+1] - bd[prow] - 1;
2229           pj     = nbj + bd[prow];
2230           for (j=0; j<nz_tmp; j++) {
2231             idx          = pj[j];
2232             tmp          = rtmp11[idx];
2233             rtmp22[idx] -= mul2 * tmp;
2234             rtmp33[idx] -= mul3 * tmp;
2235           }
2236           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2237         }
2238         ++prow;
2239 
2240         pc2     = rtmp22 + prow;
2241         pc3     = rtmp33 + prow;
2242         sctx.pv = *pc2;
2243         pj      = bj + bi[prow];
2244         rs      = 0.0;
2245         for (j=0; j<nz; j++) {
2246           idx = pj[j];
2247           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2248         }
2249         sctx.rs = rs;
2250         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2251         if (sctx.newshift) goto endofwhile;
2252 
2253         if (*pc3 != 0.0) {
2254           mul3   = (*pc3)/(*pc2);
2255           *pc3   = mul3;
2256           pj     = nbj + bd[prow];
2257           nz_tmp = bi[prow+1] - bd[prow] - 1;
2258           for (j=0; j<nz_tmp; j++) {
2259             idx          = pj[j];
2260             tmp          = rtmp22[idx];
2261             rtmp33[idx] -= mul3 * tmp;
2262           }
2263           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2264         }
2265 
2266         pj  = bj + bi[row];
2267         pc1 = ba + bi[row];
2268         pc2 = ba + bi[row+1];
2269         pc3 = ba + bi[row+2];
2270 
2271         sctx.pv       = rtmp33[row+2];
2272         rs            = 0.0;
2273         rtmp11[row]   = 1.0/rtmp11[row];
2274         rtmp22[row+1] = 1.0/rtmp22[row+1];
2275         rtmp33[row+2] = 1.0/rtmp33[row+2];
2276         /* copy row entries from dense representation to sparse */
2277         for (j=0; j<nz; j++) {
2278           idx    = pj[j];
2279           pc1[j] = rtmp11[idx];
2280           pc2[j] = rtmp22[idx];
2281           pc3[j] = rtmp33[idx];
2282           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2283         }
2284 
2285         sctx.rs = rs;
2286         ierr    = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2287         if (sctx.newshift) goto endofwhile;
2288         break;
2289 
2290       default:
2291         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2292       }
2293       row += nodesz;                 /* Update the row */
2294     }
2295 endofwhile:;
2296   } while (sctx.newshift);
2297   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2298   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2299   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2300   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2301   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2302 
2303   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2304   /* do not set solve add, since MatSolve_Inode + Add is faster */
2305   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2306   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2307   C->assembled              = PETSC_TRUE;
2308   C->preallocated           = PETSC_TRUE;
2309   if (sctx.nshift) {
2310     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2311       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2312     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2313       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2314     }
2315   }
2316   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2317   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2318   PetscFunctionReturn(0);
2319 }
2320 
2321 
2322 /* ----------------------------------------------------------- */
2323 #undef __FUNCT__
2324 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2325 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2326 {
2327   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2328   IS                iscol = a->col,isrow = a->row;
2329   PetscErrorCode    ierr;
2330   const PetscInt    *r,*c,*rout,*cout;
2331   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2332   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2333   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2334   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2335   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2336   const PetscScalar *b;
2337 
2338   PetscFunctionBegin;
2339   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2340   node_max = a->inode.node_count;
2341   ns       = a->inode.size;     /* Node Size array */
2342 
2343   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2344   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2345   tmp  = a->solve_work;
2346 
2347   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2348   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2349 
2350   /* forward solve the lower triangular */
2351   tmps = tmp;
2352   aa   = a_a;
2353   aj   = a_j;
2354   ad   = a->diag;
2355 
2356   for (i = 0,row = 0; i< node_max; ++i) {
2357     nsz = ns[i];
2358     aii = ai[row];
2359     v1  = aa + aii;
2360     vi  = aj + aii;
2361     nz  = ai[row+1]- ai[row];
2362 
2363     if (i < node_max-1) {
2364       /* Prefetch the indices for the next block */
2365       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2366       /* Prefetch the data for the next block */
2367       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2368     }
2369 
2370     switch (nsz) {               /* Each loop in 'case' is unrolled */
2371     case 1:
2372       sum1 = b[r[row]];
2373       for (j=0; j<nz-1; j+=2) {
2374         i0    = vi[j];
2375         i1    = vi[j+1];
2376         tmp0  = tmps[i0];
2377         tmp1  = tmps[i1];
2378         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2379       }
2380       if (j == nz-1) {
2381         tmp0  = tmps[vi[j]];
2382         sum1 -= v1[j]*tmp0;
2383       }
2384       tmp[row++]=sum1;
2385       break;
2386     case 2:
2387       sum1 = b[r[row]];
2388       sum2 = b[r[row+1]];
2389       v2   = aa + ai[row+1];
2390 
2391       for (j=0; j<nz-1; j+=2) {
2392         i0    = vi[j];
2393         i1    = vi[j+1];
2394         tmp0  = tmps[i0];
2395         tmp1  = tmps[i1];
2396         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2397         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2398       }
2399       if (j == nz-1) {
2400         tmp0  = tmps[vi[j]];
2401         sum1 -= v1[j] *tmp0;
2402         sum2 -= v2[j] *tmp0;
2403       }
2404       sum2     -= v2[nz] * sum1;
2405       tmp[row++]=sum1;
2406       tmp[row++]=sum2;
2407       break;
2408     case 3:
2409       sum1 = b[r[row]];
2410       sum2 = b[r[row+1]];
2411       sum3 = b[r[row+2]];
2412       v2   = aa + ai[row+1];
2413       v3   = aa + ai[row+2];
2414 
2415       for (j=0; j<nz-1; j+=2) {
2416         i0    = vi[j];
2417         i1    = vi[j+1];
2418         tmp0  = tmps[i0];
2419         tmp1  = tmps[i1];
2420         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2421         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2422         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2423       }
2424       if (j == nz-1) {
2425         tmp0  = tmps[vi[j]];
2426         sum1 -= v1[j] *tmp0;
2427         sum2 -= v2[j] *tmp0;
2428         sum3 -= v3[j] *tmp0;
2429       }
2430       sum2     -= v2[nz] * sum1;
2431       sum3     -= v3[nz] * sum1;
2432       sum3     -= v3[nz+1] * sum2;
2433       tmp[row++]=sum1;
2434       tmp[row++]=sum2;
2435       tmp[row++]=sum3;
2436       break;
2437 
2438     case 4:
2439       sum1 = b[r[row]];
2440       sum2 = b[r[row+1]];
2441       sum3 = b[r[row+2]];
2442       sum4 = b[r[row+3]];
2443       v2   = aa + ai[row+1];
2444       v3   = aa + ai[row+2];
2445       v4   = aa + ai[row+3];
2446 
2447       for (j=0; j<nz-1; j+=2) {
2448         i0    = vi[j];
2449         i1    = vi[j+1];
2450         tmp0  = tmps[i0];
2451         tmp1  = tmps[i1];
2452         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2453         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2454         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2455         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2456       }
2457       if (j == nz-1) {
2458         tmp0  = tmps[vi[j]];
2459         sum1 -= v1[j] *tmp0;
2460         sum2 -= v2[j] *tmp0;
2461         sum3 -= v3[j] *tmp0;
2462         sum4 -= v4[j] *tmp0;
2463       }
2464       sum2 -= v2[nz] * sum1;
2465       sum3 -= v3[nz] * sum1;
2466       sum4 -= v4[nz] * sum1;
2467       sum3 -= v3[nz+1] * sum2;
2468       sum4 -= v4[nz+1] * sum2;
2469       sum4 -= v4[nz+2] * sum3;
2470 
2471       tmp[row++]=sum1;
2472       tmp[row++]=sum2;
2473       tmp[row++]=sum3;
2474       tmp[row++]=sum4;
2475       break;
2476     case 5:
2477       sum1 = b[r[row]];
2478       sum2 = b[r[row+1]];
2479       sum3 = b[r[row+2]];
2480       sum4 = b[r[row+3]];
2481       sum5 = b[r[row+4]];
2482       v2   = aa + ai[row+1];
2483       v3   = aa + ai[row+2];
2484       v4   = aa + ai[row+3];
2485       v5   = aa + ai[row+4];
2486 
2487       for (j=0; j<nz-1; j+=2) {
2488         i0    = vi[j];
2489         i1    = vi[j+1];
2490         tmp0  = tmps[i0];
2491         tmp1  = tmps[i1];
2492         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2493         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2494         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2495         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2496         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2497       }
2498       if (j == nz-1) {
2499         tmp0  = tmps[vi[j]];
2500         sum1 -= v1[j] *tmp0;
2501         sum2 -= v2[j] *tmp0;
2502         sum3 -= v3[j] *tmp0;
2503         sum4 -= v4[j] *tmp0;
2504         sum5 -= v5[j] *tmp0;
2505       }
2506 
2507       sum2 -= v2[nz] * sum1;
2508       sum3 -= v3[nz] * sum1;
2509       sum4 -= v4[nz] * sum1;
2510       sum5 -= v5[nz] * sum1;
2511       sum3 -= v3[nz+1] * sum2;
2512       sum4 -= v4[nz+1] * sum2;
2513       sum5 -= v5[nz+1] * sum2;
2514       sum4 -= v4[nz+2] * sum3;
2515       sum5 -= v5[nz+2] * sum3;
2516       sum5 -= v5[nz+3] * sum4;
2517 
2518       tmp[row++]=sum1;
2519       tmp[row++]=sum2;
2520       tmp[row++]=sum3;
2521       tmp[row++]=sum4;
2522       tmp[row++]=sum5;
2523       break;
2524     default:
2525       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2526     }
2527   }
2528   /* backward solve the upper triangular */
2529   for (i=node_max -1,row = n-1; i>=0; i--) {
2530     nsz = ns[i];
2531     aii = ad[row+1] + 1;
2532     v1  = aa + aii;
2533     vi  = aj + aii;
2534     nz  = ad[row]- ad[row+1] - 1;
2535 
2536     if (i > 0) {
2537       /* Prefetch the indices for the next block */
2538       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2539       /* Prefetch the data for the next block */
2540       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2541     }
2542 
2543     switch (nsz) {               /* Each loop in 'case' is unrolled */
2544     case 1:
2545       sum1 = tmp[row];
2546 
2547       for (j=0; j<nz-1; j+=2) {
2548         i0    = vi[j];
2549         i1    = vi[j+1];
2550         tmp0  = tmps[i0];
2551         tmp1  = tmps[i1];
2552         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2553       }
2554       if (j == nz-1) {
2555         tmp0  = tmps[vi[j]];
2556         sum1 -= v1[j]*tmp0;
2557       }
2558       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2559       break;
2560     case 2:
2561       sum1 = tmp[row];
2562       sum2 = tmp[row-1];
2563       v2   = aa + ad[row] + 1;
2564       for (j=0; j<nz-1; j+=2) {
2565         i0    = vi[j];
2566         i1    = vi[j+1];
2567         tmp0  = tmps[i0];
2568         tmp1  = tmps[i1];
2569         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2570         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2571       }
2572       if (j == nz-1) {
2573         tmp0  = tmps[vi[j]];
2574         sum1 -= v1[j]* tmp0;
2575         sum2 -= v2[j+1]* tmp0;
2576       }
2577 
2578       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2579       sum2     -= v2[0] * tmp0;
2580       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2581       break;
2582     case 3:
2583       sum1 = tmp[row];
2584       sum2 = tmp[row -1];
2585       sum3 = tmp[row -2];
2586       v2   = aa + ad[row] + 1;
2587       v3   = aa + ad[row -1] + 1;
2588       for (j=0; j<nz-1; j+=2) {
2589         i0    = vi[j];
2590         i1    = vi[j+1];
2591         tmp0  = tmps[i0];
2592         tmp1  = tmps[i1];
2593         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2594         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2595         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2596       }
2597       if (j== nz-1) {
2598         tmp0  = tmps[vi[j]];
2599         sum1 -= v1[j] * tmp0;
2600         sum2 -= v2[j+1] * tmp0;
2601         sum3 -= v3[j+2] * tmp0;
2602       }
2603       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604       sum2     -= v2[0]* tmp0;
2605       sum3     -= v3[1] * tmp0;
2606       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2607       sum3     -= v3[0]* tmp0;
2608       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2609 
2610       break;
2611     case 4:
2612       sum1 = tmp[row];
2613       sum2 = tmp[row -1];
2614       sum3 = tmp[row -2];
2615       sum4 = tmp[row -3];
2616       v2   = aa + ad[row]+1;
2617       v3   = aa + ad[row -1]+1;
2618       v4   = aa + ad[row -2]+1;
2619 
2620       for (j=0; j<nz-1; j+=2) {
2621         i0    = vi[j];
2622         i1    = vi[j+1];
2623         tmp0  = tmps[i0];
2624         tmp1  = tmps[i1];
2625         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2626         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629       }
2630       if (j== nz-1) {
2631         tmp0  = tmps[vi[j]];
2632         sum1 -= v1[j] * tmp0;
2633         sum2 -= v2[j+1] * tmp0;
2634         sum3 -= v3[j+2] * tmp0;
2635         sum4 -= v4[j+3] * tmp0;
2636       }
2637 
2638       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2639       sum2     -= v2[0] * tmp0;
2640       sum3     -= v3[1] * tmp0;
2641       sum4     -= v4[2] * tmp0;
2642       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2643       sum3     -= v3[0] * tmp0;
2644       sum4     -= v4[1] * tmp0;
2645       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2646       sum4     -= v4[0] * tmp0;
2647       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2648       break;
2649     case 5:
2650       sum1 = tmp[row];
2651       sum2 = tmp[row -1];
2652       sum3 = tmp[row -2];
2653       sum4 = tmp[row -3];
2654       sum5 = tmp[row -4];
2655       v2   = aa + ad[row]+1;
2656       v3   = aa + ad[row -1]+1;
2657       v4   = aa + ad[row -2]+1;
2658       v5   = aa + ad[row -3]+1;
2659       for (j=0; j<nz-1; j+=2) {
2660         i0    = vi[j];
2661         i1    = vi[j+1];
2662         tmp0  = tmps[i0];
2663         tmp1  = tmps[i1];
2664         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2665         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2666         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2667         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2668         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2669       }
2670       if (j==nz-1) {
2671         tmp0  = tmps[vi[j]];
2672         sum1 -= v1[j] * tmp0;
2673         sum2 -= v2[j+1] * tmp0;
2674         sum3 -= v3[j+2] * tmp0;
2675         sum4 -= v4[j+3] * tmp0;
2676         sum5 -= v5[j+4] * tmp0;
2677       }
2678 
2679       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2680       sum2     -= v2[0] * tmp0;
2681       sum3     -= v3[1] * tmp0;
2682       sum4     -= v4[2] * tmp0;
2683       sum5     -= v5[3] * tmp0;
2684       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2685       sum3     -= v3[0] * tmp0;
2686       sum4     -= v4[1] * tmp0;
2687       sum5     -= v5[2] * tmp0;
2688       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2689       sum4     -= v4[0] * tmp0;
2690       sum5     -= v5[1] * tmp0;
2691       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2692       sum5     -= v5[0] * tmp0;
2693       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2694       break;
2695     default:
2696       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2697     }
2698   }
2699   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2700   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2701   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2702   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2703   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2704   PetscFunctionReturn(0);
2705 }
2706 
2707 
2708 /*
2709      Makes a longer coloring[] array and calls the usual code with that
2710 */
2711 #undef __FUNCT__
2712 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2713 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2714 {
2715   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2716   PetscErrorCode  ierr;
2717   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2718   PetscInt        *colorused,i;
2719   ISColoringValue *newcolor;
2720 
2721   PetscFunctionBegin;
2722   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2723   /* loop over inodes, marking a color for each column*/
2724   row = 0;
2725   for (i=0; i<m; i++) {
2726     for (j=0; j<ns[i]; j++) {
2727       newcolor[row++] = coloring[i] + j*ncolors;
2728     }
2729   }
2730 
2731   /* eliminate unneeded colors */
2732   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2733   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2734   for (i=0; i<n; i++) {
2735     colorused[newcolor[i]] = 1;
2736   }
2737 
2738   for (i=1; i<5*ncolors; i++) {
2739     colorused[i] += colorused[i-1];
2740   }
2741   ncolors = colorused[5*ncolors-1];
2742   for (i=0; i<n; i++) {
2743     newcolor[i] = colorused[newcolor[i]]-1;
2744   }
2745   ierr = PetscFree(colorused);CHKERRQ(ierr);
2746   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2747   ierr = PetscFree(coloring);CHKERRQ(ierr);
2748   PetscFunctionReturn(0);
2749 }
2750 
2751 #include <petsc-private/kernels/blockinvert.h>
2752 
2753 #undef __FUNCT__
2754 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2755 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2756 {
2757   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2758   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2759   MatScalar         *ibdiag,*bdiag,work[25],*t;
2760   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2761   const MatScalar   *v = a->a,*v1,*v2,*v3,*v4,*v5;
2762   const PetscScalar *xb, *b;
2763   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2764   PetscErrorCode    ierr;
2765   PetscInt          n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2766   PetscInt          *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2767 
2768   PetscFunctionBegin;
2769   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2770   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2771 
2772   if (!a->inode.ibdiagvalid) {
2773     if (!a->inode.ibdiag) {
2774       /* calculate space needed for diagonal blocks */
2775       for (i=0; i<m; i++) {
2776         cnt += sizes[i]*sizes[i];
2777       }
2778       a->inode.bdiagsize = cnt;
2779 
2780       ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2781     }
2782 
2783     /* copy over the diagonal blocks and invert them */
2784     ibdiag = a->inode.ibdiag;
2785     bdiag  = a->inode.bdiag;
2786     cnt    = 0;
2787     for (i=0, row = 0; i<m; i++) {
2788       for (j=0; j<sizes[i]; j++) {
2789         for (k=0; k<sizes[i]; k++) {
2790           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2791         }
2792       }
2793       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2794 
2795       switch (sizes[i]) {
2796       case 1:
2797         /* Create matrix data structure */
2798         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2799         ibdiag[cnt] = 1.0/ibdiag[cnt];
2800         break;
2801       case 2:
2802         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2803         break;
2804       case 3:
2805         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2806         break;
2807       case 4:
2808         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2809         break;
2810       case 5:
2811         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2812         break;
2813       default:
2814         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2815       }
2816       cnt += sizes[i]*sizes[i];
2817       row += sizes[i];
2818     }
2819     a->inode.ibdiagvalid = PETSC_TRUE;
2820   }
2821   ibdiag = a->inode.ibdiag;
2822   bdiag  = a->inode.bdiag;
2823   t      = a->inode.ssor_work;
2824 
2825   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2826   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2827   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2828   if (flag & SOR_ZERO_INITIAL_GUESS) {
2829     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2830 
2831       for (i=0, row=0; i<m; i++) {
2832         sz  = diag[row] - ii[row];
2833         v1  = a->a + ii[row];
2834         idx = a->j + ii[row];
2835 
2836         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2837         switch (sizes[i]) {
2838         case 1:
2839 
2840           sum1 = b[row];
2841           for (n = 0; n<sz-1; n+=2) {
2842             i1    = idx[0];
2843             i2    = idx[1];
2844             idx  += 2;
2845             tmp0  = x[i1];
2846             tmp1  = x[i2];
2847             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2848           }
2849 
2850           if (n == sz-1) {
2851             tmp0  = x[*idx];
2852             sum1 -= *v1 * tmp0;
2853           }
2854           t[row]   = sum1;
2855           x[row++] = sum1*(*ibdiag++);
2856           break;
2857         case 2:
2858           v2   = a->a + ii[row+1];
2859           sum1 = b[row];
2860           sum2 = b[row+1];
2861           for (n = 0; n<sz-1; n+=2) {
2862             i1    = idx[0];
2863             i2    = idx[1];
2864             idx  += 2;
2865             tmp0  = x[i1];
2866             tmp1  = x[i2];
2867             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2868             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2869           }
2870 
2871           if (n == sz-1) {
2872             tmp0  = x[*idx];
2873             sum1 -= v1[0] * tmp0;
2874             sum2 -= v2[0] * tmp0;
2875           }
2876           t[row]   = sum1;
2877           t[row+1] = sum2;
2878           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2879           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2880           ibdiag  += 4;
2881           break;
2882         case 3:
2883           v2   = a->a + ii[row+1];
2884           v3   = a->a + ii[row+2];
2885           sum1 = b[row];
2886           sum2 = b[row+1];
2887           sum3 = b[row+2];
2888           for (n = 0; n<sz-1; n+=2) {
2889             i1    = idx[0];
2890             i2    = idx[1];
2891             idx  += 2;
2892             tmp0  = x[i1];
2893             tmp1  = x[i2];
2894             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2895             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2896             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2897           }
2898 
2899           if (n == sz-1) {
2900             tmp0  = x[*idx];
2901             sum1 -= v1[0] * tmp0;
2902             sum2 -= v2[0] * tmp0;
2903             sum3 -= v3[0] * tmp0;
2904           }
2905           t[row]   = sum1;
2906           t[row+1] = sum2;
2907           t[row+2] = sum3;
2908           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2909           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2910           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2911           ibdiag  += 9;
2912           break;
2913         case 4:
2914           v2   = a->a + ii[row+1];
2915           v3   = a->a + ii[row+2];
2916           v4   = a->a + ii[row+3];
2917           sum1 = b[row];
2918           sum2 = b[row+1];
2919           sum3 = b[row+2];
2920           sum4 = b[row+3];
2921           for (n = 0; n<sz-1; n+=2) {
2922             i1    = idx[0];
2923             i2    = idx[1];
2924             idx  += 2;
2925             tmp0  = x[i1];
2926             tmp1  = x[i2];
2927             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2928             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2929             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2930             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2931           }
2932 
2933           if (n == sz-1) {
2934             tmp0  = x[*idx];
2935             sum1 -= v1[0] * tmp0;
2936             sum2 -= v2[0] * tmp0;
2937             sum3 -= v3[0] * tmp0;
2938             sum4 -= v4[0] * tmp0;
2939           }
2940           t[row]   = sum1;
2941           t[row+1] = sum2;
2942           t[row+2] = sum3;
2943           t[row+3] = sum4;
2944           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2945           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2946           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2947           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2948           ibdiag  += 16;
2949           break;
2950         case 5:
2951           v2   = a->a + ii[row+1];
2952           v3   = a->a + ii[row+2];
2953           v4   = a->a + ii[row+3];
2954           v5   = a->a + ii[row+4];
2955           sum1 = b[row];
2956           sum2 = b[row+1];
2957           sum3 = b[row+2];
2958           sum4 = b[row+3];
2959           sum5 = b[row+4];
2960           for (n = 0; n<sz-1; n+=2) {
2961             i1    = idx[0];
2962             i2    = idx[1];
2963             idx  += 2;
2964             tmp0  = x[i1];
2965             tmp1  = x[i2];
2966             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2967             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2968             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2969             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2970             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2971           }
2972 
2973           if (n == sz-1) {
2974             tmp0  = x[*idx];
2975             sum1 -= v1[0] * tmp0;
2976             sum2 -= v2[0] * tmp0;
2977             sum3 -= v3[0] * tmp0;
2978             sum4 -= v4[0] * tmp0;
2979             sum5 -= v5[0] * tmp0;
2980           }
2981           t[row]   = sum1;
2982           t[row+1] = sum2;
2983           t[row+2] = sum3;
2984           t[row+3] = sum4;
2985           t[row+4] = sum5;
2986           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2987           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2988           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2989           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2990           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2991           ibdiag  += 25;
2992           break;
2993         default:
2994           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2995         }
2996       }
2997 
2998       xb   = t;
2999       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3000     } else xb = b;
3001     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3002 
3003       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3004       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3005         ibdiag -= sizes[i]*sizes[i];
3006         sz      = ii[row+1] - diag[row] - 1;
3007         v1      = a->a + diag[row] + 1;
3008         idx     = a->j + diag[row] + 1;
3009 
3010         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3011         switch (sizes[i]) {
3012         case 1:
3013 
3014           sum1 = xb[row];
3015           for (n = 0; n<sz-1; n+=2) {
3016             i1    = idx[0];
3017             i2    = idx[1];
3018             idx  += 2;
3019             tmp0  = x[i1];
3020             tmp1  = x[i2];
3021             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3022           }
3023 
3024           if (n == sz-1) {
3025             tmp0  = x[*idx];
3026             sum1 -= *v1*tmp0;
3027           }
3028           x[row--] = sum1*(*ibdiag);
3029           break;
3030 
3031         case 2:
3032 
3033           sum1 = xb[row];
3034           sum2 = xb[row-1];
3035           /* note that sum1 is associated with the second of the two rows */
3036           v2 = a->a + diag[row-1] + 2;
3037           for (n = 0; n<sz-1; n+=2) {
3038             i1    = idx[0];
3039             i2    = idx[1];
3040             idx  += 2;
3041             tmp0  = x[i1];
3042             tmp1  = x[i2];
3043             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3044             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3045           }
3046 
3047           if (n == sz-1) {
3048             tmp0  = x[*idx];
3049             sum1 -= *v1*tmp0;
3050             sum2 -= *v2*tmp0;
3051           }
3052           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3053           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3054           break;
3055         case 3:
3056 
3057           sum1 = xb[row];
3058           sum2 = xb[row-1];
3059           sum3 = xb[row-2];
3060           v2   = a->a + diag[row-1] + 2;
3061           v3   = a->a + diag[row-2] + 3;
3062           for (n = 0; n<sz-1; n+=2) {
3063             i1    = idx[0];
3064             i2    = idx[1];
3065             idx  += 2;
3066             tmp0  = x[i1];
3067             tmp1  = x[i2];
3068             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3069             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3070             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3071           }
3072 
3073           if (n == sz-1) {
3074             tmp0  = x[*idx];
3075             sum1 -= *v1*tmp0;
3076             sum2 -= *v2*tmp0;
3077             sum3 -= *v3*tmp0;
3078           }
3079           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3080           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3081           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3082           break;
3083         case 4:
3084 
3085           sum1 = xb[row];
3086           sum2 = xb[row-1];
3087           sum3 = xb[row-2];
3088           sum4 = xb[row-3];
3089           v2   = a->a + diag[row-1] + 2;
3090           v3   = a->a + diag[row-2] + 3;
3091           v4   = a->a + diag[row-3] + 4;
3092           for (n = 0; n<sz-1; n+=2) {
3093             i1    = idx[0];
3094             i2    = idx[1];
3095             idx  += 2;
3096             tmp0  = x[i1];
3097             tmp1  = x[i2];
3098             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3099             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3100             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3101             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3102           }
3103 
3104           if (n == sz-1) {
3105             tmp0  = x[*idx];
3106             sum1 -= *v1*tmp0;
3107             sum2 -= *v2*tmp0;
3108             sum3 -= *v3*tmp0;
3109             sum4 -= *v4*tmp0;
3110           }
3111           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3112           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3113           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3114           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3115           break;
3116         case 5:
3117 
3118           sum1 = xb[row];
3119           sum2 = xb[row-1];
3120           sum3 = xb[row-2];
3121           sum4 = xb[row-3];
3122           sum5 = xb[row-4];
3123           v2   = a->a + diag[row-1] + 2;
3124           v3   = a->a + diag[row-2] + 3;
3125           v4   = a->a + diag[row-3] + 4;
3126           v5   = a->a + diag[row-4] + 5;
3127           for (n = 0; n<sz-1; n+=2) {
3128             i1    = idx[0];
3129             i2    = idx[1];
3130             idx  += 2;
3131             tmp0  = x[i1];
3132             tmp1  = x[i2];
3133             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3134             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3135             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3136             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3137             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3138           }
3139 
3140           if (n == sz-1) {
3141             tmp0  = x[*idx];
3142             sum1 -= *v1*tmp0;
3143             sum2 -= *v2*tmp0;
3144             sum3 -= *v3*tmp0;
3145             sum4 -= *v4*tmp0;
3146             sum5 -= *v5*tmp0;
3147           }
3148           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3149           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3150           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3151           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3152           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3153           break;
3154         default:
3155           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3156         }
3157       }
3158 
3159       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3160     }
3161     its--;
3162   }
3163   while (its--) {
3164 
3165     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3166       ibdiag = a->inode.ibdiag;
3167 
3168       for (i=0, row=0; i<m; i++) {
3169         sz  = ii[row + 1] - ii[row];
3170         v1  = a->a + ii[row];
3171         idx = a->j + ii[row];
3172 
3173         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3174         switch (sizes[i]) {
3175         case 1:
3176 
3177           sum1 = b[row];
3178           for (n = 0; n<sz-1; n+=2) {
3179             i1    = idx[0];
3180             i2    = idx[1];
3181             idx  += 2;
3182             tmp0  = x[i1];
3183             tmp1  = x[i2];
3184             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3185           }
3186 
3187           if (n == sz-1) {
3188             tmp0  = x[*idx];
3189             sum1 -= *v1 * tmp0;
3190           }
3191 
3192           /* in MatSOR_SeqAIJ this line would be
3193            *
3194            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3195            *
3196            * but omega == 1, so this becomes
3197            *
3198            * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++);
3199            *
3200            * but bdiag and ibdiag cancel each other, so we can change this
3201            * to adding sum1*(*ibdiag++).  We can skip bdiag for the larger
3202            * block sizes as well
3203            */
3204           x[row++] += sum1*(*ibdiag++);
3205           break;
3206         case 2:
3207           v2   = a->a + ii[row+1];
3208           sum1 = b[row];
3209           sum2 = b[row+1];
3210           for (n = 0; n<sz-1; n+=2) {
3211             i1    = idx[0];
3212             i2    = idx[1];
3213             idx  += 2;
3214             tmp0  = x[i1];
3215             tmp1  = x[i2];
3216             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3217             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3218           }
3219 
3220           if (n == sz-1) {
3221             tmp0  = x[*idx];
3222             sum1 -= v1[0] * tmp0;
3223             sum2 -= v2[0] * tmp0;
3224           }
3225           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2];
3226           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3];
3227           ibdiag   += 4;
3228           break;
3229         case 3:
3230           v2   = a->a + ii[row+1];
3231           v3   = a->a + ii[row+2];
3232           sum1 = b[row];
3233           sum2 = b[row+1];
3234           sum3 = b[row+2];
3235           for (n = 0; n<sz-1; n+=2) {
3236             i1    = idx[0];
3237             i2    = idx[1];
3238             idx  += 2;
3239             tmp0  = x[i1];
3240             tmp1  = x[i2];
3241             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3242             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3243             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3244           }
3245 
3246           if (n == sz-1) {
3247             tmp0  = x[*idx];
3248             sum1 -= v1[0] * tmp0;
3249             sum2 -= v2[0] * tmp0;
3250             sum3 -= v3[0] * tmp0;
3251           }
3252           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3253           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3254           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3255           ibdiag   += 9;
3256           break;
3257         case 4:
3258           v2   = a->a + ii[row+1];
3259           v3   = a->a + ii[row+2];
3260           v4   = a->a + ii[row+3];
3261           sum1 = b[row];
3262           sum2 = b[row+1];
3263           sum3 = b[row+2];
3264           sum4 = b[row+3];
3265           for (n = 0; n<sz-1; n+=2) {
3266             i1    = idx[0];
3267             i2    = idx[1];
3268             idx  += 2;
3269             tmp0  = x[i1];
3270             tmp1  = x[i2];
3271             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3272             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3273             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3274             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3275           }
3276 
3277           if (n == sz-1) {
3278             tmp0  = x[*idx];
3279             sum1 -= v1[0] * tmp0;
3280             sum2 -= v2[0] * tmp0;
3281             sum3 -= v3[0] * tmp0;
3282             sum4 -= v4[0] * tmp0;
3283           }
3284           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3285           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3286           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3287           x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3288           ibdiag   += 16;
3289           break;
3290         case 5:
3291           v2   = a->a + ii[row+1];
3292           v3   = a->a + ii[row+2];
3293           v4   = a->a + ii[row+3];
3294           v5   = a->a + ii[row+4];
3295           sum1 = b[row];
3296           sum2 = b[row+1];
3297           sum3 = b[row+2];
3298           sum4 = b[row+3];
3299           sum5 = b[row+4];
3300           for (n = 0; n<sz-1; n+=2) {
3301             i1    = idx[0];
3302             i2    = idx[1];
3303             idx  += 2;
3304             tmp0  = x[i1];
3305             tmp1  = x[i2];
3306             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3307             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3308             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3309             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3310             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3311           }
3312 
3313           if (n == sz-1) {
3314             tmp0  = x[*idx];
3315             sum1 -= v1[0] * tmp0;
3316             sum2 -= v2[0] * tmp0;
3317             sum3 -= v3[0] * tmp0;
3318             sum4 -= v4[0] * tmp0;
3319             sum5 -= v5[0] * tmp0;
3320           }
3321           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3322           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3323           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3324           x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3325           x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3326           ibdiag   += 25;
3327           break;
3328         default:
3329           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3330         }
3331       }
3332 
3333       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3334     }
3335     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3336 
3337       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3338       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3339         ibdiag -= sizes[i]*sizes[i];
3340         sz      = ii[row+1] - ii[row];
3341         v1      = a->a + ii[row];
3342         idx     = a->j + ii[row];
3343 
3344         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3345         switch (sizes[i]) {
3346         case 1:
3347 
3348           sum1 = b[row];
3349           for (n = 0; n<sz-1; n+=2) {
3350             i1    = idx[0];
3351             i2    = idx[1];
3352             idx  += 2;
3353             tmp0  = x[i1];
3354             tmp1  = x[i2];
3355             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3356           }
3357 
3358           if (n == sz-1) {
3359             tmp0  = x[*idx];
3360             sum1 -= *v1*tmp0;
3361           }
3362           x[row--] += sum1*(*ibdiag);
3363           break;
3364 
3365         case 2:
3366 
3367           sum1 = b[row];
3368           sum2 = b[row-1];
3369           /* note that sum1 is associated with the second of the two rows */
3370           v2 = a->a + ii[row - 1];
3371           for (n = 0; n<sz-1; n+=2) {
3372             i1    = idx[0];
3373             i2    = idx[1];
3374             idx  += 2;
3375             tmp0  = x[i1];
3376             tmp1  = x[i2];
3377             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3378             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3379           }
3380 
3381           if (n == sz-1) {
3382             tmp0  = x[*idx];
3383             sum1 -= *v1*tmp0;
3384             sum2 -= *v2*tmp0;
3385           }
3386           x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3387           x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3388           break;
3389         case 3:
3390 
3391           sum1 = b[row];
3392           sum2 = b[row-1];
3393           sum3 = b[row-2];
3394           v2   = a->a + ii[row-1];
3395           v3   = a->a + ii[row-2];
3396           for (n = 0; n<sz-1; n+=2) {
3397             i1    = idx[0];
3398             i2    = idx[1];
3399             idx  += 2;
3400             tmp0  = x[i1];
3401             tmp1  = x[i2];
3402             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3403             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3404             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3405           }
3406 
3407           if (n == sz-1) {
3408             tmp0  = x[*idx];
3409             sum1 -= *v1*tmp0;
3410             sum2 -= *v2*tmp0;
3411             sum3 -= *v3*tmp0;
3412           }
3413           x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3414           x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3415           x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3416           break;
3417         case 4:
3418 
3419           sum1 = b[row];
3420           sum2 = b[row-1];
3421           sum3 = b[row-2];
3422           sum4 = b[row-3];
3423           v2   = a->a + ii[row-1];
3424           v3   = a->a + ii[row-2];
3425           v4   = a->a + ii[row-3];
3426           for (n = 0; n<sz-1; n+=2) {
3427             i1    = idx[0];
3428             i2    = idx[1];
3429             idx  += 2;
3430             tmp0  = x[i1];
3431             tmp1  = x[i2];
3432             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3433             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3434             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3435             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3436           }
3437 
3438           if (n == sz-1) {
3439             tmp0  = x[*idx];
3440             sum1 -= *v1*tmp0;
3441             sum2 -= *v2*tmp0;
3442             sum3 -= *v3*tmp0;
3443             sum4 -= *v4*tmp0;
3444           }
3445           x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3446           x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3447           x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3448           x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3449           break;
3450         case 5:
3451 
3452           sum1 = b[row];
3453           sum2 = b[row-1];
3454           sum3 = b[row-2];
3455           sum4 = b[row-3];
3456           sum5 = b[row-4];
3457           v2   = a->a + ii[row-1];
3458           v3   = a->a + ii[row-2];
3459           v4   = a->a + ii[row-3];
3460           v5   = a->a + ii[row-4];
3461           for (n = 0; n<sz-1; n+=2) {
3462             i1    = idx[0];
3463             i2    = idx[1];
3464             idx  += 2;
3465             tmp0  = x[i1];
3466             tmp1  = x[i2];
3467             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3468             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3469             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3470             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3471             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3472           }
3473 
3474           if (n == sz-1) {
3475             tmp0  = x[*idx];
3476             sum1 -= *v1*tmp0;
3477             sum2 -= *v2*tmp0;
3478             sum3 -= *v3*tmp0;
3479             sum4 -= *v4*tmp0;
3480             sum5 -= *v5*tmp0;
3481           }
3482           x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3483           x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3484           x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3485           x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3486           x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3487           break;
3488         default:
3489           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3490         }
3491       }
3492 
3493       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3494     }
3495   }
3496   if (flag & SOR_EISENSTAT) {
3497     /*
3498           Apply  (U + D)^-1  where D is now the block diagonal
3499     */
3500     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3501     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3502       ibdiag -= sizes[i]*sizes[i];
3503       sz      = ii[row+1] - diag[row] - 1;
3504       v1      = a->a + diag[row] + 1;
3505       idx     = a->j + diag[row] + 1;
3506       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3507       switch (sizes[i]) {
3508       case 1:
3509 
3510         sum1 = b[row];
3511         for (n = 0; n<sz-1; n+=2) {
3512           i1    = idx[0];
3513           i2    = idx[1];
3514           idx  += 2;
3515           tmp0  = x[i1];
3516           tmp1  = x[i2];
3517           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3518         }
3519 
3520         if (n == sz-1) {
3521           tmp0  = x[*idx];
3522           sum1 -= *v1*tmp0;
3523         }
3524         x[row] = sum1*(*ibdiag);row--;
3525         break;
3526 
3527       case 2:
3528 
3529         sum1 = b[row];
3530         sum2 = b[row-1];
3531         /* note that sum1 is associated with the second of the two rows */
3532         v2 = a->a + diag[row-1] + 2;
3533         for (n = 0; n<sz-1; n+=2) {
3534           i1    = idx[0];
3535           i2    = idx[1];
3536           idx  += 2;
3537           tmp0  = x[i1];
3538           tmp1  = x[i2];
3539           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3540           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3541         }
3542 
3543         if (n == sz-1) {
3544           tmp0  = x[*idx];
3545           sum1 -= *v1*tmp0;
3546           sum2 -= *v2*tmp0;
3547         }
3548         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3549         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3550         row     -= 2;
3551         break;
3552       case 3:
3553 
3554         sum1 = b[row];
3555         sum2 = b[row-1];
3556         sum3 = b[row-2];
3557         v2   = a->a + diag[row-1] + 2;
3558         v3   = a->a + diag[row-2] + 3;
3559         for (n = 0; n<sz-1; n+=2) {
3560           i1    = idx[0];
3561           i2    = idx[1];
3562           idx  += 2;
3563           tmp0  = x[i1];
3564           tmp1  = x[i2];
3565           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3566           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3567           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3568         }
3569 
3570         if (n == sz-1) {
3571           tmp0  = x[*idx];
3572           sum1 -= *v1*tmp0;
3573           sum2 -= *v2*tmp0;
3574           sum3 -= *v3*tmp0;
3575         }
3576         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3577         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3578         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3579         row     -= 3;
3580         break;
3581       case 4:
3582 
3583         sum1 = b[row];
3584         sum2 = b[row-1];
3585         sum3 = b[row-2];
3586         sum4 = b[row-3];
3587         v2   = a->a + diag[row-1] + 2;
3588         v3   = a->a + diag[row-2] + 3;
3589         v4   = a->a + diag[row-3] + 4;
3590         for (n = 0; n<sz-1; n+=2) {
3591           i1    = idx[0];
3592           i2    = idx[1];
3593           idx  += 2;
3594           tmp0  = x[i1];
3595           tmp1  = x[i2];
3596           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3597           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3598           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3599           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3600         }
3601 
3602         if (n == sz-1) {
3603           tmp0  = x[*idx];
3604           sum1 -= *v1*tmp0;
3605           sum2 -= *v2*tmp0;
3606           sum3 -= *v3*tmp0;
3607           sum4 -= *v4*tmp0;
3608         }
3609         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3610         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3611         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3612         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3613         row     -= 4;
3614         break;
3615       case 5:
3616 
3617         sum1 = b[row];
3618         sum2 = b[row-1];
3619         sum3 = b[row-2];
3620         sum4 = b[row-3];
3621         sum5 = b[row-4];
3622         v2   = a->a + diag[row-1] + 2;
3623         v3   = a->a + diag[row-2] + 3;
3624         v4   = a->a + diag[row-3] + 4;
3625         v5   = a->a + diag[row-4] + 5;
3626         for (n = 0; n<sz-1; n+=2) {
3627           i1    = idx[0];
3628           i2    = idx[1];
3629           idx  += 2;
3630           tmp0  = x[i1];
3631           tmp1  = x[i2];
3632           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3633           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3634           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3635           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3636           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3637         }
3638 
3639         if (n == sz-1) {
3640           tmp0  = x[*idx];
3641           sum1 -= *v1*tmp0;
3642           sum2 -= *v2*tmp0;
3643           sum3 -= *v3*tmp0;
3644           sum4 -= *v4*tmp0;
3645           sum5 -= *v5*tmp0;
3646         }
3647         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3648         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3649         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3650         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3651         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3652         row     -= 5;
3653         break;
3654       default:
3655         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3656       }
3657     }
3658     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3659 
3660     /*
3661            t = b - D x    where D is the block diagonal
3662     */
3663     cnt = 0;
3664     for (i=0, row=0; i<m; i++) {
3665       switch (sizes[i]) {
3666       case 1:
3667         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3668         break;
3669       case 2:
3670         x1       = x[row]; x2 = x[row+1];
3671         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3672         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3673         t[row]   = b[row] - tmp1;
3674         t[row+1] = b[row+1] - tmp2; row += 2;
3675         cnt     += 4;
3676         break;
3677       case 3:
3678         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3679         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3680         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3681         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3682         t[row]   = b[row] - tmp1;
3683         t[row+1] = b[row+1] - tmp2;
3684         t[row+2] = b[row+2] - tmp3; row += 3;
3685         cnt     += 9;
3686         break;
3687       case 4:
3688         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3689         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3690         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3691         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3692         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3693         t[row]   = b[row] - tmp1;
3694         t[row+1] = b[row+1] - tmp2;
3695         t[row+2] = b[row+2] - tmp3;
3696         t[row+3] = b[row+3] - tmp4; row += 4;
3697         cnt     += 16;
3698         break;
3699       case 5:
3700         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3701         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3702         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3703         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3704         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3705         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3706         t[row]   = b[row] - tmp1;
3707         t[row+1] = b[row+1] - tmp2;
3708         t[row+2] = b[row+2] - tmp3;
3709         t[row+3] = b[row+3] - tmp4;
3710         t[row+4] = b[row+4] - tmp5;row += 5;
3711         cnt     += 25;
3712         break;
3713       default:
3714         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3715       }
3716     }
3717     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3718 
3719 
3720 
3721     /*
3722           Apply (L + D)^-1 where D is the block diagonal
3723     */
3724     for (i=0, row=0; i<m; i++) {
3725       sz  = diag[row] - ii[row];
3726       v1  = a->a + ii[row];
3727       idx = a->j + ii[row];
3728       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3729       switch (sizes[i]) {
3730       case 1:
3731 
3732         sum1 = t[row];
3733         for (n = 0; n<sz-1; n+=2) {
3734           i1    = idx[0];
3735           i2    = idx[1];
3736           idx  += 2;
3737           tmp0  = t[i1];
3738           tmp1  = t[i2];
3739           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3740         }
3741 
3742         if (n == sz-1) {
3743           tmp0  = t[*idx];
3744           sum1 -= *v1 * tmp0;
3745         }
3746         x[row] += t[row] = sum1*(*ibdiag++); row++;
3747         break;
3748       case 2:
3749         v2   = a->a + ii[row+1];
3750         sum1 = t[row];
3751         sum2 = t[row+1];
3752         for (n = 0; n<sz-1; n+=2) {
3753           i1    = idx[0];
3754           i2    = idx[1];
3755           idx  += 2;
3756           tmp0  = t[i1];
3757           tmp1  = t[i2];
3758           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3759           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3760         }
3761 
3762         if (n == sz-1) {
3763           tmp0  = t[*idx];
3764           sum1 -= v1[0] * tmp0;
3765           sum2 -= v2[0] * tmp0;
3766         }
3767         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3768         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3769         ibdiag   += 4; row += 2;
3770         break;
3771       case 3:
3772         v2   = a->a + ii[row+1];
3773         v3   = a->a + ii[row+2];
3774         sum1 = t[row];
3775         sum2 = t[row+1];
3776         sum3 = t[row+2];
3777         for (n = 0; n<sz-1; n+=2) {
3778           i1    = idx[0];
3779           i2    = idx[1];
3780           idx  += 2;
3781           tmp0  = t[i1];
3782           tmp1  = t[i2];
3783           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3784           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3785           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3786         }
3787 
3788         if (n == sz-1) {
3789           tmp0  = t[*idx];
3790           sum1 -= v1[0] * tmp0;
3791           sum2 -= v2[0] * tmp0;
3792           sum3 -= v3[0] * tmp0;
3793         }
3794         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3795         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3796         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3797         ibdiag   += 9; row += 3;
3798         break;
3799       case 4:
3800         v2   = a->a + ii[row+1];
3801         v3   = a->a + ii[row+2];
3802         v4   = a->a + ii[row+3];
3803         sum1 = t[row];
3804         sum2 = t[row+1];
3805         sum3 = t[row+2];
3806         sum4 = t[row+3];
3807         for (n = 0; n<sz-1; n+=2) {
3808           i1    = idx[0];
3809           i2    = idx[1];
3810           idx  += 2;
3811           tmp0  = t[i1];
3812           tmp1  = t[i2];
3813           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3814           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3815           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3816           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3817         }
3818 
3819         if (n == sz-1) {
3820           tmp0  = t[*idx];
3821           sum1 -= v1[0] * tmp0;
3822           sum2 -= v2[0] * tmp0;
3823           sum3 -= v3[0] * tmp0;
3824           sum4 -= v4[0] * tmp0;
3825         }
3826         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3827         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3828         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3829         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3830         ibdiag   += 16; row += 4;
3831         break;
3832       case 5:
3833         v2   = a->a + ii[row+1];
3834         v3   = a->a + ii[row+2];
3835         v4   = a->a + ii[row+3];
3836         v5   = a->a + ii[row+4];
3837         sum1 = t[row];
3838         sum2 = t[row+1];
3839         sum3 = t[row+2];
3840         sum4 = t[row+3];
3841         sum5 = t[row+4];
3842         for (n = 0; n<sz-1; n+=2) {
3843           i1    = idx[0];
3844           i2    = idx[1];
3845           idx  += 2;
3846           tmp0  = t[i1];
3847           tmp1  = t[i2];
3848           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3849           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3850           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3851           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3852           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3853         }
3854 
3855         if (n == sz-1) {
3856           tmp0  = t[*idx];
3857           sum1 -= v1[0] * tmp0;
3858           sum2 -= v2[0] * tmp0;
3859           sum3 -= v3[0] * tmp0;
3860           sum4 -= v4[0] * tmp0;
3861           sum5 -= v5[0] * tmp0;
3862         }
3863         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3864         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3865         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3866         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3867         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3868         ibdiag   += 25; row += 5;
3869         break;
3870       default:
3871         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3872       }
3873     }
3874     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3875   }
3876   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3877   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3878   PetscFunctionReturn(0);
3879 }
3880 
3881 #undef __FUNCT__
3882 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3883 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3884 {
3885   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3886   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3887   const MatScalar   *bdiag = a->inode.bdiag;
3888   const PetscScalar *b;
3889   PetscErrorCode    ierr;
3890   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3891   const PetscInt    *sizes = a->inode.size;
3892 
3893   PetscFunctionBegin;
3894   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3895   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3896   cnt  = 0;
3897   for (i=0, row=0; i<m; i++) {
3898     switch (sizes[i]) {
3899     case 1:
3900       x[row] = b[row]*bdiag[cnt++];row++;
3901       break;
3902     case 2:
3903       x1       = b[row]; x2 = b[row+1];
3904       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3905       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3906       x[row++] = tmp1;
3907       x[row++] = tmp2;
3908       cnt     += 4;
3909       break;
3910     case 3:
3911       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3912       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3913       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3914       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3915       x[row++] = tmp1;
3916       x[row++] = tmp2;
3917       x[row++] = tmp3;
3918       cnt     += 9;
3919       break;
3920     case 4:
3921       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3922       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3923       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3924       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3925       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3926       x[row++] = tmp1;
3927       x[row++] = tmp2;
3928       x[row++] = tmp3;
3929       x[row++] = tmp4;
3930       cnt     += 16;
3931       break;
3932     case 5:
3933       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3934       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3935       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3936       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3937       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3938       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3939       x[row++] = tmp1;
3940       x[row++] = tmp2;
3941       x[row++] = tmp3;
3942       x[row++] = tmp4;
3943       x[row++] = tmp5;
3944       cnt     += 25;
3945       break;
3946     default:
3947       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3948     }
3949   }
3950   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3951   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3952   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3953   PetscFunctionReturn(0);
3954 }
3955 
3956 /*
3957     samestructure indicates that the matrix has not changed its nonzero structure so we
3958     do not need to recompute the inodes
3959 */
3960 #undef __FUNCT__
3961 #define __FUNCT__ "Mat_CheckInode"
3962 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure)
3963 {
3964   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3965   PetscErrorCode ierr;
3966   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3967   PetscBool      flag;
3968 
3969   PetscFunctionBegin;
3970   if (!a->inode.use) PetscFunctionReturn(0);
3971   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3972 
3973 
3974   m = A->rmap->n;
3975   if (a->inode.size) ns = a->inode.size;
3976   else {
3977     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
3978   }
3979 
3980   i          = 0;
3981   node_count = 0;
3982   idx        = a->j;
3983   ii         = a->i;
3984   while (i < m) {                /* For each row */
3985     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3986     /* Limits the number of elements in a node to 'a->inode.limit' */
3987     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3988       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
3989       if (nzy != nzx) break;
3990       idy += nzx;              /* Same nonzero pattern */
3991       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3992       if (!flag) break;
3993     }
3994     ns[node_count++] = blk_size;
3995     idx             += blk_size*nzx;
3996     i                = j;
3997   }
3998   /* If not enough inodes found,, do not use inode version of the routines */
3999   if (!m || node_count > .8*m) {
4000     ierr = PetscFree(ns);CHKERRQ(ierr);
4001 
4002     a->inode.node_count       = 0;
4003     a->inode.size             = NULL;
4004     a->inode.use              = PETSC_FALSE;
4005     A->ops->mult              = MatMult_SeqAIJ;
4006     A->ops->sor               = MatSOR_SeqAIJ;
4007     A->ops->multadd           = MatMultAdd_SeqAIJ;
4008     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4009     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4010     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4011     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4012     A->ops->coloringpatch     = 0;
4013     A->ops->multdiagonalblock = 0;
4014 
4015     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4016   } else {
4017     if (!A->factortype) {
4018       A->ops->mult              = MatMult_SeqAIJ_Inode;
4019       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4020       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4021       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4022       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4023       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4024       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4025       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4026       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4027     } else {
4028       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4029     }
4030     a->inode.node_count = node_count;
4031     a->inode.size       = ns;
4032     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4033   }
4034   a->inode.checked = PETSC_TRUE;
4035   PetscFunctionReturn(0);
4036 }
4037 
4038 #undef __FUNCT__
4039 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4040 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4041 {
4042   Mat            B =*C;
4043   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4044   PetscErrorCode ierr;
4045   PetscInt       m=A->rmap->n;
4046 
4047   PetscFunctionBegin;
4048   c->inode.use       = a->inode.use;
4049   c->inode.limit     = a->inode.limit;
4050   c->inode.max_limit = a->inode.max_limit;
4051   if (a->inode.size) {
4052     ierr                = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr);
4053     c->inode.node_count = a->inode.node_count;
4054     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4055     /* note the table of functions below should match that in Mat_CheckInode() */
4056     if (!B->factortype) {
4057       B->ops->mult              = MatMult_SeqAIJ_Inode;
4058       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4059       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4060       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4061       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4062       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4063       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4064       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4065       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4066     } else {
4067       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4068     }
4069   } else {
4070     c->inode.size       = 0;
4071     c->inode.node_count = 0;
4072   }
4073   c->inode.ibdiagvalid = PETSC_FALSE;
4074   c->inode.ibdiag      = 0;
4075   c->inode.bdiag       = 0;
4076   PetscFunctionReturn(0);
4077 }
4078 
4079 #undef __FUNCT__
4080 #define __FUNCT__ "MatGetRow_FactoredLU"
4081 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
4082 {
4083   PetscInt k, *vi;
4084 
4085   PetscFunctionBegin;
4086   vi = aj + ai[row];
4087   for (k=0; k<nzl; k++) cols[k] = vi[k];
4088   vi        = aj + adiag[row];
4089   cols[nzl] = vi[0];
4090   vi        = aj + adiag[row+1]+1;
4091   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4092   PetscFunctionReturn(0);
4093 }
4094 /*
4095    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4096    Modified from Mat_CheckInode().
4097 
4098    Input Parameters:
4099 +  Mat A - ILU or LU matrix factor
4100 -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
4101     do not need to recompute the inodes
4102 */
4103 #undef __FUNCT__
4104 #define __FUNCT__ "Mat_CheckInode_FactorLU"
4105 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure)
4106 {
4107   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4108   PetscErrorCode ierr;
4109   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4110   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
4111   PetscBool      flag;
4112 
4113   PetscFunctionBegin;
4114   if (!a->inode.use)                     PetscFunctionReturn(0);
4115   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
4116 
4117   m = A->rmap->n;
4118   if (a->inode.size) ns = a->inode.size;
4119   else {
4120     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
4121   }
4122 
4123   i          = 0;
4124   node_count = 0;
4125   while (i < m) {                /* For each row */
4126     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4127     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4128     nzx  = nzl1 + nzu1 + 1;
4129     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
4130     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4131 
4132     /* Limits the number of elements in a node to 'a->inode.limit' */
4133     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4134       nzl2 = ai[j+1] - ai[j];
4135       nzu2 = adiag[j] - adiag[j+1] - 1;
4136       nzy  = nzl2 + nzu2 + 1;
4137       if (nzy != nzx) break;
4138       ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
4139       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4140       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4141       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
4142       ierr = PetscFree(cols2);CHKERRQ(ierr);
4143     }
4144     ns[node_count++] = blk_size;
4145     ierr             = PetscFree(cols1);CHKERRQ(ierr);
4146     i                = j;
4147   }
4148   /* If not enough inodes found,, do not use inode version of the routines */
4149   if (!m || node_count > .8*m) {
4150     ierr = PetscFree(ns);CHKERRQ(ierr);
4151 
4152     a->inode.node_count = 0;
4153     a->inode.size       = NULL;
4154     a->inode.use        = PETSC_FALSE;
4155 
4156     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4157   } else {
4158     A->ops->mult              = 0;
4159     A->ops->sor               = 0;
4160     A->ops->multadd           = 0;
4161     A->ops->getrowij          = 0;
4162     A->ops->restorerowij      = 0;
4163     A->ops->getcolumnij       = 0;
4164     A->ops->restorecolumnij   = 0;
4165     A->ops->coloringpatch     = 0;
4166     A->ops->multdiagonalblock = 0;
4167     A->ops->solve             = MatSolve_SeqAIJ_Inode;
4168     a->inode.node_count       = node_count;
4169     a->inode.size             = ns;
4170 
4171     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4172   }
4173   a->inode.checked = PETSC_TRUE;
4174   PetscFunctionReturn(0);
4175 }
4176 
4177 #undef __FUNCT__
4178 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4179 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4180 {
4181   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4182 
4183   PetscFunctionBegin;
4184   a->inode.ibdiagvalid = PETSC_FALSE;
4185   PetscFunctionReturn(0);
4186 }
4187 
4188 /*
4189      This is really ugly. if inodes are used this replaces the
4190   permutations with ones that correspond to rows/cols of the matrix
4191   rather then inode blocks
4192 */
4193 #undef __FUNCT__
4194 #define __FUNCT__ "MatInodeAdjustForInodes"
4195 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4196 {
4197   PetscErrorCode ierr;
4198 
4199   PetscFunctionBegin;
4200   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4201   PetscFunctionReturn(0);
4202 }
4203 
4204 #undef __FUNCT__
4205 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4206 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4207 {
4208   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4209   PetscErrorCode ierr;
4210   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4211   const PetscInt *ridx,*cidx;
4212   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4213   PetscInt       nslim_col,*ns_col;
4214   IS             ris = *rperm,cis = *cperm;
4215 
4216   PetscFunctionBegin;
4217   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4218   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4219 
4220   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4221   ierr = PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
4222   ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
4223 
4224   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4225   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4226 
4227   /* Form the inode structure for the rows of permuted matric using inv perm*/
4228   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4229 
4230   /* Construct the permutations for rows*/
4231   for (i=0,row = 0; i<nslim_row; ++i) {
4232     indx      = ridx[i];
4233     start_val = tns[indx];
4234     end_val   = tns[indx + 1];
4235     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4236   }
4237 
4238   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4239   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4240 
4241   /* Construct permutations for columns */
4242   for (i=0,col=0; i<nslim_col; ++i) {
4243     indx      = cidx[i];
4244     start_val = tns[indx];
4245     end_val   = tns[indx + 1];
4246     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4247   }
4248 
4249   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4250   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4251   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4252   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4253 
4254   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4255   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4256 
4257   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4258   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4259   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4260   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4261   ierr = PetscFree(tns);CHKERRQ(ierr);
4262   PetscFunctionReturn(0);
4263 }
4264 
4265 #undef __FUNCT__
4266 #define __FUNCT__ "MatInodeGetInodeSizes"
4267 /*@C
4268    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4269 
4270    Not Collective
4271 
4272    Input Parameter:
4273 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4274 
4275    Output Parameter:
4276 +  node_count - no of inodes present in the matrix.
4277 .  sizes      - an array of size node_count,with sizes of each inode.
4278 -  limit      - the max size used to generate the inodes.
4279 
4280    Level: advanced
4281 
4282    Notes: This routine returns some internal storage information
4283    of the matrix, it is intended to be used by advanced users.
4284    It should be called after the matrix is assembled.
4285    The contents of the sizes[] array should not be changed.
4286    NULL may be passed for information not requested.
4287 
4288 .keywords: matrix, seqaij, get, inode
4289 
4290 .seealso: MatGetInfo()
4291 @*/
4292 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4293 {
4294   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4295 
4296   PetscFunctionBegin;
4297   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4298   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4299   if (f) {
4300     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4301   }
4302   PetscFunctionReturn(0);
4303 }
4304 
4305 #undef __FUNCT__
4306 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4307 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4308 {
4309   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4310 
4311   PetscFunctionBegin;
4312   if (node_count) *node_count = a->inode.node_count;
4313   if (sizes)      *sizes      = a->inode.size;
4314   if (limit)      *limit      = a->inode.limit;
4315   PetscFunctionReturn(0);
4316 }
4317