xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 7cee435b8e855d672dbf6acd28f68995f21e06bd)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
67 
68   PetscFunctionBegin;
69   nslim_row = a->inode.node_count;
70   m         = A->rmap->n;
71   n         = A->cmap->n;
72   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
73 
74   /* Use the row_inode as column_inode */
75   nslim_col = nslim_row;
76   ns_col    = ns_row;
77 
78   /* allocate space for reformated inode structure */
79   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
80   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
81   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
82 
83   for (i1=0,col=0; i1<nslim_col; ++i1) {
84     nsz = ns_col[i1];
85     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
86   }
87   /* allocate space for row pointers */
88   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
89   *iia = ia;
90   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
91   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
92 
93   /* determine the number of columns in each row */
94   ia[0] = oshift;
95   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
96 
97     j    = aj + ai[row] + ishift;
98     jmax = aj + ai[row+1] + ishift;
99     col  = *j++ + ishift;
100     i2   = tvc[col];
101     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
102       ia[i1+1]++;
103       ia[i2+1]++;
104       i2++;                     /* Start col of next node */
105       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
106       i2 = tvc[col];
107     }
108     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
109   }
110 
111   /* shift ia[i] to point to next row */
112   for (i1=1; i1<nslim_row+1; i1++) {
113     row        = ia[i1-1];
114     ia[i1]    += row;
115     work[i1-1] = row - oshift;
116   }
117 
118   /* allocate space for column pointers */
119   nz   = ia[nslim_row] + (!ishift);
120   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
121   *jja = ja;
122 
123   /* loop over lower triangular part putting into ja */
124   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
125     j    = aj + ai[row] + ishift;
126     jmax = aj + ai[row+1] + ishift;
127     col  = *j++ + ishift;
128     i2   = tvc[col];
129     while (i2<i1 && j<jmax) {
130       ja[work[i2]++] = i1 + oshift;
131       ja[work[i1]++] = i2 + oshift;
132       ++i2;
133       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
134       i2 = tvc[col];
135     }
136     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
137 
138   }
139   ierr = PetscFree(work);CHKERRQ(ierr);
140   ierr = PetscFree(tns);CHKERRQ(ierr);
141   ierr = PetscFree(tvc);CHKERRQ(ierr);
142   PetscFunctionReturn(0);
143 }
144 
145 /*
146       This builds nonsymmetric version of nonzero structure,
147 */
148 #undef __FUNCT__
149 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
150 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
151 {
152   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
153   PetscErrorCode ierr;
154   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
155   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
156 
157   PetscFunctionBegin;
158   nslim_row = a->inode.node_count;
159   n         = A->cmap->n;
160 
161   /* Create The column_inode for this matrix */
162   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
163 
164   /* allocate space for reformated column_inode structure */
165   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
166   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
167   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
168 
169   for (i1=0,col=0; i1<nslim_col; ++i1) {
170     nsz = ns_col[i1];
171     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
172   }
173   /* allocate space for row pointers */
174   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
175   *iia = ia;
176   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
177   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
178 
179   /* determine the number of columns in each row */
180   ia[0] = oshift;
181   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
182     j   = aj + ai[row] + ishift;
183     col = *j++ + ishift;
184     i2  = tvc[col];
185     nz  = ai[row+1] - ai[row];
186     while (nz-- > 0) {           /* off-diagonal elemets */
187       ia[i1+1]++;
188       i2++;                     /* Start col of next node */
189       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
190       if (nz > 0) i2 = tvc[col];
191     }
192   }
193 
194   /* shift ia[i] to point to next row */
195   for (i1=1; i1<nslim_row+1; i1++) {
196     row        = ia[i1-1];
197     ia[i1]    += row;
198     work[i1-1] = row - oshift;
199   }
200 
201   /* allocate space for column pointers */
202   nz   = ia[nslim_row] + (!ishift);
203   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
204   *jja = ja;
205 
206   /* loop over matrix putting into ja */
207   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
208     j   = aj + ai[row] + ishift;
209     col = *j++ + ishift;
210     i2  = tvc[col];
211     nz  = ai[row+1] - ai[row];
212     while (nz-- > 0) {
213       ja[work[i1]++] = i2 + oshift;
214       ++i2;
215       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
216       if (nz > 0) i2 = tvc[col];
217     }
218   }
219   ierr = PetscFree(ns_col);CHKERRQ(ierr);
220   ierr = PetscFree(work);CHKERRQ(ierr);
221   ierr = PetscFree(tns);CHKERRQ(ierr);
222   ierr = PetscFree(tvc);CHKERRQ(ierr);
223   PetscFunctionReturn(0);
224 }
225 
226 #undef __FUNCT__
227 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
228 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
229 {
230   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
231   PetscErrorCode ierr;
232 
233   PetscFunctionBegin;
234   *n = a->inode.node_count;
235   if (!ia) PetscFunctionReturn(0);
236   if (!blockcompressed) {
237     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
238   } else if (symmetric) {
239     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
240   } else {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   }
243   PetscFunctionReturn(0);
244 }
245 
246 #undef __FUNCT__
247 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
248 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
249 {
250   PetscErrorCode ierr;
251 
252   PetscFunctionBegin;
253   if (!ia) PetscFunctionReturn(0);
254 
255   if (!blockcompressed) {
256     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
257   } else {
258     ierr = PetscFree(*ia);CHKERRQ(ierr);
259     ierr = PetscFree(*ja);CHKERRQ(ierr);
260   }
261   PetscFunctionReturn(0);
262 }
263 
264 /* ----------------------------------------------------------- */
265 
266 #undef __FUNCT__
267 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
268 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
269 {
270   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
271   PetscErrorCode ierr;
272   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
273   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
274 
275   PetscFunctionBegin;
276   nslim_row = a->inode.node_count;
277   n         = A->cmap->n;
278 
279   /* Create The column_inode for this matrix */
280   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
281 
282   /* allocate space for reformated column_inode structure */
283   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
284   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
285   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
286 
287   for (i1=0,col=0; i1<nslim_col; ++i1) {
288     nsz = ns_col[i1];
289     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
290   }
291   /* allocate space for column pointers */
292   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
293   *iia = ia;
294   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
295   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
296 
297   /* determine the number of columns in each row */
298   ia[0] = oshift;
299   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
300     j   = aj + ai[row] + ishift;
301     col = *j++ + ishift;
302     i2  = tvc[col];
303     nz  = ai[row+1] - ai[row];
304     while (nz-- > 0) {           /* off-diagonal elemets */
305       /* ia[i1+1]++; */
306       ia[i2+1]++;
307       i2++;
308       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
309       if (nz > 0) i2 = tvc[col];
310     }
311   }
312 
313   /* shift ia[i] to point to next col */
314   for (i1=1; i1<nslim_col+1; i1++) {
315     col        = ia[i1-1];
316     ia[i1]    += col;
317     work[i1-1] = col - oshift;
318   }
319 
320   /* allocate space for column pointers */
321   nz   = ia[nslim_col] + (!ishift);
322   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
323   *jja = ja;
324 
325   /* loop over matrix putting into ja */
326   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
327     j   = aj + ai[row] + ishift;
328     col = *j++ + ishift;
329     i2  = tvc[col];
330     nz  = ai[row+1] - ai[row];
331     while (nz-- > 0) {
332       /* ja[work[i1]++] = i2 + oshift; */
333       ja[work[i2]++] = i1 + oshift;
334       i2++;
335       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
336       if (nz > 0) i2 = tvc[col];
337     }
338   }
339   ierr = PetscFree(ns_col);CHKERRQ(ierr);
340   ierr = PetscFree(work);CHKERRQ(ierr);
341   ierr = PetscFree(tns);CHKERRQ(ierr);
342   ierr = PetscFree(tvc);CHKERRQ(ierr);
343   PetscFunctionReturn(0);
344 }
345 
346 #undef __FUNCT__
347 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
348 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
349 {
350   PetscErrorCode ierr;
351 
352   PetscFunctionBegin;
353   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
354   if (!ia) PetscFunctionReturn(0);
355 
356   if (!blockcompressed) {
357     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
358   } else if (symmetric) {
359     /* Since the indices are symmetric it does'nt matter */
360     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
361   } else {
362     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   }
364   PetscFunctionReturn(0);
365 }
366 
367 #undef __FUNCT__
368 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
369 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
370 {
371   PetscErrorCode ierr;
372 
373   PetscFunctionBegin;
374   if (!ia) PetscFunctionReturn(0);
375   if (!blockcompressed) {
376     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
377   } else {
378     ierr = PetscFree(*ia);CHKERRQ(ierr);
379     ierr = PetscFree(*ja);CHKERRQ(ierr);
380   }
381   PetscFunctionReturn(0);
382 }
383 
384 /* ----------------------------------------------------------- */
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
388 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
389 {
390   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
391   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
392   PetscScalar       *y;
393   const PetscScalar *x;
394   const MatScalar   *v1,*v2,*v3,*v4,*v5;
395   PetscErrorCode    ierr;
396   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
397 
398 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
399 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
400 #endif
401 
402   PetscFunctionBegin;
403   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
404   node_max = a->inode.node_count;
405   ns       = a->inode.size;     /* Node Size array */
406   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
407   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
408   idx      = a->j;
409   v1       = a->a;
410   ii       = a->i;
411 
412   for (i = 0,row = 0; i< node_max; ++i) {
413     nsz         = ns[i];
414     n           = ii[1] - ii[0];
415     nonzerorow += (n>0)*nsz;
416     ii         += nsz;
417     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
418     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
419     sz = n;                     /* No of non zeros in this row */
420                                 /* Switch on the size of Node */
421     switch (nsz) {               /* Each loop in 'case' is unrolled */
422     case 1:
423       sum1 = 0.;
424 
425       for (n = 0; n< sz-1; n+=2) {
426         i1    = idx[0];         /* The instructions are ordered to */
427         i2    = idx[1];         /* make the compiler's job easy */
428         idx  += 2;
429         tmp0  = x[i1];
430         tmp1  = x[i2];
431         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
432       }
433 
434       if (n == sz-1) {          /* Take care of the last nonzero  */
435         tmp0  = x[*idx++];
436         sum1 += *v1++ *tmp0;
437       }
438       y[row++]=sum1;
439       break;
440     case 2:
441       sum1 = 0.;
442       sum2 = 0.;
443       v2   = v1 + n;
444 
445       for (n = 0; n< sz-1; n+=2) {
446         i1    = idx[0];
447         i2    = idx[1];
448         idx  += 2;
449         tmp0  = x[i1];
450         tmp1  = x[i2];
451         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
452         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
453       }
454       if (n == sz-1) {
455         tmp0  = x[*idx++];
456         sum1 += *v1++ * tmp0;
457         sum2 += *v2++ * tmp0;
458       }
459       y[row++]=sum1;
460       y[row++]=sum2;
461       v1      =v2;              /* Since the next block to be processed starts there*/
462       idx    +=sz;
463       break;
464     case 3:
465       sum1 = 0.;
466       sum2 = 0.;
467       sum3 = 0.;
468       v2   = v1 + n;
469       v3   = v2 + n;
470 
471       for (n = 0; n< sz-1; n+=2) {
472         i1    = idx[0];
473         i2    = idx[1];
474         idx  += 2;
475         tmp0  = x[i1];
476         tmp1  = x[i2];
477         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
478         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
479         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
480       }
481       if (n == sz-1) {
482         tmp0  = x[*idx++];
483         sum1 += *v1++ * tmp0;
484         sum2 += *v2++ * tmp0;
485         sum3 += *v3++ * tmp0;
486       }
487       y[row++]=sum1;
488       y[row++]=sum2;
489       y[row++]=sum3;
490       v1      =v3;              /* Since the next block to be processed starts there*/
491       idx    +=2*sz;
492       break;
493     case 4:
494       sum1 = 0.;
495       sum2 = 0.;
496       sum3 = 0.;
497       sum4 = 0.;
498       v2   = v1 + n;
499       v3   = v2 + n;
500       v4   = v3 + n;
501 
502       for (n = 0; n< sz-1; n+=2) {
503         i1    = idx[0];
504         i2    = idx[1];
505         idx  += 2;
506         tmp0  = x[i1];
507         tmp1  = x[i2];
508         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
509         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
510         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
511         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
512       }
513       if (n == sz-1) {
514         tmp0  = x[*idx++];
515         sum1 += *v1++ * tmp0;
516         sum2 += *v2++ * tmp0;
517         sum3 += *v3++ * tmp0;
518         sum4 += *v4++ * tmp0;
519       }
520       y[row++]=sum1;
521       y[row++]=sum2;
522       y[row++]=sum3;
523       y[row++]=sum4;
524       v1      =v4;              /* Since the next block to be processed starts there*/
525       idx    +=3*sz;
526       break;
527     case 5:
528       sum1 = 0.;
529       sum2 = 0.;
530       sum3 = 0.;
531       sum4 = 0.;
532       sum5 = 0.;
533       v2   = v1 + n;
534       v3   = v2 + n;
535       v4   = v3 + n;
536       v5   = v4 + n;
537 
538       for (n = 0; n<sz-1; n+=2) {
539         i1    = idx[0];
540         i2    = idx[1];
541         idx  += 2;
542         tmp0  = x[i1];
543         tmp1  = x[i2];
544         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
545         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
546         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
547         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
548         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
549       }
550       if (n == sz-1) {
551         tmp0  = x[*idx++];
552         sum1 += *v1++ * tmp0;
553         sum2 += *v2++ * tmp0;
554         sum3 += *v3++ * tmp0;
555         sum4 += *v4++ * tmp0;
556         sum5 += *v5++ * tmp0;
557       }
558       y[row++]=sum1;
559       y[row++]=sum2;
560       y[row++]=sum3;
561       y[row++]=sum4;
562       y[row++]=sum5;
563       v1      =v5;       /* Since the next block to be processed starts there */
564       idx    +=4*sz;
565       break;
566     default:
567       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
568     }
569   }
570   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
571   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
572   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
573   PetscFunctionReturn(0);
574 }
575 /* ----------------------------------------------------------- */
576 /* Almost same code as the MatMult_SeqAIJ_Inode() */
577 #undef __FUNCT__
578 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
579 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
580 {
581   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
582   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
583   MatScalar      *v1,*v2,*v3,*v4,*v5;
584   PetscScalar    *x,*y,*z,*zt;
585   PetscErrorCode ierr;
586   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
587 
588   PetscFunctionBegin;
589   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
590   node_max = a->inode.node_count;
591   ns       = a->inode.size;     /* Node Size array */
592 
593   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
594   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
595   if (zz != yy) {
596     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
597   } else {
598     z = y;
599   }
600   zt = z;
601 
602   idx = a->j;
603   v1  = a->a;
604   ii  = a->i;
605 
606   for (i = 0,row = 0; i< node_max; ++i) {
607     nsz = ns[i];
608     n   = ii[1] - ii[0];
609     ii += nsz;
610     sz  = n;                    /* No of non zeros in this row */
611                                 /* Switch on the size of Node */
612     switch (nsz) {               /* Each loop in 'case' is unrolled */
613     case 1:
614       sum1 = *zt++;
615 
616       for (n = 0; n< sz-1; n+=2) {
617         i1    = idx[0];         /* The instructions are ordered to */
618         i2    = idx[1];         /* make the compiler's job easy */
619         idx  += 2;
620         tmp0  = x[i1];
621         tmp1  = x[i2];
622         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623       }
624 
625       if (n   == sz-1) {          /* Take care of the last nonzero  */
626         tmp0  = x[*idx++];
627         sum1 += *v1++ * tmp0;
628       }
629       y[row++]=sum1;
630       break;
631     case 2:
632       sum1 = *zt++;
633       sum2 = *zt++;
634       v2   = v1 + n;
635 
636       for (n = 0; n< sz-1; n+=2) {
637         i1    = idx[0];
638         i2    = idx[1];
639         idx  += 2;
640         tmp0  = x[i1];
641         tmp1  = x[i2];
642         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644       }
645       if (n   == sz-1) {
646         tmp0  = x[*idx++];
647         sum1 += *v1++ * tmp0;
648         sum2 += *v2++ * tmp0;
649       }
650       y[row++]=sum1;
651       y[row++]=sum2;
652       v1      =v2;              /* Since the next block to be processed starts there*/
653       idx    +=sz;
654       break;
655     case 3:
656       sum1 = *zt++;
657       sum2 = *zt++;
658       sum3 = *zt++;
659       v2   = v1 + n;
660       v3   = v2 + n;
661 
662       for (n = 0; n< sz-1; n+=2) {
663         i1    = idx[0];
664         i2    = idx[1];
665         idx  += 2;
666         tmp0  = x[i1];
667         tmp1  = x[i2];
668         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671       }
672       if (n == sz-1) {
673         tmp0  = x[*idx++];
674         sum1 += *v1++ * tmp0;
675         sum2 += *v2++ * tmp0;
676         sum3 += *v3++ * tmp0;
677       }
678       y[row++]=sum1;
679       y[row++]=sum2;
680       y[row++]=sum3;
681       v1      =v3;              /* Since the next block to be processed starts there*/
682       idx    +=2*sz;
683       break;
684     case 4:
685       sum1 = *zt++;
686       sum2 = *zt++;
687       sum3 = *zt++;
688       sum4 = *zt++;
689       v2   = v1 + n;
690       v3   = v2 + n;
691       v4   = v3 + n;
692 
693       for (n = 0; n< sz-1; n+=2) {
694         i1    = idx[0];
695         i2    = idx[1];
696         idx  += 2;
697         tmp0  = x[i1];
698         tmp1  = x[i2];
699         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703       }
704       if (n == sz-1) {
705         tmp0  = x[*idx++];
706         sum1 += *v1++ * tmp0;
707         sum2 += *v2++ * tmp0;
708         sum3 += *v3++ * tmp0;
709         sum4 += *v4++ * tmp0;
710       }
711       y[row++]=sum1;
712       y[row++]=sum2;
713       y[row++]=sum3;
714       y[row++]=sum4;
715       v1      =v4;              /* Since the next block to be processed starts there*/
716       idx    +=3*sz;
717       break;
718     case 5:
719       sum1 = *zt++;
720       sum2 = *zt++;
721       sum3 = *zt++;
722       sum4 = *zt++;
723       sum5 = *zt++;
724       v2   = v1 + n;
725       v3   = v2 + n;
726       v4   = v3 + n;
727       v5   = v4 + n;
728 
729       for (n = 0; n<sz-1; n+=2) {
730         i1    = idx[0];
731         i2    = idx[1];
732         idx  += 2;
733         tmp0  = x[i1];
734         tmp1  = x[i2];
735         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740       }
741       if (n   == sz-1) {
742         tmp0  = x[*idx++];
743         sum1 += *v1++ * tmp0;
744         sum2 += *v2++ * tmp0;
745         sum3 += *v3++ * tmp0;
746         sum4 += *v4++ * tmp0;
747         sum5 += *v5++ * tmp0;
748       }
749       y[row++]=sum1;
750       y[row++]=sum2;
751       y[row++]=sum3;
752       y[row++]=sum4;
753       y[row++]=sum5;
754       v1      =v5;       /* Since the next block to be processed starts there */
755       idx    +=4*sz;
756       break;
757     default:
758       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759     }
760   }
761   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
762   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
763   if (zz != yy) {
764     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
765   }
766   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
767   PetscFunctionReturn(0);
768 }
769 
770 /* ----------------------------------------------------------- */
771 #undef __FUNCT__
772 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
773 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
774 {
775   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
776   IS                iscol = a->col,isrow = a->row;
777   PetscErrorCode    ierr;
778   const PetscInt    *r,*c,*rout,*cout;
779   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
780   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
781   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
782   PetscScalar       sum1,sum2,sum3,sum4,sum5;
783   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
784   const PetscScalar *b;
785 
786   PetscFunctionBegin;
787   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
788   node_max = a->inode.node_count;
789   ns       = a->inode.size;     /* Node Size array */
790 
791   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
792   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
793   tmp  = a->solve_work;
794 
795   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
796   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
797 
798   /* forward solve the lower triangular */
799   tmps = tmp;
800   aa   = a_a;
801   aj   = a_j;
802   ad   = a->diag;
803 
804   for (i = 0,row = 0; i< node_max; ++i) {
805     nsz = ns[i];
806     aii = ai[row];
807     v1  = aa + aii;
808     vi  = aj + aii;
809     nz  = ad[row]- aii;
810     if (i < node_max-1) {
811       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
812       * but our indexing to determine it's size could. */
813       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
814       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
815       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
816       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
817     }
818 
819     switch (nsz) {               /* Each loop in 'case' is unrolled */
820     case 1:
821       sum1 = b[*r++];
822       for (j=0; j<nz-1; j+=2) {
823         i0    = vi[0];
824         i1    = vi[1];
825         vi   +=2;
826         tmp0  = tmps[i0];
827         tmp1  = tmps[i1];
828         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
829       }
830       if (j == nz-1) {
831         tmp0  = tmps[*vi++];
832         sum1 -= *v1++ *tmp0;
833       }
834       tmp[row++]=sum1;
835       break;
836     case 2:
837       sum1 = b[*r++];
838       sum2 = b[*r++];
839       v2   = aa + ai[row+1];
840 
841       for (j=0; j<nz-1; j+=2) {
842         i0    = vi[0];
843         i1    = vi[1];
844         vi   +=2;
845         tmp0  = tmps[i0];
846         tmp1  = tmps[i1];
847         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
848         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
849       }
850       if (j == nz-1) {
851         tmp0  = tmps[*vi++];
852         sum1 -= *v1++ *tmp0;
853         sum2 -= *v2++ *tmp0;
854       }
855       sum2     -= *v2++ *sum1;
856       tmp[row++]=sum1;
857       tmp[row++]=sum2;
858       break;
859     case 3:
860       sum1 = b[*r++];
861       sum2 = b[*r++];
862       sum3 = b[*r++];
863       v2   = aa + ai[row+1];
864       v3   = aa + ai[row+2];
865 
866       for (j=0; j<nz-1; j+=2) {
867         i0    = vi[0];
868         i1    = vi[1];
869         vi   +=2;
870         tmp0  = tmps[i0];
871         tmp1  = tmps[i1];
872         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
873         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
874         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
875       }
876       if (j == nz-1) {
877         tmp0  = tmps[*vi++];
878         sum1 -= *v1++ *tmp0;
879         sum2 -= *v2++ *tmp0;
880         sum3 -= *v3++ *tmp0;
881       }
882       sum2 -= *v2++ * sum1;
883       sum3 -= *v3++ * sum1;
884       sum3 -= *v3++ * sum2;
885 
886       tmp[row++]=sum1;
887       tmp[row++]=sum2;
888       tmp[row++]=sum3;
889       break;
890 
891     case 4:
892       sum1 = b[*r++];
893       sum2 = b[*r++];
894       sum3 = b[*r++];
895       sum4 = b[*r++];
896       v2   = aa + ai[row+1];
897       v3   = aa + ai[row+2];
898       v4   = aa + ai[row+3];
899 
900       for (j=0; j<nz-1; j+=2) {
901         i0    = vi[0];
902         i1    = vi[1];
903         vi   +=2;
904         tmp0  = tmps[i0];
905         tmp1  = tmps[i1];
906         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
907         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
908         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
909         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
910       }
911       if (j == nz-1) {
912         tmp0  = tmps[*vi++];
913         sum1 -= *v1++ *tmp0;
914         sum2 -= *v2++ *tmp0;
915         sum3 -= *v3++ *tmp0;
916         sum4 -= *v4++ *tmp0;
917       }
918       sum2 -= *v2++ * sum1;
919       sum3 -= *v3++ * sum1;
920       sum4 -= *v4++ * sum1;
921       sum3 -= *v3++ * sum2;
922       sum4 -= *v4++ * sum2;
923       sum4 -= *v4++ * sum3;
924 
925       tmp[row++]=sum1;
926       tmp[row++]=sum2;
927       tmp[row++]=sum3;
928       tmp[row++]=sum4;
929       break;
930     case 5:
931       sum1 = b[*r++];
932       sum2 = b[*r++];
933       sum3 = b[*r++];
934       sum4 = b[*r++];
935       sum5 = b[*r++];
936       v2   = aa + ai[row+1];
937       v3   = aa + ai[row+2];
938       v4   = aa + ai[row+3];
939       v5   = aa + ai[row+4];
940 
941       for (j=0; j<nz-1; j+=2) {
942         i0    = vi[0];
943         i1    = vi[1];
944         vi   +=2;
945         tmp0  = tmps[i0];
946         tmp1  = tmps[i1];
947         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
948         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
949         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
950         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
951         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
952       }
953       if (j == nz-1) {
954         tmp0  = tmps[*vi++];
955         sum1 -= *v1++ *tmp0;
956         sum2 -= *v2++ *tmp0;
957         sum3 -= *v3++ *tmp0;
958         sum4 -= *v4++ *tmp0;
959         sum5 -= *v5++ *tmp0;
960       }
961 
962       sum2 -= *v2++ * sum1;
963       sum3 -= *v3++ * sum1;
964       sum4 -= *v4++ * sum1;
965       sum5 -= *v5++ * sum1;
966       sum3 -= *v3++ * sum2;
967       sum4 -= *v4++ * sum2;
968       sum5 -= *v5++ * sum2;
969       sum4 -= *v4++ * sum3;
970       sum5 -= *v5++ * sum3;
971       sum5 -= *v5++ * sum4;
972 
973       tmp[row++]=sum1;
974       tmp[row++]=sum2;
975       tmp[row++]=sum3;
976       tmp[row++]=sum4;
977       tmp[row++]=sum5;
978       break;
979     default:
980       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
981     }
982   }
983   /* backward solve the upper triangular */
984   for (i=node_max -1,row = n-1; i>=0; i--) {
985     nsz = ns[i];
986     aii = ai[row+1] -1;
987     v1  = aa + aii;
988     vi  = aj + aii;
989     nz  = aii- ad[row];
990     switch (nsz) {               /* Each loop in 'case' is unrolled */
991     case 1:
992       sum1 = tmp[row];
993 
994       for (j=nz; j>1; j-=2) {
995         vi   -=2;
996         i0    = vi[2];
997         i1    = vi[1];
998         tmp0  = tmps[i0];
999         tmp1  = tmps[i1];
1000         v1   -= 2;
1001         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1002       }
1003       if (j==1) {
1004         tmp0  = tmps[*vi--];
1005         sum1 -= *v1-- * tmp0;
1006       }
1007       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1008       break;
1009     case 2:
1010       sum1 = tmp[row];
1011       sum2 = tmp[row -1];
1012       v2   = aa + ai[row]-1;
1013       for (j=nz; j>1; j-=2) {
1014         vi   -=2;
1015         i0    = vi[2];
1016         i1    = vi[1];
1017         tmp0  = tmps[i0];
1018         tmp1  = tmps[i1];
1019         v1   -= 2;
1020         v2   -= 2;
1021         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1022         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1023       }
1024       if (j==1) {
1025         tmp0  = tmps[*vi--];
1026         sum1 -= *v1-- * tmp0;
1027         sum2 -= *v2-- * tmp0;
1028       }
1029 
1030       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1031       sum2   -= *v2-- * tmp0;
1032       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1033       break;
1034     case 3:
1035       sum1 = tmp[row];
1036       sum2 = tmp[row -1];
1037       sum3 = tmp[row -2];
1038       v2   = aa + ai[row]-1;
1039       v3   = aa + ai[row -1]-1;
1040       for (j=nz; j>1; j-=2) {
1041         vi   -=2;
1042         i0    = vi[2];
1043         i1    = vi[1];
1044         tmp0  = tmps[i0];
1045         tmp1  = tmps[i1];
1046         v1   -= 2;
1047         v2   -= 2;
1048         v3   -= 2;
1049         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1050         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1051         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1052       }
1053       if (j==1) {
1054         tmp0  = tmps[*vi--];
1055         sum1 -= *v1-- * tmp0;
1056         sum2 -= *v2-- * tmp0;
1057         sum3 -= *v3-- * tmp0;
1058       }
1059       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1060       sum2   -= *v2-- * tmp0;
1061       sum3   -= *v3-- * tmp0;
1062       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1063       sum3   -= *v3-- * tmp0;
1064       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1065 
1066       break;
1067     case 4:
1068       sum1 = tmp[row];
1069       sum2 = tmp[row -1];
1070       sum3 = tmp[row -2];
1071       sum4 = tmp[row -3];
1072       v2   = aa + ai[row]-1;
1073       v3   = aa + ai[row -1]-1;
1074       v4   = aa + ai[row -2]-1;
1075 
1076       for (j=nz; j>1; j-=2) {
1077         vi   -=2;
1078         i0    = vi[2];
1079         i1    = vi[1];
1080         tmp0  = tmps[i0];
1081         tmp1  = tmps[i1];
1082         v1   -= 2;
1083         v2   -= 2;
1084         v3   -= 2;
1085         v4   -= 2;
1086         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1087         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1088         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1089         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1090       }
1091       if (j==1) {
1092         tmp0  = tmps[*vi--];
1093         sum1 -= *v1-- * tmp0;
1094         sum2 -= *v2-- * tmp0;
1095         sum3 -= *v3-- * tmp0;
1096         sum4 -= *v4-- * tmp0;
1097       }
1098 
1099       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1100       sum2   -= *v2-- * tmp0;
1101       sum3   -= *v3-- * tmp0;
1102       sum4   -= *v4-- * tmp0;
1103       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1104       sum3   -= *v3-- * tmp0;
1105       sum4   -= *v4-- * tmp0;
1106       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1107       sum4   -= *v4-- * tmp0;
1108       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1109       break;
1110     case 5:
1111       sum1 = tmp[row];
1112       sum2 = tmp[row -1];
1113       sum3 = tmp[row -2];
1114       sum4 = tmp[row -3];
1115       sum5 = tmp[row -4];
1116       v2   = aa + ai[row]-1;
1117       v3   = aa + ai[row -1]-1;
1118       v4   = aa + ai[row -2]-1;
1119       v5   = aa + ai[row -3]-1;
1120       for (j=nz; j>1; j-=2) {
1121         vi   -= 2;
1122         i0    = vi[2];
1123         i1    = vi[1];
1124         tmp0  = tmps[i0];
1125         tmp1  = tmps[i1];
1126         v1   -= 2;
1127         v2   -= 2;
1128         v3   -= 2;
1129         v4   -= 2;
1130         v5   -= 2;
1131         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1132         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1133         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1134         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1135         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1136       }
1137       if (j==1) {
1138         tmp0  = tmps[*vi--];
1139         sum1 -= *v1-- * tmp0;
1140         sum2 -= *v2-- * tmp0;
1141         sum3 -= *v3-- * tmp0;
1142         sum4 -= *v4-- * tmp0;
1143         sum5 -= *v5-- * tmp0;
1144       }
1145 
1146       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1147       sum2   -= *v2-- * tmp0;
1148       sum3   -= *v3-- * tmp0;
1149       sum4   -= *v4-- * tmp0;
1150       sum5   -= *v5-- * tmp0;
1151       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1152       sum3   -= *v3-- * tmp0;
1153       sum4   -= *v4-- * tmp0;
1154       sum5   -= *v5-- * tmp0;
1155       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1159       sum5   -= *v5-- * tmp0;
1160       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1161       break;
1162     default:
1163       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1164     }
1165   }
1166   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1167   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1168   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1171   PetscFunctionReturn(0);
1172 }
1173 
1174 #undef __FUNCT__
1175 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1176 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1177 {
1178   Mat             C     =B;
1179   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1180   IS              isrow = b->row,isicol = b->icol;
1181   PetscErrorCode  ierr;
1182   const PetscInt  *r,*ic,*ics;
1183   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1184   PetscInt        i,j,k,nz,nzL,row,*pj;
1185   const PetscInt  *ajtmp,*bjtmp;
1186   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1187   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1188   FactorShiftCtx  sctx;
1189   const PetscInt  *ddiag;
1190   PetscReal       rs;
1191   MatScalar       d;
1192   PetscInt        inod,nodesz,node_max,col;
1193   const PetscInt  *ns;
1194   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1195 
1196   PetscFunctionBegin;
1197   /* MatPivotSetUp(): initialize shift context sctx */
1198   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1199 
1200   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1201     ddiag          = a->diag;
1202     sctx.shift_top = info->zeropivot;
1203     for (i=0; i<n; i++) {
1204       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1205       d  = (aa)[ddiag[i]];
1206       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1207       v  = aa+ai[i];
1208       nz = ai[i+1] - ai[i];
1209       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1210       if (rs>sctx.shift_top) sctx.shift_top = rs;
1211     }
1212     sctx.shift_top *= 1.1;
1213     sctx.nshift_max = 5;
1214     sctx.shift_lo   = 0.;
1215     sctx.shift_hi   = 1.;
1216   }
1217 
1218   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1219   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1220 
1221   ierr  = PetscMalloc4(n,PetscScalar,&rtmp1,n,PetscScalar,&rtmp2,n,PetscScalar,&rtmp3,n,PetscScalar,&rtmp4);CHKERRQ(ierr);
1222   ierr  = PetscMemzero(rtmp1,n*sizeof(PetscScalar));CHKERRQ(ierr);
1223   ierr  = PetscMemzero(rtmp2,n*sizeof(PetscScalar));CHKERRQ(ierr);
1224   ierr  = PetscMemzero(rtmp3,n*sizeof(PetscScalar));CHKERRQ(ierr);
1225   ierr  = PetscMemzero(rtmp4,n*sizeof(PetscScalar));CHKERRQ(ierr);
1226   ics   = ic;
1227 
1228   node_max = a->inode.node_count;
1229   ns       = a->inode.size;
1230   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1231 
1232   /* If max inode size > 4, split it into two inodes.*/
1233   /* also map the inode sizes according to the ordering */
1234   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1235   for (i=0,j=0; i<node_max; ++i,++j) {
1236     if (ns[i] > 4) {
1237       tmp_vec1[j] = 4;
1238       ++j;
1239       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1240     } else {
1241       tmp_vec1[j] = ns[i];
1242     }
1243   }
1244   /* Use the correct node_max */
1245   node_max = j;
1246 
1247   /* Now reorder the inode info based on mat re-ordering info */
1248   /* First create a row -> inode_size_array_index map */
1249   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1250   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1251   for (i=0,row=0; i<node_max; i++) {
1252     nodesz = tmp_vec1[i];
1253     for (j=0; j<nodesz; j++,row++) {
1254       nsmap[row] = i;
1255     }
1256   }
1257   /* Using nsmap, create a reordered ns structure */
1258   for (i=0,j=0; i< node_max; i++) {
1259     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1260     tmp_vec2[i] = nodesz;
1261     j          += nodesz;
1262   }
1263   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1264   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1265 
1266   /* Now use the correct ns */
1267   ns = tmp_vec2;
1268 
1269   do {
1270     sctx.newshift = PETSC_FALSE;
1271     /* Now loop over each block-row, and do the factorization */
1272     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1273       nodesz = ns[inod];
1274 
1275       switch (nodesz) {
1276       case 1:
1277         /*----------*/
1278         /* zero rtmp1 */
1279         /* L part */
1280         nz    = bi[i+1] - bi[i];
1281         bjtmp = bj + bi[i];
1282         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1283 
1284         /* U part */
1285         nz    = bdiag[i]-bdiag[i+1];
1286         bjtmp = bj + bdiag[i+1]+1;
1287         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1288 
1289         /* load in initial (unfactored row) */
1290         nz    = ai[r[i]+1] - ai[r[i]];
1291         ajtmp = aj + ai[r[i]];
1292         v     = aa + ai[r[i]];
1293         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1294 
1295         /* ZeropivotApply() */
1296         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1297 
1298         /* elimination */
1299         bjtmp = bj + bi[i];
1300         row   = *bjtmp++;
1301         nzL   = bi[i+1] - bi[i];
1302         for (k=0; k < nzL; k++) {
1303           pc = rtmp1 + row;
1304           if (*pc != 0.0) {
1305             pv   = b->a + bdiag[row];
1306             mul1 = *pc * (*pv);
1307             *pc  = mul1;
1308             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1309             pv   = b->a + bdiag[row+1]+1;
1310             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1311             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1312             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1313           }
1314           row = *bjtmp++;
1315         }
1316 
1317         /* finished row so stick it into b->a */
1318         rs = 0.0;
1319         /* L part */
1320         pv = b->a + bi[i];
1321         pj = b->j + bi[i];
1322         nz = bi[i+1] - bi[i];
1323         for (j=0; j<nz; j++) {
1324           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1325         }
1326 
1327         /* U part */
1328         pv = b->a + bdiag[i+1]+1;
1329         pj = b->j + bdiag[i+1]+1;
1330         nz = bdiag[i] - bdiag[i+1]-1;
1331         for (j=0; j<nz; j++) {
1332           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1333         }
1334 
1335         /* Check zero pivot */
1336         sctx.rs = rs;
1337         sctx.pv = rtmp1[i];
1338         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1339         if (sctx.newshift) break;
1340 
1341         /* Mark diagonal and invert diagonal for simplier triangular solves */
1342         pv  = b->a + bdiag[i];
1343         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1344         break;
1345 
1346       case 2:
1347         /*----------*/
1348         /* zero rtmp1 and rtmp2 */
1349         /* L part */
1350         nz    = bi[i+1] - bi[i];
1351         bjtmp = bj + bi[i];
1352         for  (j=0; j<nz; j++) {
1353           col        = bjtmp[j];
1354           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1355         }
1356 
1357         /* U part */
1358         nz    = bdiag[i]-bdiag[i+1];
1359         bjtmp = bj + bdiag[i+1]+1;
1360         for  (j=0; j<nz; j++) {
1361           col        = bjtmp[j];
1362           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363         }
1364 
1365         /* load in initial (unfactored row) */
1366         nz    = ai[r[i]+1] - ai[r[i]];
1367         ajtmp = aj + ai[r[i]];
1368         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1369         for (j=0; j<nz; j++) {
1370           col        = ics[ajtmp[j]];
1371           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1372         }
1373         /* ZeropivotApply(): shift the diagonal of the matrix  */
1374         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1375 
1376         /* elimination */
1377         bjtmp = bj + bi[i];
1378         row   = *bjtmp++; /* pivot row */
1379         nzL   = bi[i+1] - bi[i];
1380         for (k=0; k < nzL; k++) {
1381           pc1 = rtmp1 + row;
1382           pc2 = rtmp2 + row;
1383           if (*pc1 != 0.0 || *pc2 != 0.0) {
1384             pv   = b->a + bdiag[row];
1385             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1386             *pc1 = mul1;       *pc2 = mul2;
1387 
1388             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1389             pv = b->a + bdiag[row+1]+1;
1390             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1391             for (j=0; j<nz; j++) {
1392               col         = pj[j];
1393               rtmp1[col] -= mul1 * pv[j];
1394               rtmp2[col] -= mul2 * pv[j];
1395             }
1396             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1397           }
1398           row = *bjtmp++;
1399         }
1400 
1401         /* finished row i; check zero pivot, then stick row i into b->a */
1402         rs = 0.0;
1403         /* L part */
1404         pc1 = b->a + bi[i];
1405         pj  = b->j + bi[i];
1406         nz  = bi[i+1] - bi[i];
1407         for (j=0; j<nz; j++) {
1408           col    = pj[j];
1409           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1410         }
1411         /* U part */
1412         pc1 = b->a + bdiag[i+1]+1;
1413         pj  = b->j + bdiag[i+1]+1;
1414         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1415         for (j=0; j<nz; j++) {
1416           col    = pj[j];
1417           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418         }
1419 
1420         sctx.rs = rs;
1421         sctx.pv = rtmp1[i];
1422         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1423         if (sctx.newshift) break;
1424         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1425         *pc1 = 1.0/sctx.pv;
1426 
1427         /* Now take care of diagonal 2x2 block. */
1428         pc2 = rtmp2 + i;
1429         if (*pc2 != 0.0) {
1430           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1431           *pc2 = mul1;          /* insert L entry */
1432           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1433           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1434           for (j=0; j<nz; j++) {
1435             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1436           }
1437           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1438         }
1439 
1440         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1441         rs = 0.0;
1442         /* L part */
1443         pc2 = b->a + bi[i+1];
1444         pj  = b->j + bi[i+1];
1445         nz  = bi[i+2] - bi[i+1];
1446         for (j=0; j<nz; j++) {
1447           col    = pj[j];
1448           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1449         }
1450         /* U part */
1451         pc2 = b->a + bdiag[i+2]+1;
1452         pj  = b->j + bdiag[i+2]+1;
1453         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1454         for (j=0; j<nz; j++) {
1455           col    = pj[j];
1456           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457         }
1458 
1459         sctx.rs = rs;
1460         sctx.pv = rtmp2[i+1];
1461         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1462         if (sctx.newshift) break;
1463         pc2  = b->a + bdiag[i+1];
1464         *pc2 = 1.0/sctx.pv;
1465         break;
1466 
1467       case 3:
1468         /*----------*/
1469         /* zero rtmp */
1470         /* L part */
1471         nz    = bi[i+1] - bi[i];
1472         bjtmp = bj + bi[i];
1473         for  (j=0; j<nz; j++) {
1474           col        = bjtmp[j];
1475           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1476         }
1477 
1478         /* U part */
1479         nz    = bdiag[i]-bdiag[i+1];
1480         bjtmp = bj + bdiag[i+1]+1;
1481         for  (j=0; j<nz; j++) {
1482           col        = bjtmp[j];
1483           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484         }
1485 
1486         /* load in initial (unfactored row) */
1487         nz    = ai[r[i]+1] - ai[r[i]];
1488         ajtmp = aj + ai[r[i]];
1489         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1490         for (j=0; j<nz; j++) {
1491           col        = ics[ajtmp[j]];
1492           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1493         }
1494         /* ZeropivotApply(): shift the diagonal of the matrix  */
1495         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1496 
1497         /* elimination */
1498         bjtmp = bj + bi[i];
1499         row   = *bjtmp++; /* pivot row */
1500         nzL   = bi[i+1] - bi[i];
1501         for (k=0; k < nzL; k++) {
1502           pc1 = rtmp1 + row;
1503           pc2 = rtmp2 + row;
1504           pc3 = rtmp3 + row;
1505           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1506             pv   = b->a + bdiag[row];
1507             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1508             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1509 
1510             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1511             pv = b->a + bdiag[row+1]+1;
1512             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1513             for (j=0; j<nz; j++) {
1514               col         = pj[j];
1515               rtmp1[col] -= mul1 * pv[j];
1516               rtmp2[col] -= mul2 * pv[j];
1517               rtmp3[col] -= mul3 * pv[j];
1518             }
1519             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1520           }
1521           row = *bjtmp++;
1522         }
1523 
1524         /* finished row i; check zero pivot, then stick row i into b->a */
1525         rs = 0.0;
1526         /* L part */
1527         pc1 = b->a + bi[i];
1528         pj  = b->j + bi[i];
1529         nz  = bi[i+1] - bi[i];
1530         for (j=0; j<nz; j++) {
1531           col    = pj[j];
1532           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1533         }
1534         /* U part */
1535         pc1 = b->a + bdiag[i+1]+1;
1536         pj  = b->j + bdiag[i+1]+1;
1537         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1538         for (j=0; j<nz; j++) {
1539           col    = pj[j];
1540           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541         }
1542 
1543         sctx.rs = rs;
1544         sctx.pv = rtmp1[i];
1545         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1546         if (sctx.newshift) break;
1547         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1548         *pc1 = 1.0/sctx.pv;
1549 
1550         /* Now take care of 1st column of diagonal 3x3 block. */
1551         pc2 = rtmp2 + i;
1552         pc3 = rtmp3 + i;
1553         if (*pc2 != 0.0 || *pc3 != 0.0) {
1554           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1555           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1556           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1557           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1558           for (j=0; j<nz; j++) {
1559             col         = pj[j];
1560             rtmp2[col] -= mul2 * rtmp1[col];
1561             rtmp3[col] -= mul3 * rtmp1[col];
1562           }
1563           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1564         }
1565 
1566         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1567         rs = 0.0;
1568         /* L part */
1569         pc2 = b->a + bi[i+1];
1570         pj  = b->j + bi[i+1];
1571         nz  = bi[i+2] - bi[i+1];
1572         for (j=0; j<nz; j++) {
1573           col    = pj[j];
1574           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1575         }
1576         /* U part */
1577         pc2 = b->a + bdiag[i+2]+1;
1578         pj  = b->j + bdiag[i+2]+1;
1579         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1580         for (j=0; j<nz; j++) {
1581           col    = pj[j];
1582           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1583         }
1584 
1585         sctx.rs = rs;
1586         sctx.pv = rtmp2[i+1];
1587         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1588         if (sctx.newshift) break;
1589         pc2  = b->a + bdiag[i+1];
1590         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1591 
1592         /* Now take care of 2nd column of diagonal 3x3 block. */
1593         pc3 = rtmp3 + i+1;
1594         if (*pc3 != 0.0) {
1595           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1596           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1597           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1598           for (j=0; j<nz; j++) {
1599             col         = pj[j];
1600             rtmp3[col] -= mul3 * rtmp2[col];
1601           }
1602           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1603         }
1604 
1605         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1606         rs = 0.0;
1607         /* L part */
1608         pc3 = b->a + bi[i+2];
1609         pj  = b->j + bi[i+2];
1610         nz  = bi[i+3] - bi[i+2];
1611         for (j=0; j<nz; j++) {
1612           col    = pj[j];
1613           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1614         }
1615         /* U part */
1616         pc3 = b->a + bdiag[i+3]+1;
1617         pj  = b->j + bdiag[i+3]+1;
1618         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1619         for (j=0; j<nz; j++) {
1620           col    = pj[j];
1621           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1622         }
1623 
1624         sctx.rs = rs;
1625         sctx.pv = rtmp3[i+2];
1626         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1627         if (sctx.newshift) break;
1628         pc3  = b->a + bdiag[i+2];
1629         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1630         break;
1631       case 4:
1632         /*----------*/
1633         /* zero rtmp */
1634         /* L part */
1635         nz    = bi[i+1] - bi[i];
1636         bjtmp = bj + bi[i];
1637         for  (j=0; j<nz; j++) {
1638           col        = bjtmp[j];
1639           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1640         }
1641 
1642         /* U part */
1643         nz    = bdiag[i]-bdiag[i+1];
1644         bjtmp = bj + bdiag[i+1]+1;
1645         for  (j=0; j<nz; j++) {
1646           col        = bjtmp[j];
1647           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1648         }
1649 
1650         /* load in initial (unfactored row) */
1651         nz    = ai[r[i]+1] - ai[r[i]];
1652         ajtmp = aj + ai[r[i]];
1653         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1654         for (j=0; j<nz; j++) {
1655           col        = ics[ajtmp[j]];
1656           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1657         }
1658         /* ZeropivotApply(): shift the diagonal of the matrix  */
1659         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1660 
1661         /* elimination */
1662         bjtmp = bj + bi[i];
1663         row   = *bjtmp++; /* pivot row */
1664         nzL   = bi[i+1] - bi[i];
1665         for (k=0; k < nzL; k++) {
1666           pc1 = rtmp1 + row;
1667           pc2 = rtmp2 + row;
1668           pc3 = rtmp3 + row;
1669           pc4 = rtmp4 + row;
1670           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1671             pv   = b->a + bdiag[row];
1672             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1673             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1674 
1675             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1676             pv = b->a + bdiag[row+1]+1;
1677             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1678             for (j=0; j<nz; j++) {
1679               col         = pj[j];
1680               rtmp1[col] -= mul1 * pv[j];
1681               rtmp2[col] -= mul2 * pv[j];
1682               rtmp3[col] -= mul3 * pv[j];
1683               rtmp4[col] -= mul4 * pv[j];
1684             }
1685             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1686           }
1687           row = *bjtmp++;
1688         }
1689 
1690         /* finished row i; check zero pivot, then stick row i into b->a */
1691         rs = 0.0;
1692         /* L part */
1693         pc1 = b->a + bi[i];
1694         pj  = b->j + bi[i];
1695         nz  = bi[i+1] - bi[i];
1696         for (j=0; j<nz; j++) {
1697           col    = pj[j];
1698           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1699         }
1700         /* U part */
1701         pc1 = b->a + bdiag[i+1]+1;
1702         pj  = b->j + bdiag[i+1]+1;
1703         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1704         for (j=0; j<nz; j++) {
1705           col    = pj[j];
1706           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1707         }
1708 
1709         sctx.rs = rs;
1710         sctx.pv = rtmp1[i];
1711         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1712         if (sctx.newshift) break;
1713         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1714         *pc1 = 1.0/sctx.pv;
1715 
1716         /* Now take care of 1st column of diagonal 4x4 block. */
1717         pc2 = rtmp2 + i;
1718         pc3 = rtmp3 + i;
1719         pc4 = rtmp4 + i;
1720         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1721           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1722           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1723           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1724           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1725           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1726           for (j=0; j<nz; j++) {
1727             col         = pj[j];
1728             rtmp2[col] -= mul2 * rtmp1[col];
1729             rtmp3[col] -= mul3 * rtmp1[col];
1730             rtmp4[col] -= mul4 * rtmp1[col];
1731           }
1732           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1733         }
1734 
1735         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1736         rs = 0.0;
1737         /* L part */
1738         pc2 = b->a + bi[i+1];
1739         pj  = b->j + bi[i+1];
1740         nz  = bi[i+2] - bi[i+1];
1741         for (j=0; j<nz; j++) {
1742           col    = pj[j];
1743           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1744         }
1745         /* U part */
1746         pc2 = b->a + bdiag[i+2]+1;
1747         pj  = b->j + bdiag[i+2]+1;
1748         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1749         for (j=0; j<nz; j++) {
1750           col    = pj[j];
1751           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1752         }
1753 
1754         sctx.rs = rs;
1755         sctx.pv = rtmp2[i+1];
1756         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1757         if (sctx.newshift) break;
1758         pc2  = b->a + bdiag[i+1];
1759         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1760 
1761         /* Now take care of 2nd column of diagonal 4x4 block. */
1762         pc3 = rtmp3 + i+1;
1763         pc4 = rtmp4 + i+1;
1764         if (*pc3 != 0.0 || *pc4 != 0.0) {
1765           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1766           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1767           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1768           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1769           for (j=0; j<nz; j++) {
1770             col         = pj[j];
1771             rtmp3[col] -= mul3 * rtmp2[col];
1772             rtmp4[col] -= mul4 * rtmp2[col];
1773           }
1774           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1775         }
1776 
1777         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1778         rs = 0.0;
1779         /* L part */
1780         pc3 = b->a + bi[i+2];
1781         pj  = b->j + bi[i+2];
1782         nz  = bi[i+3] - bi[i+2];
1783         for (j=0; j<nz; j++) {
1784           col    = pj[j];
1785           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1786         }
1787         /* U part */
1788         pc3 = b->a + bdiag[i+3]+1;
1789         pj  = b->j + bdiag[i+3]+1;
1790         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1791         for (j=0; j<nz; j++) {
1792           col    = pj[j];
1793           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1794         }
1795 
1796         sctx.rs = rs;
1797         sctx.pv = rtmp3[i+2];
1798         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1799         if (sctx.newshift) break;
1800         pc3  = b->a + bdiag[i+2];
1801         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1802 
1803         /* Now take care of 3rd column of diagonal 4x4 block. */
1804         pc4 = rtmp4 + i+2;
1805         if (*pc4 != 0.0) {
1806           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1807           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1808           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1809           for (j=0; j<nz; j++) {
1810             col         = pj[j];
1811             rtmp4[col] -= mul4 * rtmp3[col];
1812           }
1813           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1814         }
1815 
1816         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1817         rs = 0.0;
1818         /* L part */
1819         pc4 = b->a + bi[i+3];
1820         pj  = b->j + bi[i+3];
1821         nz  = bi[i+4] - bi[i+3];
1822         for (j=0; j<nz; j++) {
1823           col    = pj[j];
1824           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1825         }
1826         /* U part */
1827         pc4 = b->a + bdiag[i+4]+1;
1828         pj  = b->j + bdiag[i+4]+1;
1829         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1830         for (j=0; j<nz; j++) {
1831           col    = pj[j];
1832           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1833         }
1834 
1835         sctx.rs = rs;
1836         sctx.pv = rtmp4[i+3];
1837         ierr    = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1838         if (sctx.newshift) break;
1839         pc4  = b->a + bdiag[i+3];
1840         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1841         break;
1842 
1843       default:
1844         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1845       }
1846       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1847       i += nodesz;                 /* Update the row */
1848     }
1849 
1850     /* MatPivotRefine() */
1851     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1852       /*
1853        * if no shift in this attempt & shifting & started shifting & can refine,
1854        * then try lower shift
1855        */
1856       sctx.shift_hi       = sctx.shift_fraction;
1857       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1858       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1859       sctx.newshift       = PETSC_TRUE;
1860       sctx.nshift++;
1861     }
1862   } while (sctx.newshift);
1863 
1864   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1865   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1866   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1867   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1868 
1869   C->ops->solve             = MatSolve_SeqAIJ;
1870   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1871   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1872   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1873   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1874   C->assembled              = PETSC_TRUE;
1875   C->preallocated           = PETSC_TRUE;
1876 
1877   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1878 
1879   /* MatShiftView(A,info,&sctx) */
1880   if (sctx.nshift) {
1881     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1882       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1883     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1884       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1885     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1886       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1887     }
1888   }
1889   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1890   PetscFunctionReturn(0);
1891 }
1892 
1893 #undef __FUNCT__
1894 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1895 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1896 {
1897   Mat             C     = B;
1898   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1899   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1900   PetscErrorCode  ierr;
1901   const PetscInt  *r,*ic,*c,*ics;
1902   PetscInt        n   = A->rmap->n,*bi = b->i;
1903   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1904   PetscInt        i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1905   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1906   PetscScalar     mul1,mul2,mul3,tmp;
1907   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1908   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1909   PetscReal       rs=0.0;
1910   FactorShiftCtx  sctx;
1911 
1912   PetscFunctionBegin;
1913   sctx.shift_top      = 0;
1914   sctx.nshift_max     = 0;
1915   sctx.shift_lo       = 0;
1916   sctx.shift_hi       = 0;
1917   sctx.shift_fraction = 0;
1918 
1919   /* if both shift schemes are chosen by user, only use info->shiftpd */
1920   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1921     sctx.shift_top = 0;
1922     for (i=0; i<n; i++) {
1923       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1924       rs    = 0.0;
1925       ajtmp = aj + ai[i];
1926       rtmp1 = aa + ai[i];
1927       nz    = ai[i+1] - ai[i];
1928       for (j=0; j<nz; j++) {
1929         if (*ajtmp != i) {
1930           rs += PetscAbsScalar(*rtmp1++);
1931         } else {
1932           rs -= PetscRealPart(*rtmp1++);
1933         }
1934         ajtmp++;
1935       }
1936       if (rs>sctx.shift_top) sctx.shift_top = rs;
1937     }
1938     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1939     sctx.shift_top *= 1.1;
1940     sctx.nshift_max = 5;
1941     sctx.shift_lo   = 0.;
1942     sctx.shift_hi   = 1.;
1943   }
1944   sctx.shift_amount = 0;
1945   sctx.nshift       = 0;
1946 
1947   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1948   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1949   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1950   ierr   = PetscMalloc3(n,PetscScalar,&rtmp11,n,PetscScalar,&rtmp22,n,PetscScalar,&rtmp33);CHKERRQ(ierr);
1951   ierr   = PetscMemzero(rtmp11,n*sizeof(PetscScalar));CHKERRQ(ierr);
1952   ierr   = PetscMemzero(rtmp22,n*sizeof(PetscScalar));CHKERRQ(ierr);
1953   ierr   = PetscMemzero(rtmp33,n*sizeof(PetscScalar));CHKERRQ(ierr);
1954   ics    = ic;
1955 
1956   node_max = a->inode.node_count;
1957   ns       = a->inode.size;
1958   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1959 
1960   /* If max inode size > 3, split it into two inodes.*/
1961   /* also map the inode sizes according to the ordering */
1962   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1963   for (i=0,j=0; i<node_max; ++i,++j) {
1964     if (ns[i]>3) {
1965       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1966       ++j;
1967       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1968     } else {
1969       tmp_vec1[j] = ns[i];
1970     }
1971   }
1972   /* Use the correct node_max */
1973   node_max = j;
1974 
1975   /* Now reorder the inode info based on mat re-ordering info */
1976   /* First create a row -> inode_size_array_index map */
1977   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1978   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1979   for (i=0,row=0; i<node_max; i++) {
1980     nodesz = tmp_vec1[i];
1981     for (j=0; j<nodesz; j++,row++) {
1982       nsmap[row] = i;
1983     }
1984   }
1985   /* Using nsmap, create a reordered ns structure */
1986   for (i=0,j=0; i< node_max; i++) {
1987     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1988     tmp_vec2[i] = nodesz;
1989     j          += nodesz;
1990   }
1991   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1992   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1993   /* Now use the correct ns */
1994   ns = tmp_vec2;
1995 
1996   do {
1997     sctx.newshift = PETSC_FALSE;
1998     /* Now loop over each block-row, and do the factorization */
1999     for (i=0,row=0; i<node_max; i++) {
2000       nodesz = ns[i];
2001       nz     = bi[row+1] - bi[row];
2002       bjtmp  = bj + bi[row];
2003 
2004       switch (nodesz) {
2005       case 1:
2006         for  (j=0; j<nz; j++) {
2007           idx         = bjtmp[j];
2008           rtmp11[idx] = 0.0;
2009         }
2010 
2011         /* load in initial (unfactored row) */
2012         idx    = r[row];
2013         nz_tmp = ai[idx+1] - ai[idx];
2014         ajtmp  = aj + ai[idx];
2015         v1     = aa + ai[idx];
2016 
2017         for (j=0; j<nz_tmp; j++) {
2018           idx         = ics[ajtmp[j]];
2019           rtmp11[idx] = v1[j];
2020         }
2021         rtmp11[ics[r[row]]] += sctx.shift_amount;
2022 
2023         prow = *bjtmp++;
2024         while (prow < row) {
2025           pc1 = rtmp11 + prow;
2026           if (*pc1 != 0.0) {
2027             pv     = ba + bd[prow];
2028             pj     = nbj + bd[prow];
2029             mul1   = *pc1 * *pv++;
2030             *pc1   = mul1;
2031             nz_tmp = bi[prow+1] - bd[prow] - 1;
2032             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2033             for (j=0; j<nz_tmp; j++) {
2034               tmp          = pv[j];
2035               idx          = pj[j];
2036               rtmp11[idx] -= mul1 * tmp;
2037             }
2038           }
2039           prow = *bjtmp++;
2040         }
2041         pj  = bj + bi[row];
2042         pc1 = ba + bi[row];
2043 
2044         sctx.pv     = rtmp11[row];
2045         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2046         rs          = 0.0;
2047         for (j=0; j<nz; j++) {
2048           idx    = pj[j];
2049           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2050           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2051         }
2052         sctx.rs = rs;
2053         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2054         if (sctx.newshift) goto endofwhile;
2055         break;
2056 
2057       case 2:
2058         for (j=0; j<nz; j++) {
2059           idx         = bjtmp[j];
2060           rtmp11[idx] = 0.0;
2061           rtmp22[idx] = 0.0;
2062         }
2063 
2064         /* load in initial (unfactored row) */
2065         idx    = r[row];
2066         nz_tmp = ai[idx+1] - ai[idx];
2067         ajtmp  = aj + ai[idx];
2068         v1     = aa + ai[idx];
2069         v2     = aa + ai[idx+1];
2070         for (j=0; j<nz_tmp; j++) {
2071           idx         = ics[ajtmp[j]];
2072           rtmp11[idx] = v1[j];
2073           rtmp22[idx] = v2[j];
2074         }
2075         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2076         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2077 
2078         prow = *bjtmp++;
2079         while (prow < row) {
2080           pc1 = rtmp11 + prow;
2081           pc2 = rtmp22 + prow;
2082           if (*pc1 != 0.0 || *pc2 != 0.0) {
2083             pv   = ba + bd[prow];
2084             pj   = nbj + bd[prow];
2085             mul1 = *pc1 * *pv;
2086             mul2 = *pc2 * *pv;
2087             ++pv;
2088             *pc1 = mul1;
2089             *pc2 = mul2;
2090 
2091             nz_tmp = bi[prow+1] - bd[prow] - 1;
2092             for (j=0; j<nz_tmp; j++) {
2093               tmp          = pv[j];
2094               idx          = pj[j];
2095               rtmp11[idx] -= mul1 * tmp;
2096               rtmp22[idx] -= mul2 * tmp;
2097             }
2098             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2099           }
2100           prow = *bjtmp++;
2101         }
2102 
2103         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2104         pc1 = rtmp11 + prow;
2105         pc2 = rtmp22 + prow;
2106 
2107         sctx.pv = *pc1;
2108         pj      = bj + bi[prow];
2109         rs      = 0.0;
2110         for (j=0; j<nz; j++) {
2111           idx = pj[j];
2112           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2113         }
2114         sctx.rs = rs;
2115         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2116         if (sctx.newshift) goto endofwhile;
2117 
2118         if (*pc2 != 0.0) {
2119           pj     = nbj + bd[prow];
2120           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2121           *pc2   = mul2;
2122           nz_tmp = bi[prow+1] - bd[prow] - 1;
2123           for (j=0; j<nz_tmp; j++) {
2124             idx          = pj[j];
2125             tmp          = rtmp11[idx];
2126             rtmp22[idx] -= mul2 * tmp;
2127           }
2128           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2129         }
2130 
2131         pj  = bj + bi[row];
2132         pc1 = ba + bi[row];
2133         pc2 = ba + bi[row+1];
2134 
2135         sctx.pv       = rtmp22[row+1];
2136         rs            = 0.0;
2137         rtmp11[row]   = 1.0/rtmp11[row];
2138         rtmp22[row+1] = 1.0/rtmp22[row+1];
2139         /* copy row entries from dense representation to sparse */
2140         for (j=0; j<nz; j++) {
2141           idx    = pj[j];
2142           pc1[j] = rtmp11[idx];
2143           pc2[j] = rtmp22[idx];
2144           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2145         }
2146         sctx.rs = rs;
2147         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2148         if (sctx.newshift) goto endofwhile;
2149         break;
2150 
2151       case 3:
2152         for  (j=0; j<nz; j++) {
2153           idx         = bjtmp[j];
2154           rtmp11[idx] = 0.0;
2155           rtmp22[idx] = 0.0;
2156           rtmp33[idx] = 0.0;
2157         }
2158         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2159         idx    = r[row];
2160         nz_tmp = ai[idx+1] - ai[idx];
2161         ajtmp  = aj + ai[idx];
2162         v1     = aa + ai[idx];
2163         v2     = aa + ai[idx+1];
2164         v3     = aa + ai[idx+2];
2165         for (j=0; j<nz_tmp; j++) {
2166           idx         = ics[ajtmp[j]];
2167           rtmp11[idx] = v1[j];
2168           rtmp22[idx] = v2[j];
2169           rtmp33[idx] = v3[j];
2170         }
2171         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2172         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2173         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2174 
2175         /* loop over all pivot row blocks above this row block */
2176         prow = *bjtmp++;
2177         while (prow < row) {
2178           pc1 = rtmp11 + prow;
2179           pc2 = rtmp22 + prow;
2180           pc3 = rtmp33 + prow;
2181           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2182             pv   = ba  + bd[prow];
2183             pj   = nbj + bd[prow];
2184             mul1 = *pc1 * *pv;
2185             mul2 = *pc2 * *pv;
2186             mul3 = *pc3 * *pv;
2187             ++pv;
2188             *pc1 = mul1;
2189             *pc2 = mul2;
2190             *pc3 = mul3;
2191 
2192             nz_tmp = bi[prow+1] - bd[prow] - 1;
2193             /* update this row based on pivot row */
2194             for (j=0; j<nz_tmp; j++) {
2195               tmp          = pv[j];
2196               idx          = pj[j];
2197               rtmp11[idx] -= mul1 * tmp;
2198               rtmp22[idx] -= mul2 * tmp;
2199               rtmp33[idx] -= mul3 * tmp;
2200             }
2201             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2202           }
2203           prow = *bjtmp++;
2204         }
2205 
2206         /* Now take care of diagonal 3x3 block in this set of rows */
2207         /* note: prow = row here */
2208         pc1 = rtmp11 + prow;
2209         pc2 = rtmp22 + prow;
2210         pc3 = rtmp33 + prow;
2211 
2212         sctx.pv = *pc1;
2213         pj      = bj + bi[prow];
2214         rs      = 0.0;
2215         for (j=0; j<nz; j++) {
2216           idx = pj[j];
2217           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2218         }
2219         sctx.rs = rs;
2220         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2221         if (sctx.newshift) goto endofwhile;
2222 
2223         if (*pc2 != 0.0 || *pc3 != 0.0) {
2224           mul2   = (*pc2)/(*pc1);
2225           mul3   = (*pc3)/(*pc1);
2226           *pc2   = mul2;
2227           *pc3   = mul3;
2228           nz_tmp = bi[prow+1] - bd[prow] - 1;
2229           pj     = nbj + bd[prow];
2230           for (j=0; j<nz_tmp; j++) {
2231             idx          = pj[j];
2232             tmp          = rtmp11[idx];
2233             rtmp22[idx] -= mul2 * tmp;
2234             rtmp33[idx] -= mul3 * tmp;
2235           }
2236           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2237         }
2238         ++prow;
2239 
2240         pc2     = rtmp22 + prow;
2241         pc3     = rtmp33 + prow;
2242         sctx.pv = *pc2;
2243         pj      = bj + bi[prow];
2244         rs      = 0.0;
2245         for (j=0; j<nz; j++) {
2246           idx = pj[j];
2247           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2248         }
2249         sctx.rs = rs;
2250         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2251         if (sctx.newshift) goto endofwhile;
2252 
2253         if (*pc3 != 0.0) {
2254           mul3   = (*pc3)/(*pc2);
2255           *pc3   = mul3;
2256           pj     = nbj + bd[prow];
2257           nz_tmp = bi[prow+1] - bd[prow] - 1;
2258           for (j=0; j<nz_tmp; j++) {
2259             idx          = pj[j];
2260             tmp          = rtmp22[idx];
2261             rtmp33[idx] -= mul3 * tmp;
2262           }
2263           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2264         }
2265 
2266         pj  = bj + bi[row];
2267         pc1 = ba + bi[row];
2268         pc2 = ba + bi[row+1];
2269         pc3 = ba + bi[row+2];
2270 
2271         sctx.pv       = rtmp33[row+2];
2272         rs            = 0.0;
2273         rtmp11[row]   = 1.0/rtmp11[row];
2274         rtmp22[row+1] = 1.0/rtmp22[row+1];
2275         rtmp33[row+2] = 1.0/rtmp33[row+2];
2276         /* copy row entries from dense representation to sparse */
2277         for (j=0; j<nz; j++) {
2278           idx    = pj[j];
2279           pc1[j] = rtmp11[idx];
2280           pc2[j] = rtmp22[idx];
2281           pc3[j] = rtmp33[idx];
2282           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2283         }
2284 
2285         sctx.rs = rs;
2286         ierr    = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2287         if (sctx.newshift) goto endofwhile;
2288         break;
2289 
2290       default:
2291         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2292       }
2293       row += nodesz;                 /* Update the row */
2294     }
2295 endofwhile:;
2296   } while (sctx.newshift);
2297   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2298   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2299   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2300   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2301   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2302 
2303   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2304   /* do not set solve add, since MatSolve_Inode + Add is faster */
2305   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2306   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2307   C->assembled              = PETSC_TRUE;
2308   C->preallocated           = PETSC_TRUE;
2309   if (sctx.nshift) {
2310     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2311       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2312     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2313       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2314     }
2315   }
2316   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2317   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2318   PetscFunctionReturn(0);
2319 }
2320 
2321 
2322 /* ----------------------------------------------------------- */
2323 #undef __FUNCT__
2324 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2325 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2326 {
2327   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2328   IS                iscol = a->col,isrow = a->row;
2329   PetscErrorCode    ierr;
2330   const PetscInt    *r,*c,*rout,*cout;
2331   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2332   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2333   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2334   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2335   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2336   const PetscScalar *b;
2337 
2338   PetscFunctionBegin;
2339   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2340   node_max = a->inode.node_count;
2341   ns       = a->inode.size;     /* Node Size array */
2342 
2343   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2344   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2345   tmp  = a->solve_work;
2346 
2347   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2348   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2349 
2350   /* forward solve the lower triangular */
2351   tmps = tmp;
2352   aa   = a_a;
2353   aj   = a_j;
2354   ad   = a->diag;
2355 
2356   for (i = 0,row = 0; i< node_max; ++i) {
2357     nsz = ns[i];
2358     aii = ai[row];
2359     v1  = aa + aii;
2360     vi  = aj + aii;
2361     nz  = ai[row+1]- ai[row];
2362 
2363     if (i < node_max-1) {
2364       /* Prefetch the indices for the next block */
2365       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2366       /* Prefetch the data for the next block */
2367       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2368     }
2369 
2370     switch (nsz) {               /* Each loop in 'case' is unrolled */
2371     case 1:
2372       sum1 = b[r[row]];
2373       for (j=0; j<nz-1; j+=2) {
2374         i0    = vi[j];
2375         i1    = vi[j+1];
2376         tmp0  = tmps[i0];
2377         tmp1  = tmps[i1];
2378         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2379       }
2380       if (j == nz-1) {
2381         tmp0  = tmps[vi[j]];
2382         sum1 -= v1[j]*tmp0;
2383       }
2384       tmp[row++]=sum1;
2385       break;
2386     case 2:
2387       sum1 = b[r[row]];
2388       sum2 = b[r[row+1]];
2389       v2   = aa + ai[row+1];
2390 
2391       for (j=0; j<nz-1; j+=2) {
2392         i0    = vi[j];
2393         i1    = vi[j+1];
2394         tmp0  = tmps[i0];
2395         tmp1  = tmps[i1];
2396         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2397         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2398       }
2399       if (j == nz-1) {
2400         tmp0  = tmps[vi[j]];
2401         sum1 -= v1[j] *tmp0;
2402         sum2 -= v2[j] *tmp0;
2403       }
2404       sum2     -= v2[nz] * sum1;
2405       tmp[row++]=sum1;
2406       tmp[row++]=sum2;
2407       break;
2408     case 3:
2409       sum1 = b[r[row]];
2410       sum2 = b[r[row+1]];
2411       sum3 = b[r[row+2]];
2412       v2   = aa + ai[row+1];
2413       v3   = aa + ai[row+2];
2414 
2415       for (j=0; j<nz-1; j+=2) {
2416         i0    = vi[j];
2417         i1    = vi[j+1];
2418         tmp0  = tmps[i0];
2419         tmp1  = tmps[i1];
2420         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2421         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2422         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2423       }
2424       if (j == nz-1) {
2425         tmp0  = tmps[vi[j]];
2426         sum1 -= v1[j] *tmp0;
2427         sum2 -= v2[j] *tmp0;
2428         sum3 -= v3[j] *tmp0;
2429       }
2430       sum2     -= v2[nz] * sum1;
2431       sum3     -= v3[nz] * sum1;
2432       sum3     -= v3[nz+1] * sum2;
2433       tmp[row++]=sum1;
2434       tmp[row++]=sum2;
2435       tmp[row++]=sum3;
2436       break;
2437 
2438     case 4:
2439       sum1 = b[r[row]];
2440       sum2 = b[r[row+1]];
2441       sum3 = b[r[row+2]];
2442       sum4 = b[r[row+3]];
2443       v2   = aa + ai[row+1];
2444       v3   = aa + ai[row+2];
2445       v4   = aa + ai[row+3];
2446 
2447       for (j=0; j<nz-1; j+=2) {
2448         i0    = vi[j];
2449         i1    = vi[j+1];
2450         tmp0  = tmps[i0];
2451         tmp1  = tmps[i1];
2452         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2453         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2454         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2455         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2456       }
2457       if (j == nz-1) {
2458         tmp0  = tmps[vi[j]];
2459         sum1 -= v1[j] *tmp0;
2460         sum2 -= v2[j] *tmp0;
2461         sum3 -= v3[j] *tmp0;
2462         sum4 -= v4[j] *tmp0;
2463       }
2464       sum2 -= v2[nz] * sum1;
2465       sum3 -= v3[nz] * sum1;
2466       sum4 -= v4[nz] * sum1;
2467       sum3 -= v3[nz+1] * sum2;
2468       sum4 -= v4[nz+1] * sum2;
2469       sum4 -= v4[nz+2] * sum3;
2470 
2471       tmp[row++]=sum1;
2472       tmp[row++]=sum2;
2473       tmp[row++]=sum3;
2474       tmp[row++]=sum4;
2475       break;
2476     case 5:
2477       sum1 = b[r[row]];
2478       sum2 = b[r[row+1]];
2479       sum3 = b[r[row+2]];
2480       sum4 = b[r[row+3]];
2481       sum5 = b[r[row+4]];
2482       v2   = aa + ai[row+1];
2483       v3   = aa + ai[row+2];
2484       v4   = aa + ai[row+3];
2485       v5   = aa + ai[row+4];
2486 
2487       for (j=0; j<nz-1; j+=2) {
2488         i0    = vi[j];
2489         i1    = vi[j+1];
2490         tmp0  = tmps[i0];
2491         tmp1  = tmps[i1];
2492         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2493         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2494         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2495         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2496         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2497       }
2498       if (j == nz-1) {
2499         tmp0  = tmps[vi[j]];
2500         sum1 -= v1[j] *tmp0;
2501         sum2 -= v2[j] *tmp0;
2502         sum3 -= v3[j] *tmp0;
2503         sum4 -= v4[j] *tmp0;
2504         sum5 -= v5[j] *tmp0;
2505       }
2506 
2507       sum2 -= v2[nz] * sum1;
2508       sum3 -= v3[nz] * sum1;
2509       sum4 -= v4[nz] * sum1;
2510       sum5 -= v5[nz] * sum1;
2511       sum3 -= v3[nz+1] * sum2;
2512       sum4 -= v4[nz+1] * sum2;
2513       sum5 -= v5[nz+1] * sum2;
2514       sum4 -= v4[nz+2] * sum3;
2515       sum5 -= v5[nz+2] * sum3;
2516       sum5 -= v5[nz+3] * sum4;
2517 
2518       tmp[row++]=sum1;
2519       tmp[row++]=sum2;
2520       tmp[row++]=sum3;
2521       tmp[row++]=sum4;
2522       tmp[row++]=sum5;
2523       break;
2524     default:
2525       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2526     }
2527   }
2528   /* backward solve the upper triangular */
2529   for (i=node_max -1,row = n-1; i>=0; i--) {
2530     nsz = ns[i];
2531     aii = ad[row+1] + 1;
2532     v1  = aa + aii;
2533     vi  = aj + aii;
2534     nz  = ad[row]- ad[row+1] - 1;
2535 
2536     if (i > 0) {
2537       /* Prefetch the indices for the next block */
2538       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2539       /* Prefetch the data for the next block */
2540       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2541     }
2542 
2543     switch (nsz) {               /* Each loop in 'case' is unrolled */
2544     case 1:
2545       sum1 = tmp[row];
2546 
2547       for (j=0; j<nz-1; j+=2) {
2548         i0    = vi[j];
2549         i1    = vi[j+1];
2550         tmp0  = tmps[i0];
2551         tmp1  = tmps[i1];
2552         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2553       }
2554       if (j == nz-1) {
2555         tmp0  = tmps[vi[j]];
2556         sum1 -= v1[j]*tmp0;
2557       }
2558       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2559       break;
2560     case 2:
2561       sum1 = tmp[row];
2562       sum2 = tmp[row-1];
2563       v2   = aa + ad[row] + 1;
2564       for (j=0; j<nz-1; j+=2) {
2565         i0    = vi[j];
2566         i1    = vi[j+1];
2567         tmp0  = tmps[i0];
2568         tmp1  = tmps[i1];
2569         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2570         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2571       }
2572       if (j == nz-1) {
2573         tmp0  = tmps[vi[j]];
2574         sum1 -= v1[j]* tmp0;
2575         sum2 -= v2[j+1]* tmp0;
2576       }
2577 
2578       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2579       sum2     -= v2[0] * tmp0;
2580       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2581       break;
2582     case 3:
2583       sum1 = tmp[row];
2584       sum2 = tmp[row -1];
2585       sum3 = tmp[row -2];
2586       v2   = aa + ad[row] + 1;
2587       v3   = aa + ad[row -1] + 1;
2588       for (j=0; j<nz-1; j+=2) {
2589         i0    = vi[j];
2590         i1    = vi[j+1];
2591         tmp0  = tmps[i0];
2592         tmp1  = tmps[i1];
2593         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2594         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2595         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2596       }
2597       if (j== nz-1) {
2598         tmp0  = tmps[vi[j]];
2599         sum1 -= v1[j] * tmp0;
2600         sum2 -= v2[j+1] * tmp0;
2601         sum3 -= v3[j+2] * tmp0;
2602       }
2603       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604       sum2     -= v2[0]* tmp0;
2605       sum3     -= v3[1] * tmp0;
2606       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2607       sum3     -= v3[0]* tmp0;
2608       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2609 
2610       break;
2611     case 4:
2612       sum1 = tmp[row];
2613       sum2 = tmp[row -1];
2614       sum3 = tmp[row -2];
2615       sum4 = tmp[row -3];
2616       v2   = aa + ad[row]+1;
2617       v3   = aa + ad[row -1]+1;
2618       v4   = aa + ad[row -2]+1;
2619 
2620       for (j=0; j<nz-1; j+=2) {
2621         i0    = vi[j];
2622         i1    = vi[j+1];
2623         tmp0  = tmps[i0];
2624         tmp1  = tmps[i1];
2625         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2626         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629       }
2630       if (j== nz-1) {
2631         tmp0  = tmps[vi[j]];
2632         sum1 -= v1[j] * tmp0;
2633         sum2 -= v2[j+1] * tmp0;
2634         sum3 -= v3[j+2] * tmp0;
2635         sum4 -= v4[j+3] * tmp0;
2636       }
2637 
2638       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2639       sum2     -= v2[0] * tmp0;
2640       sum3     -= v3[1] * tmp0;
2641       sum4     -= v4[2] * tmp0;
2642       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2643       sum3     -= v3[0] * tmp0;
2644       sum4     -= v4[1] * tmp0;
2645       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2646       sum4     -= v4[0] * tmp0;
2647       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2648       break;
2649     case 5:
2650       sum1 = tmp[row];
2651       sum2 = tmp[row -1];
2652       sum3 = tmp[row -2];
2653       sum4 = tmp[row -3];
2654       sum5 = tmp[row -4];
2655       v2   = aa + ad[row]+1;
2656       v3   = aa + ad[row -1]+1;
2657       v4   = aa + ad[row -2]+1;
2658       v5   = aa + ad[row -3]+1;
2659       for (j=0; j<nz-1; j+=2) {
2660         i0    = vi[j];
2661         i1    = vi[j+1];
2662         tmp0  = tmps[i0];
2663         tmp1  = tmps[i1];
2664         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2665         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2666         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2667         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2668         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2669       }
2670       if (j==nz-1) {
2671         tmp0  = tmps[vi[j]];
2672         sum1 -= v1[j] * tmp0;
2673         sum2 -= v2[j+1] * tmp0;
2674         sum3 -= v3[j+2] * tmp0;
2675         sum4 -= v4[j+3] * tmp0;
2676         sum5 -= v5[j+4] * tmp0;
2677       }
2678 
2679       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2680       sum2     -= v2[0] * tmp0;
2681       sum3     -= v3[1] * tmp0;
2682       sum4     -= v4[2] * tmp0;
2683       sum5     -= v5[3] * tmp0;
2684       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2685       sum3     -= v3[0] * tmp0;
2686       sum4     -= v4[1] * tmp0;
2687       sum5     -= v5[2] * tmp0;
2688       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2689       sum4     -= v4[0] * tmp0;
2690       sum5     -= v5[1] * tmp0;
2691       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2692       sum5     -= v5[0] * tmp0;
2693       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2694       break;
2695     default:
2696       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2697     }
2698   }
2699   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2700   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2701   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2702   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2703   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2704   PetscFunctionReturn(0);
2705 }
2706 
2707 
2708 /*
2709      Makes a longer coloring[] array and calls the usual code with that
2710 */
2711 #undef __FUNCT__
2712 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2713 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2714 {
2715   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2716   PetscErrorCode  ierr;
2717   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2718   PetscInt        *colorused,i;
2719   ISColoringValue *newcolor;
2720 
2721   PetscFunctionBegin;
2722   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2723   /* loop over inodes, marking a color for each column*/
2724   row = 0;
2725   for (i=0; i<m; i++) {
2726     for (j=0; j<ns[i]; j++) {
2727       newcolor[row++] = coloring[i] + j*ncolors;
2728     }
2729   }
2730 
2731   /* eliminate unneeded colors */
2732   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2733   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2734   for (i=0; i<n; i++) {
2735     colorused[newcolor[i]] = 1;
2736   }
2737 
2738   for (i=1; i<5*ncolors; i++) {
2739     colorused[i] += colorused[i-1];
2740   }
2741   ncolors = colorused[5*ncolors-1];
2742   for (i=0; i<n; i++) {
2743     newcolor[i] = colorused[newcolor[i]]-1;
2744   }
2745   ierr = PetscFree(colorused);CHKERRQ(ierr);
2746   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2747   ierr = PetscFree(coloring);CHKERRQ(ierr);
2748   PetscFunctionReturn(0);
2749 }
2750 
2751 #include <petsc-private/kernels/blockinvert.h>
2752 
2753 #undef __FUNCT__
2754 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2755 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2756 {
2757   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2758   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2759   MatScalar         *ibdiag,*bdiag,work[25],*t;
2760   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2761   const MatScalar   *v = a->a,*v1,*v2,*v3,*v4,*v5;
2762   const PetscScalar *xb, *b;
2763   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2764   PetscErrorCode    ierr;
2765   PetscInt          n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2766   PetscInt          *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2767 
2768   PetscFunctionBegin;
2769   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2770   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2771 
2772   if (!a->inode.ibdiagvalid) {
2773     if (!a->inode.ibdiag) {
2774       /* calculate space needed for diagonal blocks */
2775       for (i=0; i<m; i++) {
2776         cnt += sizes[i]*sizes[i];
2777       }
2778       a->inode.bdiagsize = cnt;
2779 
2780       ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2781     }
2782 
2783     /* copy over the diagonal blocks and invert them */
2784     ibdiag = a->inode.ibdiag;
2785     bdiag  = a->inode.bdiag;
2786     cnt    = 0;
2787     for (i=0, row = 0; i<m; i++) {
2788       for (j=0; j<sizes[i]; j++) {
2789         for (k=0; k<sizes[i]; k++) {
2790           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2791         }
2792       }
2793       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2794 
2795       switch (sizes[i]) {
2796       case 1:
2797         /* Create matrix data structure */
2798         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2799         ibdiag[cnt] = 1.0/ibdiag[cnt];
2800         break;
2801       case 2:
2802         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2803         break;
2804       case 3:
2805         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2806         break;
2807       case 4:
2808         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2809         break;
2810       case 5:
2811         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2812         break;
2813       default:
2814         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2815       }
2816       cnt += sizes[i]*sizes[i];
2817       row += sizes[i];
2818     }
2819     a->inode.ibdiagvalid = PETSC_TRUE;
2820   }
2821   ibdiag = a->inode.ibdiag;
2822   bdiag  = a->inode.bdiag;
2823   t      = a->inode.ssor_work;
2824 
2825   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2826   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2827   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2828   if (flag & SOR_ZERO_INITIAL_GUESS) {
2829     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2830 
2831       for (i=0, row=0; i<m; i++) {
2832         sz  = diag[row] - ii[row];
2833         v1  = a->a + ii[row];
2834         idx = a->j + ii[row];
2835 
2836         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2837         switch (sizes[i]) {
2838         case 1:
2839 
2840           sum1 = b[row];
2841           for (n = 0; n<sz-1; n+=2) {
2842             i1    = idx[0];
2843             i2    = idx[1];
2844             idx  += 2;
2845             tmp0  = x[i1];
2846             tmp1  = x[i2];
2847             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2848           }
2849 
2850           if (n == sz-1) {
2851             tmp0  = x[*idx];
2852             sum1 -= *v1 * tmp0;
2853           }
2854           t[row]   = sum1;
2855           x[row++] = sum1*(*ibdiag++);
2856           break;
2857         case 2:
2858           v2   = a->a + ii[row+1];
2859           sum1 = b[row];
2860           sum2 = b[row+1];
2861           for (n = 0; n<sz-1; n+=2) {
2862             i1    = idx[0];
2863             i2    = idx[1];
2864             idx  += 2;
2865             tmp0  = x[i1];
2866             tmp1  = x[i2];
2867             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2868             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2869           }
2870 
2871           if (n == sz-1) {
2872             tmp0  = x[*idx];
2873             sum1 -= v1[0] * tmp0;
2874             sum2 -= v2[0] * tmp0;
2875           }
2876           t[row]   = sum1;
2877           t[row+1] = sum2;
2878           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2879           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2880           ibdiag  += 4;
2881           break;
2882         case 3:
2883           v2   = a->a + ii[row+1];
2884           v3   = a->a + ii[row+2];
2885           sum1 = b[row];
2886           sum2 = b[row+1];
2887           sum3 = b[row+2];
2888           for (n = 0; n<sz-1; n+=2) {
2889             i1    = idx[0];
2890             i2    = idx[1];
2891             idx  += 2;
2892             tmp0  = x[i1];
2893             tmp1  = x[i2];
2894             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2895             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2896             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2897           }
2898 
2899           if (n == sz-1) {
2900             tmp0  = x[*idx];
2901             sum1 -= v1[0] * tmp0;
2902             sum2 -= v2[0] * tmp0;
2903             sum3 -= v3[0] * tmp0;
2904           }
2905           t[row]   = sum1;
2906           t[row+1] = sum2;
2907           t[row+2] = sum3;
2908           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2909           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2910           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2911           ibdiag  += 9;
2912           break;
2913         case 4:
2914           v2   = a->a + ii[row+1];
2915           v3   = a->a + ii[row+2];
2916           v4   = a->a + ii[row+3];
2917           sum1 = b[row];
2918           sum2 = b[row+1];
2919           sum3 = b[row+2];
2920           sum4 = b[row+3];
2921           for (n = 0; n<sz-1; n+=2) {
2922             i1    = idx[0];
2923             i2    = idx[1];
2924             idx  += 2;
2925             tmp0  = x[i1];
2926             tmp1  = x[i2];
2927             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2928             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2929             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2930             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2931           }
2932 
2933           if (n == sz-1) {
2934             tmp0  = x[*idx];
2935             sum1 -= v1[0] * tmp0;
2936             sum2 -= v2[0] * tmp0;
2937             sum3 -= v3[0] * tmp0;
2938             sum4 -= v4[0] * tmp0;
2939           }
2940           t[row]   = sum1;
2941           t[row+1] = sum2;
2942           t[row+2] = sum3;
2943           t[row+3] = sum4;
2944           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2945           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2946           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2947           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2948           ibdiag  += 16;
2949           break;
2950         case 5:
2951           v2   = a->a + ii[row+1];
2952           v3   = a->a + ii[row+2];
2953           v4   = a->a + ii[row+3];
2954           v5   = a->a + ii[row+4];
2955           sum1 = b[row];
2956           sum2 = b[row+1];
2957           sum3 = b[row+2];
2958           sum4 = b[row+3];
2959           sum5 = b[row+4];
2960           for (n = 0; n<sz-1; n+=2) {
2961             i1    = idx[0];
2962             i2    = idx[1];
2963             idx  += 2;
2964             tmp0  = x[i1];
2965             tmp1  = x[i2];
2966             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2967             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2968             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2969             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2970             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2971           }
2972 
2973           if (n == sz-1) {
2974             tmp0  = x[*idx];
2975             sum1 -= v1[0] * tmp0;
2976             sum2 -= v2[0] * tmp0;
2977             sum3 -= v3[0] * tmp0;
2978             sum4 -= v4[0] * tmp0;
2979             sum5 -= v5[0] * tmp0;
2980           }
2981           t[row]   = sum1;
2982           t[row+1] = sum2;
2983           t[row+2] = sum3;
2984           t[row+3] = sum4;
2985           t[row+4] = sum5;
2986           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2987           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2988           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2989           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2990           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2991           ibdiag  += 25;
2992           break;
2993         default:
2994           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2995         }
2996       }
2997 
2998       xb   = t;
2999       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3000     } else xb = b;
3001     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3002 
3003       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3004       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3005         ibdiag -= sizes[i]*sizes[i];
3006         sz      = ii[row+1] - diag[row] - 1;
3007         v1      = a->a + diag[row] + 1;
3008         idx     = a->j + diag[row] + 1;
3009 
3010         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3011         switch (sizes[i]) {
3012         case 1:
3013 
3014           sum1 = xb[row];
3015           for (n = 0; n<sz-1; n+=2) {
3016             i1    = idx[0];
3017             i2    = idx[1];
3018             idx  += 2;
3019             tmp0  = x[i1];
3020             tmp1  = x[i2];
3021             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3022           }
3023 
3024           if (n == sz-1) {
3025             tmp0  = x[*idx];
3026             sum1 -= *v1*tmp0;
3027           }
3028           x[row--] = sum1*(*ibdiag);
3029           break;
3030 
3031         case 2:
3032 
3033           sum1 = xb[row];
3034           sum2 = xb[row-1];
3035           /* note that sum1 is associated with the second of the two rows */
3036           v2 = a->a + diag[row-1] + 2;
3037           for (n = 0; n<sz-1; n+=2) {
3038             i1    = idx[0];
3039             i2    = idx[1];
3040             idx  += 2;
3041             tmp0  = x[i1];
3042             tmp1  = x[i2];
3043             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3044             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3045           }
3046 
3047           if (n == sz-1) {
3048             tmp0  = x[*idx];
3049             sum1 -= *v1*tmp0;
3050             sum2 -= *v2*tmp0;
3051           }
3052           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3053           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3054           break;
3055         case 3:
3056 
3057           sum1 = xb[row];
3058           sum2 = xb[row-1];
3059           sum3 = xb[row-2];
3060           v2   = a->a + diag[row-1] + 2;
3061           v3   = a->a + diag[row-2] + 3;
3062           for (n = 0; n<sz-1; n+=2) {
3063             i1    = idx[0];
3064             i2    = idx[1];
3065             idx  += 2;
3066             tmp0  = x[i1];
3067             tmp1  = x[i2];
3068             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3069             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3070             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3071           }
3072 
3073           if (n == sz-1) {
3074             tmp0  = x[*idx];
3075             sum1 -= *v1*tmp0;
3076             sum2 -= *v2*tmp0;
3077             sum3 -= *v3*tmp0;
3078           }
3079           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3080           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3081           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3082           break;
3083         case 4:
3084 
3085           sum1 = xb[row];
3086           sum2 = xb[row-1];
3087           sum3 = xb[row-2];
3088           sum4 = xb[row-3];
3089           v2   = a->a + diag[row-1] + 2;
3090           v3   = a->a + diag[row-2] + 3;
3091           v4   = a->a + diag[row-3] + 4;
3092           for (n = 0; n<sz-1; n+=2) {
3093             i1    = idx[0];
3094             i2    = idx[1];
3095             idx  += 2;
3096             tmp0  = x[i1];
3097             tmp1  = x[i2];
3098             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3099             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3100             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3101             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3102           }
3103 
3104           if (n == sz-1) {
3105             tmp0  = x[*idx];
3106             sum1 -= *v1*tmp0;
3107             sum2 -= *v2*tmp0;
3108             sum3 -= *v3*tmp0;
3109             sum4 -= *v4*tmp0;
3110           }
3111           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3112           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3113           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3114           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3115           break;
3116         case 5:
3117 
3118           sum1 = xb[row];
3119           sum2 = xb[row-1];
3120           sum3 = xb[row-2];
3121           sum4 = xb[row-3];
3122           sum5 = xb[row-4];
3123           v2   = a->a + diag[row-1] + 2;
3124           v3   = a->a + diag[row-2] + 3;
3125           v4   = a->a + diag[row-3] + 4;
3126           v5   = a->a + diag[row-4] + 5;
3127           for (n = 0; n<sz-1; n+=2) {
3128             i1    = idx[0];
3129             i2    = idx[1];
3130             idx  += 2;
3131             tmp0  = x[i1];
3132             tmp1  = x[i2];
3133             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3134             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3135             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3136             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3137             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3138           }
3139 
3140           if (n == sz-1) {
3141             tmp0  = x[*idx];
3142             sum1 -= *v1*tmp0;
3143             sum2 -= *v2*tmp0;
3144             sum3 -= *v3*tmp0;
3145             sum4 -= *v4*tmp0;
3146             sum5 -= *v5*tmp0;
3147           }
3148           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3149           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3150           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3151           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3152           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3153           break;
3154         default:
3155           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3156         }
3157       }
3158 
3159       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3160     }
3161     its--;
3162   }
3163   while (its--) {
3164 
3165     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3166       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3167            i<m;
3168            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3169 
3170         sz  = diag[row] - ii[row];
3171         v1  = a->a + ii[row];
3172         idx = a->j + ii[row];
3173         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3174         switch (sizes[i]) {
3175         case 1:
3176           sum1 = b[row];
3177           for (n = 0; n<sz-1; n+=2) {
3178             i1    = idx[0];
3179             i2    = idx[1];
3180             idx  += 2;
3181             tmp0  = x[i1];
3182             tmp1  = x[i2];
3183             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3184           }
3185           if (n == sz-1) {
3186             tmp0  = x[*idx++];
3187             sum1 -= *v1 * tmp0;
3188             v1++;
3189           }
3190           t[row]   = sum1;
3191           sz      = ii[row+1] - diag[row] - 1;
3192           idx     = a->j + diag[row] + 1;
3193           v1 += 1;
3194           for (n = 0; n<sz-1; n+=2) {
3195             i1    = idx[0];
3196             i2    = idx[1];
3197             idx  += 2;
3198             tmp0  = x[i1];
3199             tmp1  = x[i2];
3200             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3201           }
3202           if (n == sz-1) {
3203             tmp0  = x[*idx++];
3204             sum1 -= *v1 * tmp0;
3205           }
3206           /* in MatSOR_SeqAIJ this line would be
3207            *
3208            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3209            *
3210            * but omega == 1, so this becomes
3211            *
3212            * x[row] = sum1*(*ibdiag++);
3213            *
3214            */
3215           x[row] = sum1*(*ibdiag);
3216           break;
3217         case 2:
3218           v2   = a->a + ii[row+1];
3219           sum1 = b[row];
3220           sum2 = b[row+1];
3221           for (n = 0; n<sz-1; n+=2) {
3222             i1    = idx[0];
3223             i2    = idx[1];
3224             idx  += 2;
3225             tmp0  = x[i1];
3226             tmp1  = x[i2];
3227             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3228             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3229           }
3230           if (n == sz-1) {
3231             tmp0  = x[*idx++];
3232             sum1 -= v1[0] * tmp0;
3233             sum2 -= v2[0] * tmp0;
3234             v1++; v2++;
3235           }
3236           t[row]   = sum1;
3237           t[row+1] = sum2;
3238           sz      = ii[row+1] - diag[row] - 2;
3239           idx     = a->j + diag[row] + 2;
3240           v1 += 2;
3241           v2 += 2;
3242           for (n = 0; n<sz-1; n+=2) {
3243             i1    = idx[0];
3244             i2    = idx[1];
3245             idx  += 2;
3246             tmp0  = x[i1];
3247             tmp1  = x[i2];
3248             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3249             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3250           }
3251           if (n == sz-1) {
3252             tmp0  = x[*idx];
3253             sum1 -= v1[0] * tmp0;
3254             sum2 -= v2[0] * tmp0;
3255           }
3256           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3257           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3258           break;
3259         case 3:
3260           v2   = a->a + ii[row+1];
3261           v3   = a->a + ii[row+2];
3262           sum1 = b[row];
3263           sum2 = b[row+1];
3264           sum3 = b[row+2];
3265           for (n = 0; n<sz-1; n+=2) {
3266             i1    = idx[0];
3267             i2    = idx[1];
3268             idx  += 2;
3269             tmp0  = x[i1];
3270             tmp1  = x[i2];
3271             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3272             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3273             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3274           }
3275           if (n == sz-1) {
3276             tmp0  = x[*idx++];
3277             sum1 -= v1[0] * tmp0;
3278             sum2 -= v2[0] * tmp0;
3279             sum3 -= v3[0] * tmp0;
3280             v1++; v2++; v3++;
3281           }
3282           t[row]   = sum1;
3283           t[row+1] = sum2;
3284           t[row+2] = sum3;
3285           sz      = ii[row+1] - diag[row] - 3;
3286           idx     = a->j + diag[row] + 3;
3287           v1 += 3;
3288           v2 += 3;
3289           v3 += 3;
3290           for (n = 0; n<sz-1; n+=2) {
3291             i1    = idx[0];
3292             i2    = idx[1];
3293             idx  += 2;
3294             tmp0  = x[i1];
3295             tmp1  = x[i2];
3296             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3297             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3298             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3299           }
3300           if (n == sz-1) {
3301             tmp0  = x[*idx];
3302             sum1 -= v1[0] * tmp0;
3303             sum2 -= v2[0] * tmp0;
3304             sum3 -= v3[0] * tmp0;
3305           }
3306           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3307           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3308           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3309           break;
3310         case 4:
3311           v2   = a->a + ii[row+1];
3312           v3   = a->a + ii[row+2];
3313           v4   = a->a + ii[row+3];
3314           sum1 = b[row];
3315           sum2 = b[row+1];
3316           sum3 = b[row+2];
3317           sum4 = b[row+3];
3318           for (n = 0; n<sz-1; n+=2) {
3319             i1    = idx[0];
3320             i2    = idx[1];
3321             idx  += 2;
3322             tmp0  = x[i1];
3323             tmp1  = x[i2];
3324             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3325             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3326             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3327             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3328           }
3329           if (n == sz-1) {
3330             tmp0  = x[*idx++];
3331             sum1 -= v1[0] * tmp0;
3332             sum2 -= v2[0] * tmp0;
3333             sum3 -= v3[0] * tmp0;
3334             sum4 -= v4[0] * tmp0;
3335             v1++; v2++; v3++; v4++;
3336           }
3337           t[row]   = sum1;
3338           t[row+1] = sum2;
3339           t[row+2] = sum3;
3340           t[row+3] = sum4;
3341           sz      = ii[row+1] - diag[row] - 4;
3342           idx     = a->j + diag[row] + 4;
3343           v1 += 4;
3344           v2 += 4;
3345           v3 += 4;
3346           v4 += 4;
3347           for (n = 0; n<sz-1; n+=2) {
3348             i1    = idx[0];
3349             i2    = idx[1];
3350             idx  += 2;
3351             tmp0  = x[i1];
3352             tmp1  = x[i2];
3353             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3354             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3355             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3356             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3357           }
3358           if (n == sz-1) {
3359             tmp0  = x[*idx];
3360             sum1 -= v1[0] * tmp0;
3361             sum2 -= v2[0] * tmp0;
3362             sum3 -= v3[0] * tmp0;
3363             sum4 -= v4[0] * tmp0;
3364           }
3365           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3366           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3367           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3368           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3369           break;
3370         case 5:
3371           v2   = a->a + ii[row+1];
3372           v3   = a->a + ii[row+2];
3373           v4   = a->a + ii[row+3];
3374           v5   = a->a + ii[row+4];
3375           sum1 = b[row];
3376           sum2 = b[row+1];
3377           sum3 = b[row+2];
3378           sum4 = b[row+3];
3379           sum5 = b[row+4];
3380           for (n = 0; n<sz-1; n+=2) {
3381             i1    = idx[0];
3382             i2    = idx[1];
3383             idx  += 2;
3384             tmp0  = x[i1];
3385             tmp1  = x[i2];
3386             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3387             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3388             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3389             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3390             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3391           }
3392           if (n == sz-1) {
3393             tmp0  = x[*idx++];
3394             sum1 -= v1[0] * tmp0;
3395             sum2 -= v2[0] * tmp0;
3396             sum3 -= v3[0] * tmp0;
3397             sum4 -= v4[0] * tmp0;
3398             sum5 -= v5[0] * tmp0;
3399             v1++; v2++; v3++; v4++; v5++;
3400           }
3401           t[row]   = sum1;
3402           t[row+1] = sum2;
3403           t[row+2] = sum3;
3404           t[row+3] = sum4;
3405           t[row+4] = sum5;
3406           sz      = ii[row+1] - diag[row] - 5;
3407           idx     = a->j + diag[row] + 5;
3408           v1 += 5;
3409           v2 += 5;
3410           v3 += 5;
3411           v4 += 5;
3412           v5 += 5;
3413           for (n = 0; n<sz-1; n+=2) {
3414             i1    = idx[0];
3415             i2    = idx[1];
3416             idx  += 2;
3417             tmp0  = x[i1];
3418             tmp1  = x[i2];
3419             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3420             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3421             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3422             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3423             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3424           }
3425           if (n == sz-1) {
3426             tmp0  = x[*idx];
3427             sum1 -= v1[0] * tmp0;
3428             sum2 -= v2[0] * tmp0;
3429             sum3 -= v3[0] * tmp0;
3430             sum4 -= v4[0] * tmp0;
3431             sum5 -= v5[0] * tmp0;
3432           }
3433           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3434           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3435           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3436           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3437           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3438           break;
3439         default:
3440           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3441         }
3442       }
3443       xb   = t;
3444       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3445     } else xb = b;
3446 
3447     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3448 
3449       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3450       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3451         ibdiag -= sizes[i]*sizes[i];
3452 
3453         /* set RHS */
3454         if (xb == b) {
3455           /* whole (old way) */
3456           sz      = ii[row+1] - ii[row];
3457           idx     = a->j + ii[row];
3458           switch (sizes[i]) {
3459           case 5:
3460             v5      = a->a + ii[row-4];
3461           case 4: /* fall through */
3462             v4      = a->a + ii[row-3];
3463           case 3:
3464             v3      = a->a + ii[row-2];
3465           case 2:
3466             v2      = a->a + ii[row-1];
3467           case 1:
3468             v1      = a->a + ii[row];
3469             break;
3470           default:
3471             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3472           }
3473         } else {
3474           /* upper, no diag */
3475           sz      = ii[row+1] - diag[row] - 1;
3476           idx     = a->j + diag[row] + 1;
3477           switch (sizes[i]) {
3478           case 5:
3479             v5      = a->a + diag[row-4] + 5;
3480           case 4: /* fall through */
3481             v4      = a->a + diag[row-3] + 4;
3482           case 3:
3483             v3      = a->a + diag[row-2] + 3;
3484           case 2:
3485             v2      = a->a + diag[row-1] + 2;
3486           case 1:
3487             v1      = a->a + diag[row] + 1;
3488           }
3489         }
3490         /* set sum */
3491         switch (sizes[i]) {
3492         case 5:
3493           sum5 = xb[row-4];
3494         case 4: /* fall through */
3495           sum4 = xb[row-3];
3496         case 3:
3497           sum3 = xb[row-2];
3498         case 2:
3499           sum2 = xb[row-1];
3500         case 1:
3501           /* note that sum1 is associated with the last row */
3502           sum1 = xb[row];
3503         }
3504         /* do sums */
3505         for (n = 0; n<sz-1; n+=2) {
3506             i1    = idx[0];
3507             i2    = idx[1];
3508             idx  += 2;
3509             tmp0  = x[i1];
3510             tmp1  = x[i2];
3511             switch (sizes[i]) {
3512             case 5:
3513               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3514             case 4: /* fall through */
3515               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3516             case 3:
3517               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3518             case 2:
3519               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3520             case 1:
3521               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3522             }
3523         }
3524         /* ragged edge */
3525         if (n == sz-1) {
3526           tmp0  = x[*idx];
3527           switch (sizes[i]) {
3528           case 5:
3529             sum5 -= *v5*tmp0;
3530           case 4: /* fall through */
3531             sum4 -= *v4*tmp0;
3532           case 3:
3533             sum3 -= *v3*tmp0;
3534           case 2:
3535             sum2 -= *v2*tmp0;
3536           case 1:
3537             sum1 -= *v1*tmp0;
3538           }
3539         }
3540         /* update */
3541         if (xb == b) {
3542           /* whole (old way) w/ diag */
3543           switch (sizes[i]) {
3544           case 5:
3545             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3546             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3547             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3548             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3549             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3550             break;
3551           case 4:
3552             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3553             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3554             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3555             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3556             break;
3557           case 3:
3558             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3559             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3560             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3561             break;
3562           case 2:
3563             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3564             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3565             break;
3566           case 1:
3567             x[row--] += sum1*(*ibdiag);
3568             break;
3569           }
3570         } else {
3571           /* no diag so set =  */
3572           switch (sizes[i]) {
3573           case 5:
3574             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3575             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3576             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3577             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3578             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3579             break;
3580           case 4:
3581             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3582             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3583             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3584             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3585             break;
3586           case 3:
3587             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3588             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3589             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3590             break;
3591           case 2:
3592             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3593             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3594             break;
3595           case 1:
3596             x[row--] = sum1*(*ibdiag);
3597             break;
3598           }
3599         }
3600       }
3601       if (xb == b) {
3602         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3603       } else {
3604         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3605       }
3606     }
3607   }
3608   if (flag & SOR_EISENSTAT) {
3609     /*
3610           Apply  (U + D)^-1  where D is now the block diagonal
3611     */
3612     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3613     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3614       ibdiag -= sizes[i]*sizes[i];
3615       sz      = ii[row+1] - diag[row] - 1;
3616       v1      = a->a + diag[row] + 1;
3617       idx     = a->j + diag[row] + 1;
3618       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3619       switch (sizes[i]) {
3620       case 1:
3621 
3622         sum1 = b[row];
3623         for (n = 0; n<sz-1; n+=2) {
3624           i1    = idx[0];
3625           i2    = idx[1];
3626           idx  += 2;
3627           tmp0  = x[i1];
3628           tmp1  = x[i2];
3629           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3630         }
3631 
3632         if (n == sz-1) {
3633           tmp0  = x[*idx];
3634           sum1 -= *v1*tmp0;
3635         }
3636         x[row] = sum1*(*ibdiag);row--;
3637         break;
3638 
3639       case 2:
3640 
3641         sum1 = b[row];
3642         sum2 = b[row-1];
3643         /* note that sum1 is associated with the second of the two rows */
3644         v2 = a->a + diag[row-1] + 2;
3645         for (n = 0; n<sz-1; n+=2) {
3646           i1    = idx[0];
3647           i2    = idx[1];
3648           idx  += 2;
3649           tmp0  = x[i1];
3650           tmp1  = x[i2];
3651           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3652           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3653         }
3654 
3655         if (n == sz-1) {
3656           tmp0  = x[*idx];
3657           sum1 -= *v1*tmp0;
3658           sum2 -= *v2*tmp0;
3659         }
3660         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3661         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3662         row     -= 2;
3663         break;
3664       case 3:
3665 
3666         sum1 = b[row];
3667         sum2 = b[row-1];
3668         sum3 = b[row-2];
3669         v2   = a->a + diag[row-1] + 2;
3670         v3   = a->a + diag[row-2] + 3;
3671         for (n = 0; n<sz-1; n+=2) {
3672           i1    = idx[0];
3673           i2    = idx[1];
3674           idx  += 2;
3675           tmp0  = x[i1];
3676           tmp1  = x[i2];
3677           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3678           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3679           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3680         }
3681 
3682         if (n == sz-1) {
3683           tmp0  = x[*idx];
3684           sum1 -= *v1*tmp0;
3685           sum2 -= *v2*tmp0;
3686           sum3 -= *v3*tmp0;
3687         }
3688         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3689         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3690         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3691         row     -= 3;
3692         break;
3693       case 4:
3694 
3695         sum1 = b[row];
3696         sum2 = b[row-1];
3697         sum3 = b[row-2];
3698         sum4 = b[row-3];
3699         v2   = a->a + diag[row-1] + 2;
3700         v3   = a->a + diag[row-2] + 3;
3701         v4   = a->a + diag[row-3] + 4;
3702         for (n = 0; n<sz-1; n+=2) {
3703           i1    = idx[0];
3704           i2    = idx[1];
3705           idx  += 2;
3706           tmp0  = x[i1];
3707           tmp1  = x[i2];
3708           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3709           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3710           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3711           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3712         }
3713 
3714         if (n == sz-1) {
3715           tmp0  = x[*idx];
3716           sum1 -= *v1*tmp0;
3717           sum2 -= *v2*tmp0;
3718           sum3 -= *v3*tmp0;
3719           sum4 -= *v4*tmp0;
3720         }
3721         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3722         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3723         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3724         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3725         row     -= 4;
3726         break;
3727       case 5:
3728 
3729         sum1 = b[row];
3730         sum2 = b[row-1];
3731         sum3 = b[row-2];
3732         sum4 = b[row-3];
3733         sum5 = b[row-4];
3734         v2   = a->a + diag[row-1] + 2;
3735         v3   = a->a + diag[row-2] + 3;
3736         v4   = a->a + diag[row-3] + 4;
3737         v5   = a->a + diag[row-4] + 5;
3738         for (n = 0; n<sz-1; n+=2) {
3739           i1    = idx[0];
3740           i2    = idx[1];
3741           idx  += 2;
3742           tmp0  = x[i1];
3743           tmp1  = x[i2];
3744           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3745           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3746           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3747           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3748           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3749         }
3750 
3751         if (n == sz-1) {
3752           tmp0  = x[*idx];
3753           sum1 -= *v1*tmp0;
3754           sum2 -= *v2*tmp0;
3755           sum3 -= *v3*tmp0;
3756           sum4 -= *v4*tmp0;
3757           sum5 -= *v5*tmp0;
3758         }
3759         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3760         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3761         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3762         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3763         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3764         row     -= 5;
3765         break;
3766       default:
3767         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3768       }
3769     }
3770     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3771 
3772     /*
3773            t = b - D x    where D is the block diagonal
3774     */
3775     cnt = 0;
3776     for (i=0, row=0; i<m; i++) {
3777       switch (sizes[i]) {
3778       case 1:
3779         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3780         break;
3781       case 2:
3782         x1       = x[row]; x2 = x[row+1];
3783         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3784         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3785         t[row]   = b[row] - tmp1;
3786         t[row+1] = b[row+1] - tmp2; row += 2;
3787         cnt     += 4;
3788         break;
3789       case 3:
3790         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3791         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3792         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3793         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3794         t[row]   = b[row] - tmp1;
3795         t[row+1] = b[row+1] - tmp2;
3796         t[row+2] = b[row+2] - tmp3; row += 3;
3797         cnt     += 9;
3798         break;
3799       case 4:
3800         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3801         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3802         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3803         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3804         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3805         t[row]   = b[row] - tmp1;
3806         t[row+1] = b[row+1] - tmp2;
3807         t[row+2] = b[row+2] - tmp3;
3808         t[row+3] = b[row+3] - tmp4; row += 4;
3809         cnt     += 16;
3810         break;
3811       case 5:
3812         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3813         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3814         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3815         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3816         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3817         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3818         t[row]   = b[row] - tmp1;
3819         t[row+1] = b[row+1] - tmp2;
3820         t[row+2] = b[row+2] - tmp3;
3821         t[row+3] = b[row+3] - tmp4;
3822         t[row+4] = b[row+4] - tmp5;row += 5;
3823         cnt     += 25;
3824         break;
3825       default:
3826         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3827       }
3828     }
3829     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3830 
3831 
3832 
3833     /*
3834           Apply (L + D)^-1 where D is the block diagonal
3835     */
3836     for (i=0, row=0; i<m; i++) {
3837       sz  = diag[row] - ii[row];
3838       v1  = a->a + ii[row];
3839       idx = a->j + ii[row];
3840       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3841       switch (sizes[i]) {
3842       case 1:
3843 
3844         sum1 = t[row];
3845         for (n = 0; n<sz-1; n+=2) {
3846           i1    = idx[0];
3847           i2    = idx[1];
3848           idx  += 2;
3849           tmp0  = t[i1];
3850           tmp1  = t[i2];
3851           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3852         }
3853 
3854         if (n == sz-1) {
3855           tmp0  = t[*idx];
3856           sum1 -= *v1 * tmp0;
3857         }
3858         x[row] += t[row] = sum1*(*ibdiag++); row++;
3859         break;
3860       case 2:
3861         v2   = a->a + ii[row+1];
3862         sum1 = t[row];
3863         sum2 = t[row+1];
3864         for (n = 0; n<sz-1; n+=2) {
3865           i1    = idx[0];
3866           i2    = idx[1];
3867           idx  += 2;
3868           tmp0  = t[i1];
3869           tmp1  = t[i2];
3870           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3871           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3872         }
3873 
3874         if (n == sz-1) {
3875           tmp0  = t[*idx];
3876           sum1 -= v1[0] * tmp0;
3877           sum2 -= v2[0] * tmp0;
3878         }
3879         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3880         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3881         ibdiag   += 4; row += 2;
3882         break;
3883       case 3:
3884         v2   = a->a + ii[row+1];
3885         v3   = a->a + ii[row+2];
3886         sum1 = t[row];
3887         sum2 = t[row+1];
3888         sum3 = t[row+2];
3889         for (n = 0; n<sz-1; n+=2) {
3890           i1    = idx[0];
3891           i2    = idx[1];
3892           idx  += 2;
3893           tmp0  = t[i1];
3894           tmp1  = t[i2];
3895           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3896           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3897           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3898         }
3899 
3900         if (n == sz-1) {
3901           tmp0  = t[*idx];
3902           sum1 -= v1[0] * tmp0;
3903           sum2 -= v2[0] * tmp0;
3904           sum3 -= v3[0] * tmp0;
3905         }
3906         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3907         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3908         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3909         ibdiag   += 9; row += 3;
3910         break;
3911       case 4:
3912         v2   = a->a + ii[row+1];
3913         v3   = a->a + ii[row+2];
3914         v4   = a->a + ii[row+3];
3915         sum1 = t[row];
3916         sum2 = t[row+1];
3917         sum3 = t[row+2];
3918         sum4 = t[row+3];
3919         for (n = 0; n<sz-1; n+=2) {
3920           i1    = idx[0];
3921           i2    = idx[1];
3922           idx  += 2;
3923           tmp0  = t[i1];
3924           tmp1  = t[i2];
3925           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3926           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3927           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3928           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3929         }
3930 
3931         if (n == sz-1) {
3932           tmp0  = t[*idx];
3933           sum1 -= v1[0] * tmp0;
3934           sum2 -= v2[0] * tmp0;
3935           sum3 -= v3[0] * tmp0;
3936           sum4 -= v4[0] * tmp0;
3937         }
3938         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3939         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3940         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3941         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3942         ibdiag   += 16; row += 4;
3943         break;
3944       case 5:
3945         v2   = a->a + ii[row+1];
3946         v3   = a->a + ii[row+2];
3947         v4   = a->a + ii[row+3];
3948         v5   = a->a + ii[row+4];
3949         sum1 = t[row];
3950         sum2 = t[row+1];
3951         sum3 = t[row+2];
3952         sum4 = t[row+3];
3953         sum5 = t[row+4];
3954         for (n = 0; n<sz-1; n+=2) {
3955           i1    = idx[0];
3956           i2    = idx[1];
3957           idx  += 2;
3958           tmp0  = t[i1];
3959           tmp1  = t[i2];
3960           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3961           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3962           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3963           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3964           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3965         }
3966 
3967         if (n == sz-1) {
3968           tmp0  = t[*idx];
3969           sum1 -= v1[0] * tmp0;
3970           sum2 -= v2[0] * tmp0;
3971           sum3 -= v3[0] * tmp0;
3972           sum4 -= v4[0] * tmp0;
3973           sum5 -= v5[0] * tmp0;
3974         }
3975         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3976         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3977         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3978         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3979         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3980         ibdiag   += 25; row += 5;
3981         break;
3982       default:
3983         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3984       }
3985     }
3986     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3987   }
3988   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3989   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3990   PetscFunctionReturn(0);
3991 }
3992 
3993 #undef __FUNCT__
3994 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3995 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3996 {
3997   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3998   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3999   const MatScalar   *bdiag = a->inode.bdiag;
4000   const PetscScalar *b;
4001   PetscErrorCode    ierr;
4002   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4003   const PetscInt    *sizes = a->inode.size;
4004 
4005   PetscFunctionBegin;
4006   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4007   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4008   cnt  = 0;
4009   for (i=0, row=0; i<m; i++) {
4010     switch (sizes[i]) {
4011     case 1:
4012       x[row] = b[row]*bdiag[cnt++];row++;
4013       break;
4014     case 2:
4015       x1       = b[row]; x2 = b[row+1];
4016       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4017       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4018       x[row++] = tmp1;
4019       x[row++] = tmp2;
4020       cnt     += 4;
4021       break;
4022     case 3:
4023       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4024       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4025       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4026       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4027       x[row++] = tmp1;
4028       x[row++] = tmp2;
4029       x[row++] = tmp3;
4030       cnt     += 9;
4031       break;
4032     case 4:
4033       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4034       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4035       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4036       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4037       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4038       x[row++] = tmp1;
4039       x[row++] = tmp2;
4040       x[row++] = tmp3;
4041       x[row++] = tmp4;
4042       cnt     += 16;
4043       break;
4044     case 5:
4045       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4046       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4047       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4048       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4049       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4050       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4051       x[row++] = tmp1;
4052       x[row++] = tmp2;
4053       x[row++] = tmp3;
4054       x[row++] = tmp4;
4055       x[row++] = tmp5;
4056       cnt     += 25;
4057       break;
4058     default:
4059       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4060     }
4061   }
4062   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4063   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4064   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4065   PetscFunctionReturn(0);
4066 }
4067 
4068 /*
4069     samestructure indicates that the matrix has not changed its nonzero structure so we
4070     do not need to recompute the inodes
4071 */
4072 #undef __FUNCT__
4073 #define __FUNCT__ "Mat_CheckInode"
4074 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure)
4075 {
4076   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4077   PetscErrorCode ierr;
4078   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
4079   PetscBool      flag;
4080 
4081   PetscFunctionBegin;
4082   if (!a->inode.use) PetscFunctionReturn(0);
4083   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
4084 
4085 
4086   m = A->rmap->n;
4087   if (a->inode.size) ns = a->inode.size;
4088   else {
4089     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
4090   }
4091 
4092   i          = 0;
4093   node_count = 0;
4094   idx        = a->j;
4095   ii         = a->i;
4096   while (i < m) {                /* For each row */
4097     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4098     /* Limits the number of elements in a node to 'a->inode.limit' */
4099     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4100       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4101       if (nzy != nzx) break;
4102       idy += nzx;              /* Same nonzero pattern */
4103       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4104       if (!flag) break;
4105     }
4106     ns[node_count++] = blk_size;
4107     idx             += blk_size*nzx;
4108     i                = j;
4109   }
4110   /* If not enough inodes found,, do not use inode version of the routines */
4111   if (!m || node_count > .8*m) {
4112     ierr = PetscFree(ns);CHKERRQ(ierr);
4113 
4114     a->inode.node_count       = 0;
4115     a->inode.size             = NULL;
4116     a->inode.use              = PETSC_FALSE;
4117     A->ops->mult              = MatMult_SeqAIJ;
4118     A->ops->sor               = MatSOR_SeqAIJ;
4119     A->ops->multadd           = MatMultAdd_SeqAIJ;
4120     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4121     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4122     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4123     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4124     A->ops->coloringpatch     = 0;
4125     A->ops->multdiagonalblock = 0;
4126 
4127     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4128   } else {
4129     if (!A->factortype) {
4130       A->ops->mult              = MatMult_SeqAIJ_Inode;
4131       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4132       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4133       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4134       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4135       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4136       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4137       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4138       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4139     } else {
4140       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4141     }
4142     a->inode.node_count = node_count;
4143     a->inode.size       = ns;
4144     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4145   }
4146   a->inode.checked = PETSC_TRUE;
4147   PetscFunctionReturn(0);
4148 }
4149 
4150 #undef __FUNCT__
4151 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4152 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4153 {
4154   Mat            B =*C;
4155   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4156   PetscErrorCode ierr;
4157   PetscInt       m=A->rmap->n;
4158 
4159   PetscFunctionBegin;
4160   c->inode.use       = a->inode.use;
4161   c->inode.limit     = a->inode.limit;
4162   c->inode.max_limit = a->inode.max_limit;
4163   if (a->inode.size) {
4164     ierr                = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr);
4165     c->inode.node_count = a->inode.node_count;
4166     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4167     /* note the table of functions below should match that in Mat_CheckInode() */
4168     if (!B->factortype) {
4169       B->ops->mult              = MatMult_SeqAIJ_Inode;
4170       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4171       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4172       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4173       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4174       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4175       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4176       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4177       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4178     } else {
4179       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4180     }
4181   } else {
4182     c->inode.size       = 0;
4183     c->inode.node_count = 0;
4184   }
4185   c->inode.ibdiagvalid = PETSC_FALSE;
4186   c->inode.ibdiag      = 0;
4187   c->inode.bdiag       = 0;
4188   PetscFunctionReturn(0);
4189 }
4190 
4191 #undef __FUNCT__
4192 #define __FUNCT__ "MatGetRow_FactoredLU"
4193 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
4194 {
4195   PetscInt k, *vi;
4196 
4197   PetscFunctionBegin;
4198   vi = aj + ai[row];
4199   for (k=0; k<nzl; k++) cols[k] = vi[k];
4200   vi        = aj + adiag[row];
4201   cols[nzl] = vi[0];
4202   vi        = aj + adiag[row+1]+1;
4203   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4204   PetscFunctionReturn(0);
4205 }
4206 /*
4207    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4208    Modified from Mat_CheckInode().
4209 
4210    Input Parameters:
4211 +  Mat A - ILU or LU matrix factor
4212 -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
4213     do not need to recompute the inodes
4214 */
4215 #undef __FUNCT__
4216 #define __FUNCT__ "Mat_CheckInode_FactorLU"
4217 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure)
4218 {
4219   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4220   PetscErrorCode ierr;
4221   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4222   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
4223   PetscBool      flag;
4224 
4225   PetscFunctionBegin;
4226   if (!a->inode.use)                     PetscFunctionReturn(0);
4227   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
4228 
4229   m = A->rmap->n;
4230   if (a->inode.size) ns = a->inode.size;
4231   else {
4232     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
4233   }
4234 
4235   i          = 0;
4236   node_count = 0;
4237   while (i < m) {                /* For each row */
4238     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4239     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4240     nzx  = nzl1 + nzu1 + 1;
4241     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
4242     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4243 
4244     /* Limits the number of elements in a node to 'a->inode.limit' */
4245     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4246       nzl2 = ai[j+1] - ai[j];
4247       nzu2 = adiag[j] - adiag[j+1] - 1;
4248       nzy  = nzl2 + nzu2 + 1;
4249       if (nzy != nzx) break;
4250       ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
4251       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4252       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4253       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
4254       ierr = PetscFree(cols2);CHKERRQ(ierr);
4255     }
4256     ns[node_count++] = blk_size;
4257     ierr             = PetscFree(cols1);CHKERRQ(ierr);
4258     i                = j;
4259   }
4260   /* If not enough inodes found,, do not use inode version of the routines */
4261   if (!m || node_count > .8*m) {
4262     ierr = PetscFree(ns);CHKERRQ(ierr);
4263 
4264     a->inode.node_count = 0;
4265     a->inode.size       = NULL;
4266     a->inode.use        = PETSC_FALSE;
4267 
4268     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4269   } else {
4270     A->ops->mult              = 0;
4271     A->ops->sor               = 0;
4272     A->ops->multadd           = 0;
4273     A->ops->getrowij          = 0;
4274     A->ops->restorerowij      = 0;
4275     A->ops->getcolumnij       = 0;
4276     A->ops->restorecolumnij   = 0;
4277     A->ops->coloringpatch     = 0;
4278     A->ops->multdiagonalblock = 0;
4279     A->ops->solve             = MatSolve_SeqAIJ_Inode;
4280     a->inode.node_count       = node_count;
4281     a->inode.size             = ns;
4282 
4283     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4284   }
4285   a->inode.checked = PETSC_TRUE;
4286   PetscFunctionReturn(0);
4287 }
4288 
4289 #undef __FUNCT__
4290 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4291 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4292 {
4293   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4294 
4295   PetscFunctionBegin;
4296   a->inode.ibdiagvalid = PETSC_FALSE;
4297   PetscFunctionReturn(0);
4298 }
4299 
4300 /*
4301      This is really ugly. if inodes are used this replaces the
4302   permutations with ones that correspond to rows/cols of the matrix
4303   rather then inode blocks
4304 */
4305 #undef __FUNCT__
4306 #define __FUNCT__ "MatInodeAdjustForInodes"
4307 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4308 {
4309   PetscErrorCode ierr;
4310 
4311   PetscFunctionBegin;
4312   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4313   PetscFunctionReturn(0);
4314 }
4315 
4316 #undef __FUNCT__
4317 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4318 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4319 {
4320   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4321   PetscErrorCode ierr;
4322   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4323   const PetscInt *ridx,*cidx;
4324   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4325   PetscInt       nslim_col,*ns_col;
4326   IS             ris = *rperm,cis = *cperm;
4327 
4328   PetscFunctionBegin;
4329   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4330   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4331 
4332   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4333   ierr = PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
4334   ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
4335 
4336   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4337   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4338 
4339   /* Form the inode structure for the rows of permuted matric using inv perm*/
4340   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4341 
4342   /* Construct the permutations for rows*/
4343   for (i=0,row = 0; i<nslim_row; ++i) {
4344     indx      = ridx[i];
4345     start_val = tns[indx];
4346     end_val   = tns[indx + 1];
4347     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4348   }
4349 
4350   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4351   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4352 
4353   /* Construct permutations for columns */
4354   for (i=0,col=0; i<nslim_col; ++i) {
4355     indx      = cidx[i];
4356     start_val = tns[indx];
4357     end_val   = tns[indx + 1];
4358     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4359   }
4360 
4361   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4362   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4363   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4364   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4365 
4366   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4367   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4368 
4369   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4370   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4371   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4372   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4373   ierr = PetscFree(tns);CHKERRQ(ierr);
4374   PetscFunctionReturn(0);
4375 }
4376 
4377 #undef __FUNCT__
4378 #define __FUNCT__ "MatInodeGetInodeSizes"
4379 /*@C
4380    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4381 
4382    Not Collective
4383 
4384    Input Parameter:
4385 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4386 
4387    Output Parameter:
4388 +  node_count - no of inodes present in the matrix.
4389 .  sizes      - an array of size node_count,with sizes of each inode.
4390 -  limit      - the max size used to generate the inodes.
4391 
4392    Level: advanced
4393 
4394    Notes: This routine returns some internal storage information
4395    of the matrix, it is intended to be used by advanced users.
4396    It should be called after the matrix is assembled.
4397    The contents of the sizes[] array should not be changed.
4398    NULL may be passed for information not requested.
4399 
4400 .keywords: matrix, seqaij, get, inode
4401 
4402 .seealso: MatGetInfo()
4403 @*/
4404 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4405 {
4406   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4407 
4408   PetscFunctionBegin;
4409   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4410   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4411   if (f) {
4412     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4413   }
4414   PetscFunctionReturn(0);
4415 }
4416 
4417 #undef __FUNCT__
4418 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4419 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4420 {
4421   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4422 
4423   PetscFunctionBegin;
4424   if (node_count) *node_count = a->inode.node_count;
4425   if (sizes)      *sizes      = a->inode.size;
4426   if (limit)      *limit      = a->inode.limit;
4427   PetscFunctionReturn(0);
4428 }
4429