xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 0e9bae810fdaeb60e2713eaa8ddb89f42e079fd1)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
11 {
12   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
24     for(; count+1 < n; count++,i++);
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for(; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n)  {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
67 
68   PetscFunctionBegin;
69   nslim_row = a->inode.node_count;
70   m         = A->rmap->n;
71   n         = A->cmap->n;
72   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
73 
74   /* Use the row_inode as column_inode */
75   nslim_col = nslim_row;
76   ns_col    = ns_row;
77 
78   /* allocate space for reformated inode structure */
79   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
80   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
81   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
82 
83   for (i1=0,col=0; i1<nslim_col; ++i1){
84     nsz = ns_col[i1];
85     for (i2=0; i2<nsz; ++i2,++col)
86       tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     i2   = 0;
101     col  = *j++ + ishift;
102     i2   = tvc[col];
103     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
104       ia[i1+1]++;
105       ia[i2+1]++;
106       i2++;                     /* Start col of next node */
107       while((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
108       i2 = tvc[col];
109     }
110     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
111   }
112 
113   /* shift ia[i] to point to next row */
114   for (i1=1; i1<nslim_row+1; i1++) {
115     row        = ia[i1-1];
116     ia[i1]    += row;
117     work[i1-1] = row - oshift;
118   }
119 
120   /* allocate space for column pointers */
121   nz   = ia[nslim_row] + (!ishift);
122   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
123   *jja = ja;
124 
125  /* loop over lower triangular part putting into ja */
126   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
127     j    = aj + ai[row] + ishift;
128     jmax = aj + ai[row+1] + ishift;
129     i2   = 0;                     /* Col inode index */
130     col  = *j++ + ishift;
131     i2   = tvc[col];
132     while (i2<i1 && j<jmax) {
133       ja[work[i2]++] = i1 + oshift;
134       ja[work[i1]++] = i2 + oshift;
135       ++i2;
136       while((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
137       i2 = tvc[col];
138     }
139     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
140 
141   }
142   ierr = PetscFree(work);CHKERRQ(ierr);
143   ierr = PetscFree(tns);CHKERRQ(ierr);
144   ierr = PetscFree(tvc);CHKERRQ(ierr);
145   PetscFunctionReturn(0);
146 }
147 
148 /*
149       This builds nonsymmetric version of nonzero structure,
150 */
151 #undef __FUNCT__
152 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
153 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
154 {
155   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
156   PetscErrorCode ierr;
157   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
158   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
159 
160   PetscFunctionBegin;
161   nslim_row = a->inode.node_count;
162   n         = A->cmap->n;
163 
164   /* Create The column_inode for this matrix */
165   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
166 
167   /* allocate space for reformated column_inode structure */
168   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
169   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
170   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
171 
172   for (i1=0,col=0; i1<nslim_col; ++i1){
173     nsz = ns_col[i1];
174     for (i2=0; i2<nsz; ++i2,++col)
175       tvc[col] = i1;
176   }
177   /* allocate space for row pointers */
178   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
179   *iia = ia;
180   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
181   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
182 
183   /* determine the number of columns in each row */
184   ia[0] = oshift;
185   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
186     j   = aj + ai[row] + ishift;
187     col = *j++ + ishift;
188     i2  = tvc[col];
189     nz  = ai[row+1] - ai[row];
190     while (nz-- > 0) {           /* off-diagonal elemets */
191       ia[i1+1]++;
192       i2++;                     /* Start col of next node */
193       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
194       if (nz > 0) i2 = tvc[col];
195     }
196   }
197 
198   /* shift ia[i] to point to next row */
199   for (i1=1; i1<nslim_row+1; i1++) {
200     row        = ia[i1-1];
201     ia[i1]    += row;
202     work[i1-1] = row - oshift;
203   }
204 
205   /* allocate space for column pointers */
206   nz   = ia[nslim_row] + (!ishift);
207   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
208   *jja = ja;
209 
210  /* loop over matrix putting into ja */
211   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
212     j   = aj + ai[row] + ishift;
213     i2  = 0;                     /* Col inode index */
214     col = *j++ + ishift;
215     i2  = tvc[col];
216     nz  = ai[row+1] - ai[row];
217     while (nz-- > 0) {
218       ja[work[i1]++] = i2 + oshift;
219       ++i2;
220       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
221       if (nz > 0) i2 = tvc[col];
222     }
223   }
224   ierr = PetscFree(ns_col);CHKERRQ(ierr);
225   ierr = PetscFree(work);CHKERRQ(ierr);
226   ierr = PetscFree(tns);CHKERRQ(ierr);
227   ierr = PetscFree(tvc);CHKERRQ(ierr);
228   PetscFunctionReturn(0);
229 }
230 
231 #undef __FUNCT__
232 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
233 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
234 {
235   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
236   PetscErrorCode ierr;
237 
238   PetscFunctionBegin;
239   *n     = a->inode.node_count;
240   if (!ia) PetscFunctionReturn(0);
241   if (!blockcompressed) {
242     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
243   } else if (symmetric) {
244     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
245   } else {
246     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
247   }
248   PetscFunctionReturn(0);
249 }
250 
251 #undef __FUNCT__
252 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
253 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
254 {
255   PetscErrorCode ierr;
256 
257   PetscFunctionBegin;
258   if (!ia) PetscFunctionReturn(0);
259 
260   if (!blockcompressed) {
261     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
262   } else {
263     ierr = PetscFree(*ia);CHKERRQ(ierr);
264     ierr = PetscFree(*ja);CHKERRQ(ierr);
265   }
266 
267   PetscFunctionReturn(0);
268 }
269 
270 /* ----------------------------------------------------------- */
271 
272 #undef __FUNCT__
273 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
274 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
275 {
276   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
277   PetscErrorCode ierr;
278   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
279   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
280 
281   PetscFunctionBegin;
282   nslim_row = a->inode.node_count;
283   n         = A->cmap->n;
284 
285   /* Create The column_inode for this matrix */
286   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
287 
288   /* allocate space for reformated column_inode structure */
289   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
290   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
291   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
292 
293   for (i1=0,col=0; i1<nslim_col; ++i1){
294     nsz = ns_col[i1];
295     for (i2=0; i2<nsz; ++i2,++col)
296       tvc[col] = i1;
297   }
298   /* allocate space for column pointers */
299   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
300   *iia = ia;
301   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
302   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
303 
304   /* determine the number of columns in each row */
305   ia[0] = oshift;
306   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
307     j   = aj + ai[row] + ishift;
308     col = *j++ + ishift;
309     i2  = tvc[col];
310     nz  = ai[row+1] - ai[row];
311     while (nz-- > 0) {           /* off-diagonal elemets */
312       /* ia[i1+1]++; */
313       ia[i2+1]++;
314       i2++;
315       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
316       if (nz > 0) i2 = tvc[col];
317     }
318   }
319 
320   /* shift ia[i] to point to next col */
321   for (i1=1; i1<nslim_col+1; i1++) {
322     col        = ia[i1-1];
323     ia[i1]    += col;
324     work[i1-1] = col - oshift;
325   }
326 
327   /* allocate space for column pointers */
328   nz   = ia[nslim_col] + (!ishift);
329   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
330   *jja = ja;
331 
332  /* loop over matrix putting into ja */
333   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
334     j   = aj + ai[row] + ishift;
335     i2  = 0;                     /* Col inode index */
336     col = *j++ + ishift;
337     i2  = tvc[col];
338     nz  = ai[row+1] - ai[row];
339     while (nz-- > 0) {
340       /* ja[work[i1]++] = i2 + oshift; */
341       ja[work[i2]++] = i1 + oshift;
342       i2++;
343       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
344       if (nz > 0) i2 = tvc[col];
345     }
346   }
347   ierr = PetscFree(ns_col);CHKERRQ(ierr);
348   ierr = PetscFree(work);CHKERRQ(ierr);
349   ierr = PetscFree(tns);CHKERRQ(ierr);
350   ierr = PetscFree(tvc);CHKERRQ(ierr);
351   PetscFunctionReturn(0);
352 }
353 
354 #undef __FUNCT__
355 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
356 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
357 {
358   PetscErrorCode ierr;
359 
360   PetscFunctionBegin;
361   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
362   if (!ia) PetscFunctionReturn(0);
363 
364   if (!blockcompressed) {
365     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
366   } else if (symmetric) {
367     /* Since the indices are symmetric it does'nt matter */
368     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
369   } else {
370     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
371   }
372   PetscFunctionReturn(0);
373 }
374 
375 #undef __FUNCT__
376 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
377 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
378 {
379   PetscErrorCode ierr;
380 
381   PetscFunctionBegin;
382   if (!ia) PetscFunctionReturn(0);
383   if (!blockcompressed) {
384     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
385   } else {
386     ierr = PetscFree(*ia);CHKERRQ(ierr);
387     ierr = PetscFree(*ja);CHKERRQ(ierr);
388   }
389   PetscFunctionReturn(0);
390 }
391 
392 /* ----------------------------------------------------------- */
393 
394 #undef __FUNCT__
395 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
396 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
397 {
398   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
399   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
400   PetscScalar       *y;
401   const PetscScalar *x;
402   const MatScalar   *v1,*v2,*v3,*v4,*v5;
403   PetscErrorCode    ierr;
404   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
405 
406 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
407 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
408 #endif
409 
410   PetscFunctionBegin;
411   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
412   node_max = a->inode.node_count;
413   ns       = a->inode.size;     /* Node Size array */
414   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
415   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
416   idx  = a->j;
417   v1   = a->a;
418   ii   = a->i;
419 
420   for (i = 0,row = 0; i< node_max; ++i){
421     nsz  = ns[i];
422     n    = ii[1] - ii[0];
423     nonzerorow += (n>0)*nsz;
424     ii  += nsz;
425     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
426     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
427     sz   = n;                   /* No of non zeros in this row */
428                                 /* Switch on the size of Node */
429     switch (nsz){               /* Each loop in 'case' is unrolled */
430     case 1 :
431       sum1  = 0.;
432 
433       for(n = 0; n< sz-1; n+=2) {
434         i1   = idx[0];          /* The instructions are ordered to */
435         i2   = idx[1];          /* make the compiler's job easy */
436         idx += 2;
437         tmp0 = x[i1];
438         tmp1 = x[i2];
439         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
440        }
441 
442       if (n == sz-1){          /* Take care of the last nonzero  */
443         tmp0  = x[*idx++];
444         sum1 += *v1++ * tmp0;
445       }
446       y[row++]=sum1;
447       break;
448     case 2:
449       sum1  = 0.;
450       sum2  = 0.;
451       v2    = v1 + n;
452 
453       for (n = 0; n< sz-1; n+=2) {
454         i1   = idx[0];
455         i2   = idx[1];
456         idx += 2;
457         tmp0 = x[i1];
458         tmp1 = x[i2];
459         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
460         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
461       }
462       if (n == sz-1){
463         tmp0  = x[*idx++];
464         sum1 += *v1++ * tmp0;
465         sum2 += *v2++ * tmp0;
466       }
467       y[row++]=sum1;
468       y[row++]=sum2;
469       v1      =v2;              /* Since the next block to be processed starts there*/
470       idx    +=sz;
471       break;
472     case 3:
473       sum1  = 0.;
474       sum2  = 0.;
475       sum3  = 0.;
476       v2    = v1 + n;
477       v3    = v2 + n;
478 
479       for (n = 0; n< sz-1; n+=2) {
480         i1   = idx[0];
481         i2   = idx[1];
482         idx += 2;
483         tmp0 = x[i1];
484         tmp1 = x[i2];
485         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
486         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
487         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
488       }
489       if (n == sz-1){
490         tmp0  = x[*idx++];
491         sum1 += *v1++ * tmp0;
492         sum2 += *v2++ * tmp0;
493         sum3 += *v3++ * tmp0;
494       }
495       y[row++]=sum1;
496       y[row++]=sum2;
497       y[row++]=sum3;
498       v1       =v3;             /* Since the next block to be processed starts there*/
499       idx     +=2*sz;
500       break;
501     case 4:
502       sum1  = 0.;
503       sum2  = 0.;
504       sum3  = 0.;
505       sum4  = 0.;
506       v2    = v1 + n;
507       v3    = v2 + n;
508       v4    = v3 + n;
509 
510       for (n = 0; n< sz-1; n+=2) {
511         i1   = idx[0];
512         i2   = idx[1];
513         idx += 2;
514         tmp0 = x[i1];
515         tmp1 = x[i2];
516         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
517         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
518         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
519         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
520       }
521       if (n == sz-1){
522         tmp0  = x[*idx++];
523         sum1 += *v1++ * tmp0;
524         sum2 += *v2++ * tmp0;
525         sum3 += *v3++ * tmp0;
526         sum4 += *v4++ * tmp0;
527       }
528       y[row++]=sum1;
529       y[row++]=sum2;
530       y[row++]=sum3;
531       y[row++]=sum4;
532       v1      =v4;              /* Since the next block to be processed starts there*/
533       idx    +=3*sz;
534       break;
535     case 5:
536       sum1  = 0.;
537       sum2  = 0.;
538       sum3  = 0.;
539       sum4  = 0.;
540       sum5  = 0.;
541       v2    = v1 + n;
542       v3    = v2 + n;
543       v4    = v3 + n;
544       v5    = v4 + n;
545 
546       for (n = 0; n<sz-1; n+=2) {
547         i1   = idx[0];
548         i2   = idx[1];
549         idx += 2;
550         tmp0 = x[i1];
551         tmp1 = x[i2];
552         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
553         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
554         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
555         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
556         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
557       }
558       if (n == sz-1){
559         tmp0  = x[*idx++];
560         sum1 += *v1++ * tmp0;
561         sum2 += *v2++ * tmp0;
562         sum3 += *v3++ * tmp0;
563         sum4 += *v4++ * tmp0;
564         sum5 += *v5++ * tmp0;
565       }
566       y[row++]=sum1;
567       y[row++]=sum2;
568       y[row++]=sum3;
569       y[row++]=sum4;
570       y[row++]=sum5;
571       v1      =v5;       /* Since the next block to be processed starts there */
572       idx    +=4*sz;
573       break;
574     default :
575       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
576     }
577   }
578   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
579   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
580   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
581   PetscFunctionReturn(0);
582 }
583 /* ----------------------------------------------------------- */
584 /* Almost same code as the MatMult_SeqAIJ_Inode() */
585 #undef __FUNCT__
586 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
587 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
588 {
589   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
590   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
591   MatScalar      *v1,*v2,*v3,*v4,*v5;
592   PetscScalar    *x,*y,*z,*zt;
593   PetscErrorCode ierr;
594   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
595 
596   PetscFunctionBegin;
597   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
598   node_max = a->inode.node_count;
599   ns       = a->inode.size;     /* Node Size array */
600   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
601   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
602   if (zz != yy) {
603     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
604   } else {
605     z = y;
606   }
607   zt = z;
608 
609   idx  = a->j;
610   v1   = a->a;
611   ii   = a->i;
612 
613   for (i = 0,row = 0; i< node_max; ++i){
614     nsz  = ns[i];
615     n    = ii[1] - ii[0];
616     ii  += nsz;
617     sz   = n;                   /* No of non zeros in this row */
618                                 /* Switch on the size of Node */
619     switch (nsz){               /* Each loop in 'case' is unrolled */
620     case 1 :
621       sum1  = *zt++;
622 
623       for(n = 0; n< sz-1; n+=2) {
624         i1   = idx[0];          /* The instructions are ordered to */
625         i2   = idx[1];          /* make the compiler's job easy */
626         idx += 2;
627         tmp0 = x[i1];
628         tmp1 = x[i2];
629         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
630        }
631 
632       if(n   == sz-1){          /* Take care of the last nonzero  */
633         tmp0  = x[*idx++];
634         sum1 += *v1++ * tmp0;
635       }
636       y[row++]=sum1;
637       break;
638     case 2:
639       sum1  = *zt++;
640       sum2  = *zt++;
641       v2    = v1 + n;
642 
643       for(n = 0; n< sz-1; n+=2) {
644         i1   = idx[0];
645         i2   = idx[1];
646         idx += 2;
647         tmp0 = x[i1];
648         tmp1 = x[i2];
649         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
650         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
651       }
652       if(n   == sz-1){
653         tmp0  = x[*idx++];
654         sum1 += *v1++ * tmp0;
655         sum2 += *v2++ * tmp0;
656       }
657       y[row++]=sum1;
658       y[row++]=sum2;
659       v1      =v2;              /* Since the next block to be processed starts there*/
660       idx    +=sz;
661       break;
662     case 3:
663       sum1  = *zt++;
664       sum2  = *zt++;
665       sum3  = *zt++;
666       v2    = v1 + n;
667       v3    = v2 + n;
668 
669       for (n = 0; n< sz-1; n+=2) {
670         i1   = idx[0];
671         i2   = idx[1];
672         idx += 2;
673         tmp0 = x[i1];
674         tmp1 = x[i2];
675         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
676         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
677         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
678       }
679       if (n == sz-1){
680         tmp0  = x[*idx++];
681         sum1 += *v1++ * tmp0;
682         sum2 += *v2++ * tmp0;
683         sum3 += *v3++ * tmp0;
684       }
685       y[row++]=sum1;
686       y[row++]=sum2;
687       y[row++]=sum3;
688       v1       =v3;             /* Since the next block to be processed starts there*/
689       idx     +=2*sz;
690       break;
691     case 4:
692       sum1  = *zt++;
693       sum2  = *zt++;
694       sum3  = *zt++;
695       sum4  = *zt++;
696       v2    = v1 + n;
697       v3    = v2 + n;
698       v4    = v3 + n;
699 
700       for (n = 0; n< sz-1; n+=2) {
701         i1   = idx[0];
702         i2   = idx[1];
703         idx += 2;
704         tmp0 = x[i1];
705         tmp1 = x[i2];
706         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
707         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
708         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
709         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
710       }
711       if (n == sz-1){
712         tmp0  = x[*idx++];
713         sum1 += *v1++ * tmp0;
714         sum2 += *v2++ * tmp0;
715         sum3 += *v3++ * tmp0;
716         sum4 += *v4++ * tmp0;
717       }
718       y[row++]=sum1;
719       y[row++]=sum2;
720       y[row++]=sum3;
721       y[row++]=sum4;
722       v1      =v4;              /* Since the next block to be processed starts there*/
723       idx    +=3*sz;
724       break;
725     case 5:
726       sum1  = *zt++;
727       sum2  = *zt++;
728       sum3  = *zt++;
729       sum4  = *zt++;
730       sum5  = *zt++;
731       v2    = v1 + n;
732       v3    = v2 + n;
733       v4    = v3 + n;
734       v5    = v4 + n;
735 
736       for (n = 0; n<sz-1; n+=2) {
737         i1   = idx[0];
738         i2   = idx[1];
739         idx += 2;
740         tmp0 = x[i1];
741         tmp1 = x[i2];
742         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
743         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
744         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
745         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
746         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
747       }
748       if(n   == sz-1){
749         tmp0  = x[*idx++];
750         sum1 += *v1++ * tmp0;
751         sum2 += *v2++ * tmp0;
752         sum3 += *v3++ * tmp0;
753         sum4 += *v4++ * tmp0;
754         sum5 += *v5++ * tmp0;
755       }
756       y[row++]=sum1;
757       y[row++]=sum2;
758       y[row++]=sum3;
759       y[row++]=sum4;
760       y[row++]=sum5;
761       v1      =v5;       /* Since the next block to be processed starts there */
762       idx    +=4*sz;
763       break;
764     default :
765       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
766     }
767   }
768   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
769   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
770   if (zz != yy) {
771     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
772   }
773   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
774   PetscFunctionReturn(0);
775 }
776 
777 /* ----------------------------------------------------------- */
778 #undef __FUNCT__
779 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
780 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
781 {
782   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
783   IS                iscol = a->col,isrow = a->row;
784   PetscErrorCode    ierr;
785   const PetscInt    *r,*c,*rout,*cout;
786   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
787   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
788   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
789   PetscScalar       sum1,sum2,sum3,sum4,sum5;
790   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
791   const PetscScalar *b;
792 
793   PetscFunctionBegin;
794   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
795   node_max = a->inode.node_count;
796   ns       = a->inode.size;     /* Node Size array */
797 
798   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
799   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
800   tmp  = a->solve_work;
801 
802   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
803   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
804 
805   /* forward solve the lower triangular */
806   tmps = tmp ;
807   aa   = a_a ;
808   aj   = a_j ;
809   ad   = a->diag;
810 
811   for (i = 0,row = 0; i< node_max; ++i){
812     nsz = ns[i];
813     aii = ai[row];
814     v1  = aa + aii;
815     vi  = aj + aii;
816     nz  = ad[row]- aii;
817     if (i < node_max-1) {
818       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
819       * but our indexing to determine it's size could. */
820       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
821       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
822       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
823       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
824     }
825 
826     switch (nsz){               /* Each loop in 'case' is unrolled */
827     case 1 :
828       sum1 = b[*r++];
829       for(j=0; j<nz-1; j+=2){
830         i0   = vi[0];
831         i1   = vi[1];
832         vi  +=2;
833         tmp0 = tmps[i0];
834         tmp1 = tmps[i1];
835         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
836       }
837       if(j == nz-1){
838         tmp0 = tmps[*vi++];
839         sum1 -= *v1++ *tmp0;
840       }
841       tmp[row ++]=sum1;
842       break;
843     case 2:
844       sum1 = b[*r++];
845       sum2 = b[*r++];
846       v2   = aa + ai[row+1];
847 
848       for(j=0; j<nz-1; j+=2){
849         i0   = vi[0];
850         i1   = vi[1];
851         vi  +=2;
852         tmp0 = tmps[i0];
853         tmp1 = tmps[i1];
854         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
855         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
856       }
857       if(j == nz-1){
858         tmp0 = tmps[*vi++];
859         sum1 -= *v1++ *tmp0;
860         sum2 -= *v2++ *tmp0;
861       }
862       sum2 -= *v2++ * sum1;
863       tmp[row ++]=sum1;
864       tmp[row ++]=sum2;
865       break;
866     case 3:
867       sum1 = b[*r++];
868       sum2 = b[*r++];
869       sum3 = b[*r++];
870       v2   = aa + ai[row+1];
871       v3   = aa + ai[row+2];
872 
873       for (j=0; j<nz-1; j+=2){
874         i0   = vi[0];
875         i1   = vi[1];
876         vi  +=2;
877         tmp0 = tmps[i0];
878         tmp1 = tmps[i1];
879         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
880         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
881         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
882       }
883       if (j == nz-1){
884         tmp0 = tmps[*vi++];
885         sum1 -= *v1++ *tmp0;
886         sum2 -= *v2++ *tmp0;
887         sum3 -= *v3++ *tmp0;
888       }
889       sum2 -= *v2++ * sum1;
890       sum3 -= *v3++ * sum1;
891       sum3 -= *v3++ * sum2;
892       tmp[row ++]=sum1;
893       tmp[row ++]=sum2;
894       tmp[row ++]=sum3;
895       break;
896 
897     case 4:
898       sum1 = b[*r++];
899       sum2 = b[*r++];
900       sum3 = b[*r++];
901       sum4 = b[*r++];
902       v2   = aa + ai[row+1];
903       v3   = aa + ai[row+2];
904       v4   = aa + ai[row+3];
905 
906       for (j=0; j<nz-1; j+=2){
907         i0   = vi[0];
908         i1   = vi[1];
909         vi  +=2;
910         tmp0 = tmps[i0];
911         tmp1 = tmps[i1];
912         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916       }
917       if (j == nz-1){
918         tmp0 = tmps[*vi++];
919         sum1 -= *v1++ *tmp0;
920         sum2 -= *v2++ *tmp0;
921         sum3 -= *v3++ *tmp0;
922         sum4 -= *v4++ *tmp0;
923       }
924       sum2 -= *v2++ * sum1;
925       sum3 -= *v3++ * sum1;
926       sum4 -= *v4++ * sum1;
927       sum3 -= *v3++ * sum2;
928       sum4 -= *v4++ * sum2;
929       sum4 -= *v4++ * sum3;
930 
931       tmp[row ++]=sum1;
932       tmp[row ++]=sum2;
933       tmp[row ++]=sum3;
934       tmp[row ++]=sum4;
935       break;
936     case 5:
937       sum1 = b[*r++];
938       sum2 = b[*r++];
939       sum3 = b[*r++];
940       sum4 = b[*r++];
941       sum5 = b[*r++];
942       v2   = aa + ai[row+1];
943       v3   = aa + ai[row+2];
944       v4   = aa + ai[row+3];
945       v5   = aa + ai[row+4];
946 
947       for (j=0; j<nz-1; j+=2){
948         i0   = vi[0];
949         i1   = vi[1];
950         vi  +=2;
951         tmp0 = tmps[i0];
952         tmp1 = tmps[i1];
953         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958       }
959       if (j == nz-1){
960         tmp0 = tmps[*vi++];
961         sum1 -= *v1++ *tmp0;
962         sum2 -= *v2++ *tmp0;
963         sum3 -= *v3++ *tmp0;
964         sum4 -= *v4++ *tmp0;
965         sum5 -= *v5++ *tmp0;
966       }
967 
968       sum2 -= *v2++ * sum1;
969       sum3 -= *v3++ * sum1;
970       sum4 -= *v4++ * sum1;
971       sum5 -= *v5++ * sum1;
972       sum3 -= *v3++ * sum2;
973       sum4 -= *v4++ * sum2;
974       sum5 -= *v5++ * sum2;
975       sum4 -= *v4++ * sum3;
976       sum5 -= *v5++ * sum3;
977       sum5 -= *v5++ * sum4;
978 
979       tmp[row ++]=sum1;
980       tmp[row ++]=sum2;
981       tmp[row ++]=sum3;
982       tmp[row ++]=sum4;
983       tmp[row ++]=sum5;
984       break;
985     default:
986       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987     }
988   }
989   /* backward solve the upper triangular */
990   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
991     nsz = ns[i];
992     aii = ai[row+1] -1;
993     v1  = aa + aii;
994     vi  = aj + aii;
995     nz  = aii- ad[row];
996     switch (nsz){               /* Each loop in 'case' is unrolled */
997     case 1 :
998       sum1 = tmp[row];
999 
1000       for(j=nz ; j>1; j-=2){
1001         vi  -=2;
1002         i0   = vi[2];
1003         i1   = vi[1];
1004         tmp0 = tmps[i0];
1005         tmp1 = tmps[i1];
1006         v1   -= 2;
1007         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008       }
1009       if (j==1){
1010         tmp0  = tmps[*vi--];
1011         sum1 -= *v1-- * tmp0;
1012       }
1013       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014       break;
1015     case 2 :
1016       sum1 = tmp[row];
1017       sum2 = tmp[row -1];
1018       v2   = aa + ai[row]-1;
1019       for (j=nz ; j>1; j-=2){
1020         vi  -=2;
1021         i0   = vi[2];
1022         i1   = vi[1];
1023         tmp0 = tmps[i0];
1024         tmp1 = tmps[i1];
1025         v1   -= 2;
1026         v2   -= 2;
1027         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029       }
1030       if (j==1){
1031         tmp0  = tmps[*vi--];
1032         sum1 -= *v1-- * tmp0;
1033         sum2 -= *v2-- * tmp0;
1034       }
1035 
1036       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037       sum2   -= *v2-- * tmp0;
1038       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039       break;
1040     case 3 :
1041       sum1 = tmp[row];
1042       sum2 = tmp[row -1];
1043       sum3 = tmp[row -2];
1044       v2   = aa + ai[row]-1;
1045       v3   = aa + ai[row -1]-1;
1046       for (j=nz ; j>1; j-=2){
1047         vi  -=2;
1048         i0   = vi[2];
1049         i1   = vi[1];
1050         tmp0 = tmps[i0];
1051         tmp1 = tmps[i1];
1052         v1   -= 2;
1053         v2   -= 2;
1054         v3   -= 2;
1055         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058       }
1059       if (j==1){
1060         tmp0  = tmps[*vi--];
1061         sum1 -= *v1-- * tmp0;
1062         sum2 -= *v2-- * tmp0;
1063         sum3 -= *v3-- * tmp0;
1064       }
1065       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066       sum2   -= *v2-- * tmp0;
1067       sum3   -= *v3-- * tmp0;
1068       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069       sum3   -= *v3-- * tmp0;
1070       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1071 
1072       break;
1073     case 4 :
1074       sum1 = tmp[row];
1075       sum2 = tmp[row -1];
1076       sum3 = tmp[row -2];
1077       sum4 = tmp[row -3];
1078       v2   = aa + ai[row]-1;
1079       v3   = aa + ai[row -1]-1;
1080       v4   = aa + ai[row -2]-1;
1081 
1082       for (j=nz ; j>1; j-=2){
1083         vi  -=2;
1084         i0   = vi[2];
1085         i1   = vi[1];
1086         tmp0 = tmps[i0];
1087         tmp1 = tmps[i1];
1088         v1  -= 2;
1089         v2  -= 2;
1090         v3  -= 2;
1091         v4  -= 2;
1092         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096       }
1097       if (j==1){
1098         tmp0  = tmps[*vi--];
1099         sum1 -= *v1-- * tmp0;
1100         sum2 -= *v2-- * tmp0;
1101         sum3 -= *v3-- * tmp0;
1102         sum4 -= *v4-- * tmp0;
1103       }
1104 
1105       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106       sum2   -= *v2-- * tmp0;
1107       sum3   -= *v3-- * tmp0;
1108       sum4   -= *v4-- * tmp0;
1109       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110       sum3   -= *v3-- * tmp0;
1111       sum4   -= *v4-- * tmp0;
1112       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113       sum4   -= *v4-- * tmp0;
1114       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115       break;
1116     case 5 :
1117       sum1 = tmp[row];
1118       sum2 = tmp[row -1];
1119       sum3 = tmp[row -2];
1120       sum4 = tmp[row -3];
1121       sum5 = tmp[row -4];
1122       v2   = aa + ai[row]-1;
1123       v3   = aa + ai[row -1]-1;
1124       v4   = aa + ai[row -2]-1;
1125       v5   = aa + ai[row -3]-1;
1126       for (j=nz ; j>1; j-=2){
1127         vi  -= 2;
1128         i0   = vi[2];
1129         i1   = vi[1];
1130         tmp0 = tmps[i0];
1131         tmp1 = tmps[i1];
1132         v1   -= 2;
1133         v2   -= 2;
1134         v3   -= 2;
1135         v4   -= 2;
1136         v5   -= 2;
1137         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142       }
1143       if (j==1){
1144         tmp0  = tmps[*vi--];
1145         sum1 -= *v1-- * tmp0;
1146         sum2 -= *v2-- * tmp0;
1147         sum3 -= *v3-- * tmp0;
1148         sum4 -= *v4-- * tmp0;
1149         sum5 -= *v5-- * tmp0;
1150       }
1151 
1152       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153       sum2   -= *v2-- * tmp0;
1154       sum3   -= *v3-- * tmp0;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158       sum3   -= *v3-- * tmp0;
1159       sum4   -= *v4-- * tmp0;
1160       sum5   -= *v5-- * tmp0;
1161       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162       sum4   -= *v4-- * tmp0;
1163       sum5   -= *v5-- * tmp0;
1164       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165       sum5   -= *v5-- * tmp0;
1166       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167       break;
1168     default:
1169       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170     }
1171   }
1172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1174   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1176   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #undef __FUNCT__
1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183 {
1184   Mat              C=B;
1185   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1186   IS               isrow = b->row,isicol = b->icol;
1187   PetscErrorCode   ierr;
1188   const PetscInt   *r,*ic,*ics;
1189   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190   PetscInt         i,j,k,nz,nzL,row,*pj;
1191   const PetscInt   *ajtmp,*bjtmp;
1192   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194   FactorShiftCtx   sctx;
1195   const PetscInt   *ddiag;
1196   PetscReal        rs;
1197   MatScalar        d;
1198   PetscInt         inod,nodesz,node_max,col;
1199   const PetscInt   *ns;
1200   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1201 
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++)
1216 	rs += PetscAbsScalar(v[j]);
1217       if (rs>sctx.shift_top) sctx.shift_top = rs;
1218     }
1219     sctx.shift_top   *= 1.1;
1220     sctx.nshift_max   = 5;
1221     sctx.shift_lo     = 0.;
1222     sctx.shift_hi     = 1.;
1223   }
1224 
1225   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1226   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1227 
1228   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1229   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1230   rtmp2 = rtmp1 + n;
1231   rtmp3 = rtmp2 + n;
1232   rtmp4 = rtmp3 + n;
1233   ics   = ic;
1234 
1235   node_max = a->inode.node_count;
1236   ns       = a->inode.size;
1237   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1238 
1239   /* If max inode size > 4, split it into two inodes.*/
1240   /* also map the inode sizes according to the ordering */
1241   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1242   for (i=0,j=0; i<node_max; ++i,++j){
1243     if (ns[i] > 4) {
1244       tmp_vec1[j] = 4;
1245       ++j;
1246       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1247     } else {
1248       tmp_vec1[j] = ns[i];
1249     }
1250   }
1251   /* Use the correct node_max */
1252   node_max = j;
1253 
1254   /* Now reorder the inode info based on mat re-ordering info */
1255   /* First create a row -> inode_size_array_index map */
1256   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1257   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1258   for (i=0,row=0; i<node_max; i++) {
1259     nodesz = tmp_vec1[i];
1260     for (j=0; j<nodesz; j++,row++) {
1261       nsmap[row] = i;
1262     }
1263   }
1264   /* Using nsmap, create a reordered ns structure */
1265   for (i=0,j=0; i< node_max; i++) {
1266     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1267     tmp_vec2[i]  = nodesz;
1268     j           += nodesz;
1269   }
1270   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1271   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1272 
1273   /* Now use the correct ns */
1274   ns = tmp_vec2;
1275 
1276   do {
1277     sctx.newshift = PETSC_FALSE;
1278     /* Now loop over each block-row, and do the factorization */
1279     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1280       nodesz = ns[inod];
1281 
1282       switch (nodesz){
1283       case 1:
1284       /*----------*/
1285         /* zero rtmp1 */
1286         /* L part */
1287         nz    = bi[i+1] - bi[i];
1288         bjtmp = bj + bi[i];
1289         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1290 
1291         /* U part */
1292         nz = bdiag[i]-bdiag[i+1];
1293         bjtmp = bj + bdiag[i+1]+1;
1294         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1295 
1296         /* load in initial (unfactored row) */
1297         nz    = ai[r[i]+1] - ai[r[i]];
1298         ajtmp = aj + ai[r[i]];
1299         v     = aa + ai[r[i]];
1300         for (j=0; j<nz; j++) {
1301           rtmp1[ics[ajtmp[j]]] = v[j];
1302         }
1303         /* ZeropivotApply() */
1304         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1305 
1306         /* elimination */
1307         bjtmp = bj + bi[i];
1308         row   = *bjtmp++;
1309         nzL   = bi[i+1] - bi[i];
1310         for(k=0; k < nzL;k++) {
1311           pc = rtmp1 + row;
1312           if (*pc != 0.0) {
1313             pv   = b->a + bdiag[row];
1314             mul1 = *pc * (*pv);
1315             *pc  = mul1;
1316             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1317             pv = b->a + bdiag[row+1]+1;
1318             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1319             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1320             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1321           }
1322           row = *bjtmp++;
1323         }
1324 
1325         /* finished row so stick it into b->a */
1326         rs = 0.0;
1327         /* L part */
1328         pv = b->a + bi[i] ;
1329         pj = b->j + bi[i] ;
1330         nz = bi[i+1] - bi[i];
1331         for (j=0; j<nz; j++) {
1332           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1333         }
1334 
1335         /* U part */
1336         pv = b->a + bdiag[i+1]+1;
1337         pj = b->j + bdiag[i+1]+1;
1338         nz = bdiag[i] - bdiag[i+1]-1;
1339         for (j=0; j<nz; j++) {
1340           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1341         }
1342 
1343         /* Check zero pivot */
1344         sctx.rs = rs;
1345         sctx.pv = rtmp1[i];
1346         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1347 	if(sctx.newshift) break;
1348 
1349         /* Mark diagonal and invert diagonal for simplier triangular solves */
1350         pv  = b->a + bdiag[i];
1351         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1352         break;
1353 
1354       case 2:
1355       /*----------*/
1356         /* zero rtmp1 and rtmp2 */
1357         /* L part */
1358         nz    = bi[i+1] - bi[i];
1359         bjtmp = bj + bi[i];
1360         for  (j=0; j<nz; j++) {
1361           col = bjtmp[j];
1362           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363         }
1364 
1365         /* U part */
1366         nz = bdiag[i]-bdiag[i+1];
1367         bjtmp = bj + bdiag[i+1]+1;
1368         for  (j=0; j<nz; j++) {
1369           col = bjtmp[j];
1370           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1371         }
1372 
1373         /* load in initial (unfactored row) */
1374         nz    = ai[r[i]+1] - ai[r[i]];
1375         ajtmp = aj + ai[r[i]];
1376         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1377         for (j=0; j<nz; j++) {
1378           col = ics[ajtmp[j]];
1379           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1380         }
1381         /* ZeropivotApply(): shift the diagonal of the matrix  */
1382         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1383 
1384         /* elimination */
1385         bjtmp = bj + bi[i];
1386         row   = *bjtmp++; /* pivot row */
1387         nzL   = bi[i+1] - bi[i];
1388         for(k=0; k < nzL;k++) {
1389           pc1 = rtmp1 + row;
1390           pc2 = rtmp2 + row;
1391           if (*pc1 != 0.0 || *pc2 != 0.0) {
1392             pv   = b->a + bdiag[row];
1393             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1394             *pc1 = mul1;       *pc2 = mul2;
1395 
1396             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1397             pv = b->a + bdiag[row+1]+1;
1398             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1399             for (j=0; j<nz; j++){
1400               col = pj[j];
1401               rtmp1[col] -= mul1 * pv[j];
1402               rtmp2[col] -= mul2 * pv[j];
1403             }
1404             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1405           }
1406           row = *bjtmp++;
1407         }
1408 
1409         /* finished row i; check zero pivot, then stick row i into b->a */
1410         rs  = 0.0;
1411         /* L part */
1412         pc1 = b->a + bi[i];
1413         pj  = b->j + bi[i] ;
1414         nz  = bi[i+1] - bi[i];
1415         for (j=0; j<nz; j++) {
1416           col = pj[j];
1417           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418         }
1419         /* U part */
1420         pc1 = b->a + bdiag[i+1]+1;
1421         pj  = b->j + bdiag[i+1]+1;
1422         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1423         for (j=0; j<nz; j++) {
1424           col = pj[j];
1425           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1426         }
1427 
1428         sctx.rs  = rs;
1429         sctx.pv  = rtmp1[i];
1430         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1431 	if(sctx.newshift) break;
1432         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1433         *pc1 = 1.0/sctx.pv;
1434 
1435         /* Now take care of diagonal 2x2 block. */
1436         pc2 = rtmp2 + i;
1437         if (*pc2 != 0.0){
1438           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1439           *pc2 = mul1;          /* insert L entry */
1440           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1441           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1442           for (j=0; j<nz; j++) {
1443             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1444           }
1445           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1446         }
1447 
1448         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1449         rs = 0.0;
1450         /* L part */
1451         pc2 = b->a + bi[i+1];
1452         pj  = b->j + bi[i+1] ;
1453         nz  = bi[i+2] - bi[i+1];
1454         for (j=0; j<nz; j++) {
1455           col = pj[j];
1456           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457         }
1458         /* U part */
1459         pc2 = b->a + bdiag[i+2]+1;
1460         pj  = b->j + bdiag[i+2]+1;
1461         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1462         for (j=0; j<nz; j++) {
1463           col = pj[j];
1464           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1465         }
1466 
1467         sctx.rs  = rs;
1468         sctx.pv  = rtmp2[i+1];
1469         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1470 	if(sctx.newshift) break;
1471         pc2  = b->a + bdiag[i+1];
1472         *pc2 = 1.0/sctx.pv;
1473         break;
1474 
1475       case 3:
1476       /*----------*/
1477         /* zero rtmp */
1478         /* L part */
1479         nz    = bi[i+1] - bi[i];
1480         bjtmp = bj + bi[i];
1481         for  (j=0; j<nz; j++) {
1482           col = bjtmp[j];
1483           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484         }
1485 
1486         /* U part */
1487         nz = bdiag[i]-bdiag[i+1];
1488         bjtmp = bj + bdiag[i+1]+1;
1489         for  (j=0; j<nz; j++) {
1490           col = bjtmp[j];
1491           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1492         }
1493 
1494         /* load in initial (unfactored row) */
1495         nz    = ai[r[i]+1] - ai[r[i]];
1496         ajtmp = aj + ai[r[i]];
1497         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1498         for (j=0; j<nz; j++) {
1499           col = ics[ajtmp[j]];
1500           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1501         }
1502         /* ZeropivotApply(): shift the diagonal of the matrix  */
1503         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1504 
1505         /* elimination */
1506         bjtmp = bj + bi[i];
1507         row   = *bjtmp++; /* pivot row */
1508         nzL   = bi[i+1] - bi[i];
1509         for(k=0; k < nzL;k++) {
1510           pc1 = rtmp1 + row;
1511           pc2 = rtmp2 + row;
1512           pc3 = rtmp3 + row;
1513           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1514             pv  = b->a + bdiag[row];
1515             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1516             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1517 
1518             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1519             pv = b->a + bdiag[row+1]+1;
1520             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1521             for (j=0; j<nz; j++){
1522               col = pj[j];
1523               rtmp1[col] -= mul1 * pv[j];
1524               rtmp2[col] -= mul2 * pv[j];
1525               rtmp3[col] -= mul3 * pv[j];
1526             }
1527             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1528           }
1529           row = *bjtmp++;
1530         }
1531 
1532         /* finished row i; check zero pivot, then stick row i into b->a */
1533         rs  = 0.0;
1534         /* L part */
1535         pc1 = b->a + bi[i];
1536         pj  = b->j + bi[i] ;
1537         nz  = bi[i+1] - bi[i];
1538         for (j=0; j<nz; j++) {
1539           col = pj[j];
1540           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541         }
1542         /* U part */
1543         pc1 = b->a + bdiag[i+1]+1;
1544         pj  = b->j + bdiag[i+1]+1;
1545         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1546         for (j=0; j<nz; j++) {
1547           col = pj[j];
1548           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1549         }
1550 
1551         sctx.rs  = rs;
1552         sctx.pv  = rtmp1[i];
1553         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1554 	if(sctx.newshift) break;
1555         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1556         *pc1 = 1.0/sctx.pv;
1557 
1558         /* Now take care of 1st column of diagonal 3x3 block. */
1559         pc2 = rtmp2 + i;
1560         pc3 = rtmp3 + i;
1561         if (*pc2 != 0.0 || *pc3 != 0.0){
1562           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1563           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1564           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1565           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1566           for (j=0; j<nz; j++) {
1567             col = pj[j];
1568             rtmp2[col] -= mul2 * rtmp1[col];
1569             rtmp3[col] -= mul3 * rtmp1[col];
1570           }
1571           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1572         }
1573 
1574         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1575         rs = 0.0;
1576         /* L part */
1577         pc2 = b->a + bi[i+1];
1578         pj  = b->j + bi[i+1] ;
1579         nz  = bi[i+2] - bi[i+1];
1580         for (j=0; j<nz; j++) {
1581           col = pj[j];
1582           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1583         }
1584         /* U part */
1585         pc2 = b->a + bdiag[i+2]+1;
1586         pj  = b->j + bdiag[i+2]+1;
1587         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1588         for (j=0; j<nz; j++) {
1589           col = pj[j];
1590           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1591         }
1592 
1593         sctx.rs  = rs;
1594         sctx.pv  = rtmp2[i+1];
1595         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1596 	if(sctx.newshift) break;
1597         pc2  = b->a + bdiag[i+1];
1598         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1599 
1600         /* Now take care of 2nd column of diagonal 3x3 block. */
1601         pc3 = rtmp3 + i+1;
1602         if (*pc3 != 0.0){
1603           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1604           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1605           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1606           for (j=0; j<nz; j++) {
1607             col = pj[j];
1608             rtmp3[col] -= mul3 * rtmp2[col];
1609           }
1610           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1611         }
1612 
1613         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1614         rs = 0.0;
1615         /* L part */
1616         pc3 = b->a + bi[i+2];
1617         pj  = b->j + bi[i+2] ;
1618         nz  = bi[i+3] - bi[i+2];
1619         for (j=0; j<nz; j++) {
1620           col = pj[j];
1621           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1622         }
1623         /* U part */
1624         pc3 = b->a + bdiag[i+3]+1;
1625         pj  = b->j + bdiag[i+3]+1;
1626         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1627         for (j=0; j<nz; j++) {
1628           col = pj[j];
1629           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1630         }
1631 
1632         sctx.rs  = rs;
1633         sctx.pv  = rtmp3[i+2];
1634         ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1635 	if(sctx.newshift) break;
1636         pc3  = b->a + bdiag[i+2];
1637         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1638         break;
1639       case 4:
1640       /*----------*/
1641         /* zero rtmp */
1642         /* L part */
1643         nz    = bi[i+1] - bi[i];
1644         bjtmp = bj + bi[i];
1645         for  (j=0; j<nz; j++) {
1646           col = bjtmp[j];
1647           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1648         }
1649 
1650         /* U part */
1651         nz = bdiag[i]-bdiag[i+1];
1652         bjtmp = bj + bdiag[i+1]+1;
1653         for  (j=0; j<nz; j++) {
1654           col = bjtmp[j];
1655           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1656         }
1657 
1658         /* load in initial (unfactored row) */
1659         nz    = ai[r[i]+1] - ai[r[i]];
1660         ajtmp = aj + ai[r[i]];
1661         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1662         for (j=0; j<nz; j++) {
1663           col = ics[ajtmp[j]];
1664           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1665         }
1666         /* ZeropivotApply(): shift the diagonal of the matrix  */
1667         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1668 
1669         /* elimination */
1670         bjtmp = bj + bi[i];
1671         row   = *bjtmp++; /* pivot row */
1672         nzL   = bi[i+1] - bi[i];
1673         for(k=0; k < nzL;k++) {
1674           pc1 = rtmp1 + row;
1675           pc2 = rtmp2 + row;
1676           pc3 = rtmp3 + row;
1677 	  pc4 = rtmp4 + row;
1678           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1679             pv  = b->a + bdiag[row];
1680             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1681             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1682 
1683             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1684             pv = b->a + bdiag[row+1]+1;
1685             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1686             for (j=0; j<nz; j++){
1687               col = pj[j];
1688               rtmp1[col] -= mul1 * pv[j];
1689               rtmp2[col] -= mul2 * pv[j];
1690               rtmp3[col] -= mul3 * pv[j];
1691 	      rtmp4[col] -= mul4 * pv[j];
1692             }
1693             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1694           }
1695           row = *bjtmp++;
1696         }
1697 
1698         /* finished row i; check zero pivot, then stick row i into b->a */
1699         rs  = 0.0;
1700         /* L part */
1701         pc1 = b->a + bi[i];
1702         pj  = b->j + bi[i] ;
1703         nz  = bi[i+1] - bi[i];
1704         for (j=0; j<nz; j++) {
1705           col = pj[j];
1706           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1707         }
1708         /* U part */
1709         pc1 = b->a + bdiag[i+1]+1;
1710         pj  = b->j + bdiag[i+1]+1;
1711         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1712         for (j=0; j<nz; j++) {
1713           col = pj[j];
1714           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1715         }
1716 
1717         sctx.rs  = rs;
1718         sctx.pv  = rtmp1[i];
1719         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1720 	if(sctx.newshift) break;
1721         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1722         *pc1 = 1.0/sctx.pv;
1723 
1724         /* Now take care of 1st column of diagonal 4x4 block. */
1725         pc2 = rtmp2 + i;
1726         pc3 = rtmp3 + i;
1727 	pc4 = rtmp4 + i;
1728         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1729           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1730           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1731 	  mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1732           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1733           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1734           for (j=0; j<nz; j++) {
1735             col = pj[j];
1736             rtmp2[col] -= mul2 * rtmp1[col];
1737             rtmp3[col] -= mul3 * rtmp1[col];
1738 	    rtmp4[col] -= mul4 * rtmp1[col];
1739           }
1740           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1741         }
1742 
1743         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1744         rs = 0.0;
1745         /* L part */
1746         pc2 = b->a + bi[i+1];
1747         pj  = b->j + bi[i+1] ;
1748         nz  = bi[i+2] - bi[i+1];
1749         for (j=0; j<nz; j++) {
1750           col = pj[j];
1751           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1752         }
1753         /* U part */
1754         pc2 = b->a + bdiag[i+2]+1;
1755         pj  = b->j + bdiag[i+2]+1;
1756         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1757         for (j=0; j<nz; j++) {
1758           col = pj[j];
1759           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1760         }
1761 
1762         sctx.rs  = rs;
1763         sctx.pv  = rtmp2[i+1];
1764         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1765 	if(sctx.newshift) break;
1766         pc2  = b->a + bdiag[i+1];
1767         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1768 
1769         /* Now take care of 2nd column of diagonal 4x4 block. */
1770         pc3 = rtmp3 + i+1;
1771 	pc4 = rtmp4 + i+1;
1772         if (*pc3 != 0.0 || *pc4 != 0.0){
1773           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1774 	  mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1775           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1776           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1777           for (j=0; j<nz; j++) {
1778             col = pj[j];
1779             rtmp3[col] -= mul3 * rtmp2[col];
1780 	    rtmp4[col] -= mul4 * rtmp2[col];
1781           }
1782           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1783         }
1784 
1785         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1786         rs = 0.0;
1787         /* L part */
1788         pc3 = b->a + bi[i+2];
1789         pj  = b->j + bi[i+2] ;
1790         nz  = bi[i+3] - bi[i+2];
1791         for (j=0; j<nz; j++) {
1792           col = pj[j];
1793           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1794         }
1795         /* U part */
1796         pc3 = b->a + bdiag[i+3]+1;
1797         pj  = b->j + bdiag[i+3]+1;
1798         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1799         for (j=0; j<nz; j++) {
1800           col = pj[j];
1801           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1802         }
1803 
1804         sctx.rs  = rs;
1805         sctx.pv  = rtmp3[i+2];
1806         ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1807 	if(sctx.newshift) break;
1808         pc3  = b->a + bdiag[i+2];
1809         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1810 
1811 	/* Now take care of 3rd column of diagonal 4x4 block. */
1812 	pc4 = rtmp4 + i+2;
1813         if (*pc4 != 0.0){
1814 	  mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1815           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1816           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1817           for (j=0; j<nz; j++) {
1818             col = pj[j];
1819 	    rtmp4[col] -= mul4 * rtmp3[col];
1820           }
1821           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1822         }
1823 
1824         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1825         rs = 0.0;
1826         /* L part */
1827         pc4 = b->a + bi[i+3];
1828         pj  = b->j + bi[i+3] ;
1829         nz  = bi[i+4] - bi[i+3];
1830         for (j=0; j<nz; j++) {
1831           col = pj[j];
1832           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1833         }
1834         /* U part */
1835         pc4 = b->a + bdiag[i+4]+1;
1836         pj  = b->j + bdiag[i+4]+1;
1837         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1838         for (j=0; j<nz; j++) {
1839           col = pj[j];
1840           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1841         }
1842 
1843         sctx.rs  = rs;
1844         sctx.pv  = rtmp4[i+3];
1845         ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1846 	if(sctx.newshift) break;
1847         pc4  = b->a + bdiag[i+3];
1848         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1849          break;
1850 
1851       default:
1852         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1853       }
1854       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1855       i += nodesz;                 /* Update the row */
1856     }
1857 
1858     /* MatPivotRefine() */
1859     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1860       /*
1861        * if no shift in this attempt & shifting & started shifting & can refine,
1862        * then try lower shift
1863        */
1864       sctx.shift_hi       = sctx.shift_fraction;
1865       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1866       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1867       sctx.newshift       = PETSC_TRUE;
1868       sctx.nshift++;
1869     }
1870   } while (sctx.newshift);
1871 
1872   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1873   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1874   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1875   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1876 
1877   C->ops->solve              = MatSolve_SeqAIJ;
1878   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1879   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1880   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1881   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1882   C->assembled    = PETSC_TRUE;
1883   C->preallocated = PETSC_TRUE;
1884   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1885 
1886   /* MatShiftView(A,info,&sctx) */
1887   if (sctx.nshift){
1888     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1889       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1890     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1891       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1892     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1893       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1894     }
1895   }
1896   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1897   PetscFunctionReturn(0);
1898 }
1899 
1900 #undef __FUNCT__
1901 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1902 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1903 {
1904   Mat               C = B;
1905   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1906   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1907   PetscErrorCode    ierr;
1908   const PetscInt    *r,*ic,*c,*ics;
1909   PetscInt          n = A->rmap->n,*bi = b->i;
1910   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1911   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1912   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1913   PetscScalar       mul1,mul2,mul3,tmp;
1914   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1915   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1916   PetscReal         rs=0.0;
1917   FactorShiftCtx    sctx;
1918 
1919   PetscFunctionBegin;
1920   sctx.shift_top      = 0;
1921   sctx.nshift_max     = 0;
1922   sctx.shift_lo       = 0;
1923   sctx.shift_hi       = 0;
1924   sctx.shift_fraction = 0;
1925 
1926   /* if both shift schemes are chosen by user, only use info->shiftpd */
1927   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1928     sctx.shift_top = 0;
1929     for (i=0; i<n; i++) {
1930       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1931       rs    = 0.0;
1932       ajtmp = aj + ai[i];
1933       rtmp1 = aa + ai[i];
1934       nz = ai[i+1] - ai[i];
1935       for (j=0; j<nz; j++){
1936         if (*ajtmp != i){
1937           rs += PetscAbsScalar(*rtmp1++);
1938         } else {
1939           rs -= PetscRealPart(*rtmp1++);
1940         }
1941         ajtmp++;
1942       }
1943       if (rs>sctx.shift_top) sctx.shift_top = rs;
1944     }
1945     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1946     sctx.shift_top *= 1.1;
1947     sctx.nshift_max = 5;
1948     sctx.shift_lo   = 0.;
1949     sctx.shift_hi   = 1.;
1950   }
1951   sctx.shift_amount = 0;
1952   sctx.nshift       = 0;
1953 
1954   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1955   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1956   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1957   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1958   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1959   ics   = ic ;
1960   rtmp22 = rtmp11 + n;
1961   rtmp33 = rtmp22 + n;
1962 
1963   node_max = a->inode.node_count;
1964   ns       = a->inode.size;
1965   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1966 
1967   /* If max inode size > 3, split it into two inodes.*/
1968   /* also map the inode sizes according to the ordering */
1969   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1970   for (i=0,j=0; i<node_max; ++i,++j){
1971     if (ns[i]>3) {
1972       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1973       ++j;
1974       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1975     } else {
1976       tmp_vec1[j] = ns[i];
1977     }
1978   }
1979   /* Use the correct node_max */
1980   node_max = j;
1981 
1982   /* Now reorder the inode info based on mat re-ordering info */
1983   /* First create a row -> inode_size_array_index map */
1984   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1985   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1986   for (i=0,row=0; i<node_max; i++) {
1987     nodesz = tmp_vec1[i];
1988     for (j=0; j<nodesz; j++,row++) {
1989       nsmap[row] = i;
1990     }
1991   }
1992   /* Using nsmap, create a reordered ns structure */
1993   for (i=0,j=0; i< node_max; i++) {
1994     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1995     tmp_vec2[i]  = nodesz;
1996     j           += nodesz;
1997   }
1998   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1999   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2000   /* Now use the correct ns */
2001   ns = tmp_vec2;
2002 
2003   do {
2004     sctx.newshift = PETSC_FALSE;
2005     /* Now loop over each block-row, and do the factorization */
2006     for (i=0,row=0; i<node_max; i++) {
2007       nodesz = ns[i];
2008       nz     = bi[row+1] - bi[row];
2009       bjtmp  = bj + bi[row];
2010 
2011       switch (nodesz){
2012       case 1:
2013         for  (j=0; j<nz; j++){
2014           idx        = bjtmp[j];
2015           rtmp11[idx] = 0.0;
2016         }
2017 
2018         /* load in initial (unfactored row) */
2019         idx    = r[row];
2020         nz_tmp = ai[idx+1] - ai[idx];
2021         ajtmp  = aj + ai[idx];
2022         v1     = aa + ai[idx];
2023 
2024         for (j=0; j<nz_tmp; j++) {
2025           idx        = ics[ajtmp[j]];
2026           rtmp11[idx] = v1[j];
2027         }
2028         rtmp11[ics[r[row]]] += sctx.shift_amount;
2029 
2030         prow = *bjtmp++ ;
2031         while (prow < row) {
2032           pc1 = rtmp11 + prow;
2033           if (*pc1 != 0.0){
2034             pv   = ba + bd[prow];
2035             pj   = nbj + bd[prow];
2036             mul1 = *pc1 * *pv++;
2037             *pc1 = mul1;
2038             nz_tmp = bi[prow+1] - bd[prow] - 1;
2039             ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2040             for (j=0; j<nz_tmp; j++) {
2041               tmp = pv[j];
2042               idx = pj[j];
2043               rtmp11[idx] -= mul1 * tmp;
2044             }
2045           }
2046           prow = *bjtmp++ ;
2047         }
2048         pj  = bj + bi[row];
2049         pc1 = ba + bi[row];
2050 
2051         sctx.pv    = rtmp11[row];
2052         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2053         rs         = 0.0;
2054         for (j=0; j<nz; j++) {
2055           idx    = pj[j];
2056           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2057           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2058         }
2059         sctx.rs  = rs;
2060         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2061         if (sctx.newshift) goto endofwhile;
2062         break;
2063 
2064       case 2:
2065         for (j=0; j<nz; j++) {
2066           idx        = bjtmp[j];
2067           rtmp11[idx] = 0.0;
2068           rtmp22[idx] = 0.0;
2069         }
2070 
2071         /* load in initial (unfactored row) */
2072         idx    = r[row];
2073         nz_tmp = ai[idx+1] - ai[idx];
2074         ajtmp  = aj + ai[idx];
2075         v1     = aa + ai[idx];
2076         v2     = aa + ai[idx+1];
2077         for (j=0; j<nz_tmp; j++) {
2078           idx        = ics[ajtmp[j]];
2079           rtmp11[idx] = v1[j];
2080           rtmp22[idx] = v2[j];
2081         }
2082         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2083         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2084 
2085         prow = *bjtmp++ ;
2086         while (prow < row) {
2087           pc1 = rtmp11 + prow;
2088           pc2 = rtmp22 + prow;
2089           if (*pc1 != 0.0 || *pc2 != 0.0){
2090             pv   = ba + bd[prow];
2091             pj   = nbj + bd[prow];
2092             mul1 = *pc1 * *pv;
2093             mul2 = *pc2 * *pv;
2094             ++pv;
2095             *pc1 = mul1;
2096             *pc2 = mul2;
2097 
2098             nz_tmp = bi[prow+1] - bd[prow] - 1;
2099             for (j=0; j<nz_tmp; j++) {
2100               tmp = pv[j];
2101               idx = pj[j];
2102               rtmp11[idx] -= mul1 * tmp;
2103               rtmp22[idx] -= mul2 * tmp;
2104             }
2105             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2106           }
2107           prow = *bjtmp++ ;
2108         }
2109 
2110         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2111         pc1 = rtmp11 + prow;
2112         pc2 = rtmp22 + prow;
2113 
2114         sctx.pv = *pc1;
2115         pj      = bj + bi[prow];
2116         rs      = 0.0;
2117         for (j=0; j<nz; j++){
2118           idx = pj[j];
2119           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2120         }
2121         sctx.rs = rs;
2122         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2123         if (sctx.newshift) goto endofwhile;
2124 
2125         if (*pc2 != 0.0){
2126           pj     = nbj + bd[prow];
2127           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2128           *pc2   = mul2;
2129           nz_tmp = bi[prow+1] - bd[prow] - 1;
2130           for (j=0; j<nz_tmp; j++) {
2131             idx = pj[j] ;
2132             tmp = rtmp11[idx];
2133             rtmp22[idx] -= mul2 * tmp;
2134           }
2135           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2136         }
2137 
2138         pj  = bj + bi[row];
2139         pc1 = ba + bi[row];
2140         pc2 = ba + bi[row+1];
2141 
2142         sctx.pv = rtmp22[row+1];
2143         rs = 0.0;
2144         rtmp11[row]   = 1.0/rtmp11[row];
2145         rtmp22[row+1] = 1.0/rtmp22[row+1];
2146         /* copy row entries from dense representation to sparse */
2147         for (j=0; j<nz; j++) {
2148           idx    = pj[j];
2149           pc1[j] = rtmp11[idx];
2150           pc2[j] = rtmp22[idx];
2151           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2152         }
2153         sctx.rs = rs;
2154         ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2155         if (sctx.newshift) goto endofwhile;
2156         break;
2157 
2158       case 3:
2159         for  (j=0; j<nz; j++) {
2160           idx        = bjtmp[j];
2161           rtmp11[idx] = 0.0;
2162           rtmp22[idx] = 0.0;
2163           rtmp33[idx] = 0.0;
2164         }
2165         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2166         idx    = r[row];
2167         nz_tmp = ai[idx+1] - ai[idx];
2168         ajtmp = aj + ai[idx];
2169         v1    = aa + ai[idx];
2170         v2    = aa + ai[idx+1];
2171         v3    = aa + ai[idx+2];
2172         for (j=0; j<nz_tmp; j++) {
2173           idx        = ics[ajtmp[j]];
2174           rtmp11[idx] = v1[j];
2175           rtmp22[idx] = v2[j];
2176           rtmp33[idx] = v3[j];
2177         }
2178         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2179         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2180         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2181 
2182         /* loop over all pivot row blocks above this row block */
2183         prow = *bjtmp++ ;
2184         while (prow < row) {
2185           pc1 = rtmp11 + prow;
2186           pc2 = rtmp22 + prow;
2187           pc3 = rtmp33 + prow;
2188           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2189             pv   = ba  + bd[prow];
2190             pj   = nbj + bd[prow];
2191             mul1 = *pc1 * *pv;
2192             mul2 = *pc2 * *pv;
2193             mul3 = *pc3 * *pv;
2194             ++pv;
2195             *pc1 = mul1;
2196             *pc2 = mul2;
2197             *pc3 = mul3;
2198 
2199             nz_tmp = bi[prow+1] - bd[prow] - 1;
2200             /* update this row based on pivot row */
2201             for (j=0; j<nz_tmp; j++) {
2202               tmp = pv[j];
2203               idx = pj[j];
2204               rtmp11[idx] -= mul1 * tmp;
2205               rtmp22[idx] -= mul2 * tmp;
2206               rtmp33[idx] -= mul3 * tmp;
2207             }
2208             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2209           }
2210           prow = *bjtmp++ ;
2211         }
2212 
2213         /* Now take care of diagonal 3x3 block in this set of rows */
2214         /* note: prow = row here */
2215         pc1 = rtmp11 + prow;
2216         pc2 = rtmp22 + prow;
2217         pc3 = rtmp33 + prow;
2218 
2219         sctx.pv = *pc1;
2220         pj      = bj + bi[prow];
2221         rs      = 0.0;
2222         for (j=0; j<nz; j++){
2223           idx = pj[j];
2224           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2225         }
2226         sctx.rs = rs;
2227         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2228         if (sctx.newshift) goto endofwhile;
2229 
2230         if (*pc2 != 0.0 || *pc3 != 0.0){
2231           mul2 = (*pc2)/(*pc1);
2232           mul3 = (*pc3)/(*pc1);
2233           *pc2 = mul2;
2234           *pc3 = mul3;
2235           nz_tmp = bi[prow+1] - bd[prow] - 1;
2236           pj     = nbj + bd[prow];
2237           for (j=0; j<nz_tmp; j++) {
2238             idx = pj[j] ;
2239             tmp = rtmp11[idx];
2240             rtmp22[idx] -= mul2 * tmp;
2241             rtmp33[idx] -= mul3 * tmp;
2242           }
2243           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2244         }
2245         ++prow;
2246 
2247         pc2 = rtmp22 + prow;
2248         pc3 = rtmp33 + prow;
2249         sctx.pv = *pc2;
2250         pj      = bj + bi[prow];
2251         rs      = 0.0;
2252         for (j=0; j<nz; j++){
2253           idx = pj[j];
2254           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2255         }
2256         sctx.rs = rs;
2257         ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2258         if (sctx.newshift) goto endofwhile;
2259 
2260         if (*pc3 != 0.0){
2261           mul3   = (*pc3)/(*pc2);
2262           *pc3   = mul3;
2263           pj     = nbj + bd[prow];
2264           nz_tmp = bi[prow+1] - bd[prow] - 1;
2265           for (j=0; j<nz_tmp; j++) {
2266             idx = pj[j] ;
2267             tmp = rtmp22[idx];
2268             rtmp33[idx] -= mul3 * tmp;
2269           }
2270           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2271         }
2272 
2273         pj  = bj + bi[row];
2274         pc1 = ba + bi[row];
2275         pc2 = ba + bi[row+1];
2276         pc3 = ba + bi[row+2];
2277 
2278         sctx.pv = rtmp33[row+2];
2279         rs = 0.0;
2280         rtmp11[row]   = 1.0/rtmp11[row];
2281         rtmp22[row+1] = 1.0/rtmp22[row+1];
2282         rtmp33[row+2] = 1.0/rtmp33[row+2];
2283         /* copy row entries from dense representation to sparse */
2284         for (j=0; j<nz; j++) {
2285           idx    = pj[j];
2286           pc1[j] = rtmp11[idx];
2287           pc2[j] = rtmp22[idx];
2288           pc3[j] = rtmp33[idx];
2289           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2290         }
2291 
2292         sctx.rs = rs;
2293         ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2294         if (sctx.newshift) goto endofwhile;
2295         break;
2296 
2297       default:
2298         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2299       }
2300       row += nodesz;                 /* Update the row */
2301     }
2302     endofwhile:;
2303   } while (sctx.newshift);
2304   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2305   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2306   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2307   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2308   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2309   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2310   /* do not set solve add, since MatSolve_Inode + Add is faster */
2311   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2312   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2313   C->assembled   = PETSC_TRUE;
2314   C->preallocated = PETSC_TRUE;
2315   if (sctx.nshift) {
2316     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2317       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2318     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2319       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2320     }
2321   }
2322   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2323   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2324   PetscFunctionReturn(0);
2325 }
2326 
2327 
2328 /* ----------------------------------------------------------- */
2329 #undef __FUNCT__
2330 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2331 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2332 {
2333   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2334   IS                iscol = a->col,isrow = a->row;
2335   PetscErrorCode    ierr;
2336   const PetscInt    *r,*c,*rout,*cout;
2337   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2338   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2339   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2340   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2341   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2342   const PetscScalar *b;
2343 
2344   PetscFunctionBegin;
2345   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2346   node_max = a->inode.node_count;
2347   ns       = a->inode.size;     /* Node Size array */
2348 
2349   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2350   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2351   tmp  = a->solve_work;
2352 
2353   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2354   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2355 
2356   /* forward solve the lower triangular */
2357   tmps = tmp ;
2358   aa   = a_a ;
2359   aj   = a_j ;
2360   ad   = a->diag;
2361 
2362   for (i = 0,row = 0; i< node_max; ++i){
2363     nsz = ns[i];
2364     aii = ai[row];
2365     v1  = aa + aii;
2366     vi  = aj + aii;
2367     nz  = ai[row+1]- ai[row];
2368 
2369     if (i < node_max-1) {
2370       /* Prefetch the indices for the next block */
2371       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2372       /* Prefetch the data for the next block */
2373       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2374     }
2375 
2376     switch (nsz){               /* Each loop in 'case' is unrolled */
2377     case 1 :
2378       sum1 = b[r[row]];
2379       for(j=0; j<nz-1; j+=2){
2380         i0   = vi[j];
2381         i1   = vi[j+1];
2382         tmp0 = tmps[i0];
2383         tmp1 = tmps[i1];
2384         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2385       }
2386       if(j == nz-1){
2387         tmp0 = tmps[vi[j]];
2388         sum1 -= v1[j]*tmp0;
2389       }
2390       tmp[row++]=sum1;
2391       break;
2392     case 2:
2393       sum1 = b[r[row]];
2394       sum2 = b[r[row+1]];
2395       v2   = aa + ai[row+1];
2396 
2397       for(j=0; j<nz-1; j+=2){
2398         i0   = vi[j];
2399         i1   = vi[j+1];
2400         tmp0 = tmps[i0];
2401         tmp1 = tmps[i1];
2402         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2403         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2404       }
2405       if(j == nz-1){
2406         tmp0 = tmps[vi[j]];
2407         sum1 -= v1[j] *tmp0;
2408         sum2 -= v2[j] *tmp0;
2409       }
2410       sum2 -= v2[nz] * sum1;
2411       tmp[row ++]=sum1;
2412       tmp[row ++]=sum2;
2413       break;
2414     case 3:
2415       sum1 = b[r[row]];
2416       sum2 = b[r[row+1]];
2417       sum3 = b[r[row+2]];
2418       v2   = aa + ai[row+1];
2419       v3   = aa + ai[row+2];
2420 
2421       for (j=0; j<nz-1; j+=2){
2422         i0   = vi[j];
2423         i1   = vi[j+1];
2424         tmp0 = tmps[i0];
2425         tmp1 = tmps[i1];
2426         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2427         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2428         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2429       }
2430       if (j == nz-1){
2431         tmp0 = tmps[vi[j]];
2432         sum1 -= v1[j] *tmp0;
2433         sum2 -= v2[j] *tmp0;
2434         sum3 -= v3[j] *tmp0;
2435       }
2436       sum2 -= v2[nz] * sum1;
2437       sum3 -= v3[nz] * sum1;
2438       sum3 -= v3[nz+1] * sum2;
2439       tmp[row ++]=sum1;
2440       tmp[row ++]=sum2;
2441       tmp[row ++]=sum3;
2442       break;
2443 
2444     case 4:
2445       sum1 = b[r[row]];
2446       sum2 = b[r[row+1]];
2447       sum3 = b[r[row+2]];
2448       sum4 = b[r[row+3]];
2449       v2   = aa + ai[row+1];
2450       v3   = aa + ai[row+2];
2451       v4   = aa + ai[row+3];
2452 
2453       for (j=0; j<nz-1; j+=2){
2454         i0   = vi[j];
2455         i1   = vi[j+1];
2456         tmp0 = tmps[i0];
2457         tmp1 = tmps[i1];
2458         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2459         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2460         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2461         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2462       }
2463       if (j == nz-1){
2464         tmp0 = tmps[vi[j]];
2465         sum1 -= v1[j] *tmp0;
2466         sum2 -= v2[j] *tmp0;
2467         sum3 -= v3[j] *tmp0;
2468         sum4 -= v4[j] *tmp0;
2469       }
2470       sum2 -= v2[nz] * sum1;
2471       sum3 -= v3[nz] * sum1;
2472       sum4 -= v4[nz] * sum1;
2473       sum3 -= v3[nz+1] * sum2;
2474       sum4 -= v4[nz+1] * sum2;
2475       sum4 -= v4[nz+2] * sum3;
2476 
2477       tmp[row ++]=sum1;
2478       tmp[row ++]=sum2;
2479       tmp[row ++]=sum3;
2480       tmp[row ++]=sum4;
2481       break;
2482     case 5:
2483       sum1 = b[r[row]];
2484       sum2 = b[r[row+1]];
2485       sum3 = b[r[row+2]];
2486       sum4 = b[r[row+3]];
2487       sum5 = b[r[row+4]];
2488       v2   = aa + ai[row+1];
2489       v3   = aa + ai[row+2];
2490       v4   = aa + ai[row+3];
2491       v5   = aa + ai[row+4];
2492 
2493       for (j=0; j<nz-1; j+=2){
2494         i0   = vi[j];
2495         i1   = vi[j+1];
2496         tmp0 = tmps[i0];
2497         tmp1 = tmps[i1];
2498         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2499         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2500         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2501         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2502         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2503       }
2504       if (j == nz-1){
2505         tmp0 = tmps[vi[j]];
2506         sum1 -= v1[j] *tmp0;
2507         sum2 -= v2[j] *tmp0;
2508         sum3 -= v3[j] *tmp0;
2509         sum4 -= v4[j] *tmp0;
2510         sum5 -= v5[j] *tmp0;
2511       }
2512 
2513       sum2 -= v2[nz] * sum1;
2514       sum3 -= v3[nz] * sum1;
2515       sum4 -= v4[nz] * sum1;
2516       sum5 -= v5[nz] * sum1;
2517       sum3 -= v3[nz+1] * sum2;
2518       sum4 -= v4[nz+1] * sum2;
2519       sum5 -= v5[nz+1] * sum2;
2520       sum4 -= v4[nz+2] * sum3;
2521       sum5 -= v5[nz+2] * sum3;
2522       sum5 -= v5[nz+3] * sum4;
2523 
2524       tmp[row ++]=sum1;
2525       tmp[row ++]=sum2;
2526       tmp[row ++]=sum3;
2527       tmp[row ++]=sum4;
2528       tmp[row ++]=sum5;
2529       break;
2530     default:
2531       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2532     }
2533   }
2534   /* backward solve the upper triangular */
2535   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2536     nsz = ns[i];
2537     aii = ad[row+1] + 1;
2538     v1  = aa + aii;
2539     vi  = aj + aii;
2540     nz  = ad[row]- ad[row+1] - 1;
2541 
2542     if (i > 0) {
2543       /* Prefetch the indices for the next block */
2544       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2545       /* Prefetch the data for the next block */
2546       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2547     }
2548 
2549     switch (nsz){               /* Each loop in 'case' is unrolled */
2550     case 1 :
2551       sum1 = tmp[row];
2552 
2553       for(j=0 ; j<nz-1; j+=2){
2554         i0   = vi[j];
2555         i1   = vi[j+1];
2556         tmp0 = tmps[i0];
2557         tmp1 = tmps[i1];
2558         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2559       }
2560       if (j == nz-1){
2561         tmp0  = tmps[vi[j]];
2562         sum1 -= v1[j]*tmp0;
2563       }
2564       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2565       break;
2566     case 2 :
2567       sum1 = tmp[row];
2568       sum2 = tmp[row-1];
2569       v2   = aa + ad[row] + 1;
2570       for (j=0 ; j<nz-1; j+=2){
2571         i0   = vi[j];
2572         i1   = vi[j+1];
2573         tmp0 = tmps[i0];
2574         tmp1 = tmps[i1];
2575         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2576         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2577       }
2578       if (j == nz-1){
2579         tmp0  = tmps[vi[j]];
2580         sum1 -= v1[j]* tmp0;
2581         sum2 -= v2[j+1]* tmp0;
2582       }
2583 
2584       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2585       sum2   -= v2[0] * tmp0;
2586       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2587       break;
2588     case 3 :
2589       sum1 = tmp[row];
2590       sum2 = tmp[row -1];
2591       sum3 = tmp[row -2];
2592       v2   = aa + ad[row] + 1;
2593       v3   = aa + ad[row -1] + 1;
2594       for (j=0 ; j<nz-1; j+=2){
2595         i0   = vi[j];
2596         i1   = vi[j+1];
2597         tmp0 = tmps[i0];
2598         tmp1 = tmps[i1];
2599         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2600         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2601         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2602       }
2603       if (j== nz-1){
2604         tmp0  = tmps[vi[j]];
2605         sum1 -= v1[j] * tmp0;
2606         sum2 -= v2[j+1] * tmp0;
2607         sum3 -= v3[j+2] * tmp0;
2608       }
2609       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2610       sum2   -= v2[0]* tmp0;
2611       sum3   -= v3[1] * tmp0;
2612       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2613       sum3   -= v3[0]* tmp0;
2614       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2615 
2616       break;
2617     case 4 :
2618       sum1 = tmp[row];
2619       sum2 = tmp[row -1];
2620       sum3 = tmp[row -2];
2621       sum4 = tmp[row -3];
2622       v2   = aa + ad[row]+1;
2623       v3   = aa + ad[row -1]+1;
2624       v4   = aa + ad[row -2]+1;
2625 
2626       for (j=0 ; j<nz-1; j+=2){
2627         i0   = vi[j];
2628         i1   = vi[j+1];
2629         tmp0 = tmps[i0];
2630         tmp1 = tmps[i1];
2631         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2632         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2633         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2634         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2635       }
2636       if (j== nz-1){
2637         tmp0  = tmps[vi[j]];
2638         sum1 -= v1[j] * tmp0;
2639         sum2 -= v2[j+1] * tmp0;
2640         sum3 -= v3[j+2] * tmp0;
2641         sum4 -= v4[j+3] * tmp0;
2642       }
2643 
2644       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2645       sum2   -= v2[0] * tmp0;
2646       sum3   -= v3[1] * tmp0;
2647       sum4   -= v4[2] * tmp0;
2648       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2649       sum3   -= v3[0] * tmp0;
2650       sum4   -= v4[1] * tmp0;
2651       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2652       sum4   -= v4[0] * tmp0;
2653       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2654       break;
2655     case 5 :
2656       sum1 = tmp[row];
2657       sum2 = tmp[row -1];
2658       sum3 = tmp[row -2];
2659       sum4 = tmp[row -3];
2660       sum5 = tmp[row -4];
2661       v2   = aa + ad[row]+1;
2662       v3   = aa + ad[row -1]+1;
2663       v4   = aa + ad[row -2]+1;
2664       v5   = aa + ad[row -3]+1;
2665       for (j=0 ; j<nz-1; j+=2){
2666         i0   = vi[j];
2667         i1   = vi[j+1];
2668         tmp0 = tmps[i0];
2669         tmp1 = tmps[i1];
2670         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2671         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2672         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2673         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2674         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2675       }
2676       if (j==nz-1){
2677         tmp0  = tmps[vi[j]];
2678         sum1 -= v1[j] * tmp0;
2679         sum2 -= v2[j+1] * tmp0;
2680         sum3 -= v3[j+2] * tmp0;
2681         sum4 -= v4[j+3] * tmp0;
2682         sum5 -= v5[j+4] * tmp0;
2683       }
2684 
2685       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2686       sum2   -= v2[0] * tmp0;
2687       sum3   -= v3[1] * tmp0;
2688       sum4   -= v4[2] * tmp0;
2689       sum5   -= v5[3] * tmp0;
2690       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2691       sum3   -= v3[0] * tmp0;
2692       sum4   -= v4[1] * tmp0;
2693       sum5   -= v5[2] * tmp0;
2694       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2695       sum4   -= v4[0] * tmp0;
2696       sum5   -= v5[1] * tmp0;
2697       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2698       sum5   -= v5[0] * tmp0;
2699       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2700       break;
2701     default:
2702       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2703     }
2704   }
2705   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2706   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2707   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2708   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2709   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2710   PetscFunctionReturn(0);
2711 }
2712 
2713 
2714 /*
2715      Makes a longer coloring[] array and calls the usual code with that
2716 */
2717 #undef __FUNCT__
2718 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2719 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2720 {
2721   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2722   PetscErrorCode  ierr;
2723   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2724   PetscInt        *colorused,i;
2725   ISColoringValue *newcolor;
2726 
2727   PetscFunctionBegin;
2728   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2729   /* loop over inodes, marking a color for each column*/
2730   row = 0;
2731   for (i=0; i<m; i++){
2732     for (j=0; j<ns[i]; j++) {
2733       newcolor[row++] = coloring[i] + j*ncolors;
2734     }
2735   }
2736 
2737   /* eliminate unneeded colors */
2738   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2739   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2740   for (i=0; i<n; i++) {
2741     colorused[newcolor[i]] = 1;
2742   }
2743 
2744   for (i=1; i<5*ncolors; i++) {
2745     colorused[i] += colorused[i-1];
2746   }
2747   ncolors = colorused[5*ncolors-1];
2748   for (i=0; i<n; i++) {
2749     newcolor[i] = colorused[newcolor[i]]-1;
2750   }
2751   ierr = PetscFree(colorused);CHKERRQ(ierr);
2752   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2753   ierr = PetscFree(coloring);CHKERRQ(ierr);
2754   PetscFunctionReturn(0);
2755 }
2756 
2757 #include <../src/mat/blockinvert.h>
2758 
2759 #undef __FUNCT__
2760 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2761 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2762 {
2763   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2764   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2765   MatScalar          *ibdiag,*bdiag,work[25],*t;
2766   PetscScalar        *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2767   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2768   const PetscScalar  *xb, *b;
2769   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2770   PetscErrorCode     ierr;
2771   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2772   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2773 
2774   PetscFunctionBegin;
2775   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2776   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2777 
2778   if (!a->inode.ibdiagvalid) {
2779     if (!a->inode.ibdiag) {
2780       /* calculate space needed for diagonal blocks */
2781       for (i=0; i<m; i++) {
2782 	cnt += sizes[i]*sizes[i];
2783       }
2784       a->inode.bdiagsize = cnt;
2785       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2786     }
2787 
2788     /* copy over the diagonal blocks and invert them */
2789     ibdiag = a->inode.ibdiag;
2790     bdiag  = a->inode.bdiag;
2791     cnt = 0;
2792     for (i=0, row = 0; i<m; i++) {
2793       for (j=0; j<sizes[i]; j++) {
2794         for (k=0; k<sizes[i]; k++) {
2795           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2796         }
2797       }
2798       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2799 
2800       switch(sizes[i]) {
2801         case 1:
2802           /* Create matrix data structure */
2803           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2804           ibdiag[cnt] = 1.0/ibdiag[cnt];
2805           break;
2806         case 2:
2807           ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2808           break;
2809         case 3:
2810           ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2811           break;
2812         case 4:
2813           ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2814           break;
2815         case 5:
2816           ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2817           break;
2818        default:
2819 	 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2820       }
2821       cnt += sizes[i]*sizes[i];
2822       row += sizes[i];
2823     }
2824     a->inode.ibdiagvalid = PETSC_TRUE;
2825   }
2826   ibdiag = a->inode.ibdiag;
2827   bdiag  = a->inode.bdiag;
2828   t = a->inode.ssor_work;
2829 
2830   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2831   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2832   CHKMEMQ;
2833   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2834   if (flag & SOR_ZERO_INITIAL_GUESS) {
2835     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2836 
2837       for (i=0, row=0; i<m; i++) {
2838         sz  = diag[row] - ii[row];
2839         v1  = a->a + ii[row];
2840         idx = a->j + ii[row];
2841 
2842         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2843         switch (sizes[i]){
2844           case 1:
2845 
2846             sum1  = b[row];
2847             for(n = 0; n<sz-1; n+=2) {
2848               i1   = idx[0];
2849               i2   = idx[1];
2850               idx += 2;
2851               tmp0 = x[i1];
2852               tmp1 = x[i2];
2853               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2854             }
2855 
2856             if (n == sz-1){
2857               tmp0  = x[*idx];
2858               sum1 -= *v1 * tmp0;
2859             }
2860             t[row] = sum1;
2861             x[row++] = sum1*(*ibdiag++);
2862             break;
2863           case 2:
2864             v2    = a->a + ii[row+1];
2865             sum1  = b[row];
2866             sum2  = b[row+1];
2867             for(n = 0; n<sz-1; n+=2) {
2868               i1   = idx[0];
2869               i2   = idx[1];
2870               idx += 2;
2871               tmp0 = x[i1];
2872               tmp1 = x[i2];
2873               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2874               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2875             }
2876 
2877             if (n == sz-1){
2878               tmp0  = x[*idx];
2879               sum1 -= v1[0] * tmp0;
2880               sum2 -= v2[0] * tmp0;
2881             }
2882             t[row]   = sum1;
2883             t[row+1] = sum2;
2884             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2885             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2886             ibdiag  += 4;
2887             break;
2888           case 3:
2889             v2    = a->a + ii[row+1];
2890             v3    = a->a + ii[row+2];
2891             sum1  = b[row];
2892             sum2  = b[row+1];
2893             sum3  = b[row+2];
2894             for(n = 0; n<sz-1; n+=2) {
2895               i1   = idx[0];
2896               i2   = idx[1];
2897               idx += 2;
2898               tmp0 = x[i1];
2899               tmp1 = x[i2];
2900               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2901               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2902               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2903             }
2904 
2905             if (n == sz-1){
2906               tmp0  = x[*idx];
2907               sum1 -= v1[0] * tmp0;
2908               sum2 -= v2[0] * tmp0;
2909               sum3 -= v3[0] * tmp0;
2910             }
2911             t[row]   = sum1;
2912             t[row+1] = sum2;
2913             t[row+2] = sum3;
2914             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2915             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2916             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2917             ibdiag  += 9;
2918             break;
2919           case 4:
2920             v2    = a->a + ii[row+1];
2921             v3    = a->a + ii[row+2];
2922             v4    = a->a + ii[row+3];
2923             sum1  = b[row];
2924             sum2  = b[row+1];
2925             sum3  = b[row+2];
2926             sum4  = b[row+3];
2927             for(n = 0; n<sz-1; n+=2) {
2928               i1   = idx[0];
2929               i2   = idx[1];
2930               idx += 2;
2931               tmp0 = x[i1];
2932               tmp1 = x[i2];
2933               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2934               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2935               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2936               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2937             }
2938 
2939             if (n == sz-1){
2940               tmp0  = x[*idx];
2941               sum1 -= v1[0] * tmp0;
2942               sum2 -= v2[0] * tmp0;
2943               sum3 -= v3[0] * tmp0;
2944               sum4 -= v4[0] * tmp0;
2945             }
2946             t[row]   = sum1;
2947             t[row+1] = sum2;
2948             t[row+2] = sum3;
2949             t[row+3] = sum4;
2950             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2951             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2952             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2953             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2954             ibdiag  += 16;
2955             break;
2956           case 5:
2957             v2    = a->a + ii[row+1];
2958             v3    = a->a + ii[row+2];
2959             v4    = a->a + ii[row+3];
2960             v5    = a->a + ii[row+4];
2961             sum1  = b[row];
2962             sum2  = b[row+1];
2963             sum3  = b[row+2];
2964             sum4  = b[row+3];
2965             sum5  = b[row+4];
2966             for(n = 0; n<sz-1; n+=2) {
2967               i1   = idx[0];
2968               i2   = idx[1];
2969               idx += 2;
2970               tmp0 = x[i1];
2971               tmp1 = x[i2];
2972               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2973               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2974               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2975               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2976               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2977             }
2978 
2979             if (n == sz-1){
2980               tmp0  = x[*idx];
2981               sum1 -= v1[0] * tmp0;
2982               sum2 -= v2[0] * tmp0;
2983               sum3 -= v3[0] * tmp0;
2984               sum4 -= v4[0] * tmp0;
2985               sum5 -= v5[0] * tmp0;
2986             }
2987             t[row]   = sum1;
2988             t[row+1] = sum2;
2989             t[row+2] = sum3;
2990             t[row+3] = sum4;
2991             t[row+4] = sum5;
2992             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2993             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2994             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2995             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2996             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2997             ibdiag  += 25;
2998             break;
2999 	  default:
3000    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3001         }
3002       }
3003 
3004       xb = t;
3005       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3006     } else xb = b;
3007     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3008 
3009       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3010       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3011         ibdiag -= sizes[i]*sizes[i];
3012         sz      = ii[row+1] - diag[row] - 1;
3013         v1      = a->a + diag[row] + 1;
3014         idx     = a->j + diag[row] + 1;
3015 
3016         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3017         switch (sizes[i]){
3018           case 1:
3019 
3020             sum1  = xb[row];
3021             for(n = 0; n<sz-1; n+=2) {
3022               i1   = idx[0];
3023               i2   = idx[1];
3024               idx += 2;
3025               tmp0 = x[i1];
3026               tmp1 = x[i2];
3027               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3028             }
3029 
3030             if (n == sz-1){
3031               tmp0  = x[*idx];
3032               sum1 -= *v1*tmp0;
3033             }
3034             x[row--] = sum1*(*ibdiag);
3035             break;
3036 
3037           case 2:
3038 
3039             sum1  = xb[row];
3040             sum2  = xb[row-1];
3041             /* note that sum1 is associated with the second of the two rows */
3042             v2    = a->a + diag[row-1] + 2;
3043             for(n = 0; n<sz-1; n+=2) {
3044               i1   = idx[0];
3045               i2   = idx[1];
3046               idx += 2;
3047               tmp0 = x[i1];
3048               tmp1 = x[i2];
3049               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3050               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3051             }
3052 
3053             if (n == sz-1){
3054               tmp0  = x[*idx];
3055               sum1 -= *v1*tmp0;
3056               sum2 -= *v2*tmp0;
3057             }
3058             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3059             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3060             break;
3061           case 3:
3062 
3063             sum1  = xb[row];
3064             sum2  = xb[row-1];
3065             sum3  = xb[row-2];
3066             v2    = a->a + diag[row-1] + 2;
3067             v3    = a->a + diag[row-2] + 3;
3068             for(n = 0; n<sz-1; n+=2) {
3069               i1   = idx[0];
3070               i2   = idx[1];
3071               idx += 2;
3072               tmp0 = x[i1];
3073               tmp1 = x[i2];
3074               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3075               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3076               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3077             }
3078 
3079             if (n == sz-1){
3080               tmp0  = x[*idx];
3081               sum1 -= *v1*tmp0;
3082               sum2 -= *v2*tmp0;
3083               sum3 -= *v3*tmp0;
3084             }
3085             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3086             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3087             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3088             break;
3089           case 4:
3090 
3091             sum1  = xb[row];
3092             sum2  = xb[row-1];
3093             sum3  = xb[row-2];
3094             sum4  = xb[row-3];
3095             v2    = a->a + diag[row-1] + 2;
3096             v3    = a->a + diag[row-2] + 3;
3097             v4    = a->a + diag[row-3] + 4;
3098             for(n = 0; n<sz-1; n+=2) {
3099               i1   = idx[0];
3100               i2   = idx[1];
3101               idx += 2;
3102               tmp0 = x[i1];
3103               tmp1 = x[i2];
3104               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3105               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3106               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3107               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3108             }
3109 
3110             if (n == sz-1){
3111               tmp0  = x[*idx];
3112               sum1 -= *v1*tmp0;
3113               sum2 -= *v2*tmp0;
3114               sum3 -= *v3*tmp0;
3115               sum4 -= *v4*tmp0;
3116             }
3117             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3118             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3119             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3120             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3121             break;
3122           case 5:
3123 
3124             sum1  = xb[row];
3125             sum2  = xb[row-1];
3126             sum3  = xb[row-2];
3127             sum4  = xb[row-3];
3128             sum5  = xb[row-4];
3129             v2    = a->a + diag[row-1] + 2;
3130             v3    = a->a + diag[row-2] + 3;
3131             v4    = a->a + diag[row-3] + 4;
3132             v5    = a->a + diag[row-4] + 5;
3133             for(n = 0; n<sz-1; n+=2) {
3134               i1   = idx[0];
3135               i2   = idx[1];
3136               idx += 2;
3137               tmp0 = x[i1];
3138               tmp1 = x[i2];
3139               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3140               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3141               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3142               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3143               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3144             }
3145 
3146             if (n == sz-1){
3147               tmp0  = x[*idx];
3148               sum1 -= *v1*tmp0;
3149               sum2 -= *v2*tmp0;
3150               sum3 -= *v3*tmp0;
3151               sum4 -= *v4*tmp0;
3152               sum5 -= *v5*tmp0;
3153             }
3154             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3155             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3156             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3157             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3158             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3159             break;
3160 	  default:
3161    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3162         }
3163       }
3164 
3165       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3166     }
3167     its--;
3168   }
3169   while (its--) {
3170 
3171     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
3172       ibdiag = a->inode.ibdiag;
3173 
3174       for (i=0, row=0; i<m; i++) {
3175         sz  = ii[row + 1] - ii[row];
3176         v1  = a->a + ii[row];
3177         idx = a->j + ii[row];
3178 
3179         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3180         switch (sizes[i]){
3181           case 1:
3182 
3183             sum1  = b[row];
3184             for(n = 0; n<sz-1; n+=2) {
3185               i1   = idx[0];
3186               i2   = idx[1];
3187               idx += 2;
3188               tmp0 = x[i1];
3189               tmp1 = x[i2];
3190               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3191             }
3192 
3193             if (n == sz-1){
3194               tmp0  = x[*idx];
3195               sum1 -= *v1 * tmp0;
3196             }
3197 
3198             /* in MatSOR_SeqAIJ this line would be
3199              *
3200              * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3201              *
3202              * but omega == 1, so this becomes
3203              *
3204              * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++);
3205              *
3206              * but bdiag and ibdiag cancel each other, so we can change this
3207              * to adding sum1*(*ibdiag++).  We can skip bdiag for the larger
3208              * block sizes as well
3209              */
3210             x[row++] += sum1*(*ibdiag++);
3211             break;
3212           case 2:
3213             v2    = a->a + ii[row+1];
3214             sum1  = b[row];
3215             sum2  = b[row+1];
3216             for(n = 0; n<sz-1; n+=2) {
3217               i1   = idx[0];
3218               i2   = idx[1];
3219               idx += 2;
3220               tmp0 = x[i1];
3221               tmp1 = x[i2];
3222               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3223               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3224             }
3225 
3226             if (n == sz-1){
3227               tmp0  = x[*idx];
3228               sum1 -= v1[0] * tmp0;
3229               sum2 -= v2[0] * tmp0;
3230             }
3231             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2];
3232             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3];
3233             ibdiag  += 4;
3234             break;
3235           case 3:
3236             v2    = a->a + ii[row+1];
3237             v3    = a->a + ii[row+2];
3238             sum1  = b[row];
3239             sum2  = b[row+1];
3240             sum3  = b[row+2];
3241             for(n = 0; n<sz-1; n+=2) {
3242               i1   = idx[0];
3243               i2   = idx[1];
3244               idx += 2;
3245               tmp0 = x[i1];
3246               tmp1 = x[i2];
3247               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3248               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3249               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3250             }
3251 
3252             if (n == sz-1){
3253               tmp0  = x[*idx];
3254               sum1 -= v1[0] * tmp0;
3255               sum2 -= v2[0] * tmp0;
3256               sum3 -= v3[0] * tmp0;
3257             }
3258             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3259             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3260             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3261             ibdiag  += 9;
3262             break;
3263           case 4:
3264             v2    = a->a + ii[row+1];
3265             v3    = a->a + ii[row+2];
3266             v4    = a->a + ii[row+3];
3267             sum1  = b[row];
3268             sum2  = b[row+1];
3269             sum3  = b[row+2];
3270             sum4  = b[row+3];
3271             for(n = 0; n<sz-1; n+=2) {
3272               i1   = idx[0];
3273               i2   = idx[1];
3274               idx += 2;
3275               tmp0 = x[i1];
3276               tmp1 = x[i2];
3277               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3278               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3279               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3280               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3281             }
3282 
3283             if (n == sz-1){
3284               tmp0  = x[*idx];
3285               sum1 -= v1[0] * tmp0;
3286               sum2 -= v2[0] * tmp0;
3287               sum3 -= v3[0] * tmp0;
3288               sum4 -= v4[0] * tmp0;
3289             }
3290             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3291             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3292             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3293             x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3294             ibdiag  += 16;
3295             break;
3296           case 5:
3297             v2    = a->a + ii[row+1];
3298             v3    = a->a + ii[row+2];
3299             v4    = a->a + ii[row+3];
3300             v5    = a->a + ii[row+4];
3301             sum1  = b[row];
3302             sum2  = b[row+1];
3303             sum3  = b[row+2];
3304             sum4  = b[row+3];
3305             sum5  = b[row+4];
3306             for(n = 0; n<sz-1; n+=2) {
3307               i1   = idx[0];
3308               i2   = idx[1];
3309               idx += 2;
3310               tmp0 = x[i1];
3311               tmp1 = x[i2];
3312               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3313               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3314               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3315               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3316               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3317             }
3318 
3319             if (n == sz-1){
3320               tmp0  = x[*idx];
3321               sum1 -= v1[0] * tmp0;
3322               sum2 -= v2[0] * tmp0;
3323               sum3 -= v3[0] * tmp0;
3324               sum4 -= v4[0] * tmp0;
3325               sum5 -= v5[0] * tmp0;
3326             }
3327             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3328             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3329             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3330             x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3331             x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3332             ibdiag  += 25;
3333             break;
3334 	  default:
3335    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3336         }
3337       }
3338 
3339       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3340     }
3341     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3342 
3343       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3344       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3345         ibdiag -= sizes[i]*sizes[i];
3346         sz      = ii[row+1] - ii[row];
3347         v1      = a->a + ii[row];
3348         idx     = a->j + ii[row];
3349 
3350         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3351         switch (sizes[i]){
3352           case 1:
3353 
3354             sum1  = b[row];
3355             for(n = 0; n<sz-1; n+=2) {
3356               i1   = idx[0];
3357               i2   = idx[1];
3358               idx += 2;
3359               tmp0 = x[i1];
3360               tmp1 = x[i2];
3361               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3362             }
3363 
3364             if (n == sz-1){
3365               tmp0  = x[*idx];
3366               sum1 -= *v1*tmp0;
3367             }
3368             x[row--] += sum1*(*ibdiag);
3369             break;
3370 
3371           case 2:
3372 
3373             sum1  = b[row];
3374             sum2  = b[row-1];
3375             /* note that sum1 is associated with the second of the two rows */
3376             v2    = a->a + ii[row - 1];
3377             for(n = 0; n<sz-1; n+=2) {
3378               i1   = idx[0];
3379               i2   = idx[1];
3380               idx += 2;
3381               tmp0 = x[i1];
3382               tmp1 = x[i2];
3383               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3384               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3385             }
3386 
3387             if (n == sz-1){
3388               tmp0  = x[*idx];
3389               sum1 -= *v1*tmp0;
3390               sum2 -= *v2*tmp0;
3391             }
3392             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3393             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3394             break;
3395           case 3:
3396 
3397             sum1  = b[row];
3398             sum2  = b[row-1];
3399             sum3  = b[row-2];
3400             v2    = a->a + ii[row-1];
3401             v3    = a->a + ii[row-2];
3402             for(n = 0; n<sz-1; n+=2) {
3403               i1   = idx[0];
3404               i2   = idx[1];
3405               idx += 2;
3406               tmp0 = x[i1];
3407               tmp1 = x[i2];
3408               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3409               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3410               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3411             }
3412 
3413             if (n == sz-1){
3414               tmp0  = x[*idx];
3415               sum1 -= *v1*tmp0;
3416               sum2 -= *v2*tmp0;
3417               sum3 -= *v3*tmp0;
3418             }
3419             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3420             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3421             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3422             break;
3423           case 4:
3424 
3425             sum1  = b[row];
3426             sum2  = b[row-1];
3427             sum3  = b[row-2];
3428             sum4  = b[row-3];
3429             v2    = a->a + ii[row-1];
3430             v3    = a->a + ii[row-2];
3431             v4    = a->a + ii[row-3];
3432             for(n = 0; n<sz-1; n+=2) {
3433               i1   = idx[0];
3434               i2   = idx[1];
3435               idx += 2;
3436               tmp0 = x[i1];
3437               tmp1 = x[i2];
3438               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3439               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3440               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3441               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3442             }
3443 
3444             if (n == sz-1){
3445               tmp0  = x[*idx];
3446               sum1 -= *v1*tmp0;
3447               sum2 -= *v2*tmp0;
3448               sum3 -= *v3*tmp0;
3449               sum4 -= *v4*tmp0;
3450             }
3451             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3452             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3453             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3454             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3455             break;
3456           case 5:
3457 
3458             sum1  = b[row];
3459             sum2  = b[row-1];
3460             sum3  = b[row-2];
3461             sum4  = b[row-3];
3462             sum5  = b[row-4];
3463             v2    = a->a + ii[row-1];
3464             v3    = a->a + ii[row-2];
3465             v4    = a->a + ii[row-3];
3466             v5    = a->a + ii[row-4];
3467             for(n = 0; n<sz-1; n+=2) {
3468               i1   = idx[0];
3469               i2   = idx[1];
3470               idx += 2;
3471               tmp0 = x[i1];
3472               tmp1 = x[i2];
3473               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3474               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3475               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3476               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3477               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3478             }
3479 
3480             if (n == sz-1){
3481               tmp0  = x[*idx];
3482               sum1 -= *v1*tmp0;
3483               sum2 -= *v2*tmp0;
3484               sum3 -= *v3*tmp0;
3485               sum4 -= *v4*tmp0;
3486               sum5 -= *v5*tmp0;
3487             }
3488             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3489             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3490             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3491             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3492             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3493             break;
3494 	  default:
3495    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3496         }
3497       }
3498 
3499       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3500     }
3501   }
3502   if (flag & SOR_EISENSTAT) {
3503     /*
3504           Apply  (U + D)^-1  where D is now the block diagonal
3505     */
3506     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3507     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3508       ibdiag -= sizes[i]*sizes[i];
3509       sz      = ii[row+1] - diag[row] - 1;
3510       v1      = a->a + diag[row] + 1;
3511       idx     = a->j + diag[row] + 1;
3512       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3513       switch (sizes[i]){
3514         case 1:
3515 
3516 	  sum1  = b[row];
3517 	  for(n = 0; n<sz-1; n+=2) {
3518 	    i1   = idx[0];
3519 	    i2   = idx[1];
3520 	    idx += 2;
3521 	    tmp0 = x[i1];
3522 	    tmp1 = x[i2];
3523 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3524 	  }
3525 
3526 	  if (n == sz-1){
3527 	    tmp0  = x[*idx];
3528 	    sum1 -= *v1*tmp0;
3529 	  }
3530 	  x[row] = sum1*(*ibdiag);row--;
3531 	  break;
3532 
3533         case 2:
3534 
3535 	  sum1  = b[row];
3536 	  sum2  = b[row-1];
3537 	  /* note that sum1 is associated with the second of the two rows */
3538 	  v2    = a->a + diag[row-1] + 2;
3539 	  for(n = 0; n<sz-1; n+=2) {
3540 	    i1   = idx[0];
3541 	    i2   = idx[1];
3542 	    idx += 2;
3543 	    tmp0 = x[i1];
3544 	    tmp1 = x[i2];
3545 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3546 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3547 	  }
3548 
3549 	  if (n == sz-1){
3550 	    tmp0  = x[*idx];
3551 	    sum1 -= *v1*tmp0;
3552 	    sum2 -= *v2*tmp0;
3553 	  }
3554 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3555 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3556           row -= 2;
3557 	  break;
3558         case 3:
3559 
3560 	  sum1  = b[row];
3561 	  sum2  = b[row-1];
3562 	  sum3  = b[row-2];
3563 	  v2    = a->a + diag[row-1] + 2;
3564 	  v3    = a->a + diag[row-2] + 3;
3565 	  for(n = 0; n<sz-1; n+=2) {
3566 	    i1   = idx[0];
3567 	    i2   = idx[1];
3568 	    idx += 2;
3569 	    tmp0 = x[i1];
3570 	    tmp1 = x[i2];
3571 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3572 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3573 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3574 	  }
3575 
3576 	  if (n == sz-1){
3577 	    tmp0  = x[*idx];
3578 	    sum1 -= *v1*tmp0;
3579 	    sum2 -= *v2*tmp0;
3580 	    sum3 -= *v3*tmp0;
3581 	  }
3582 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3583 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3584 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3585           row -= 3;
3586 	  break;
3587         case 4:
3588 
3589 	  sum1  = b[row];
3590 	  sum2  = b[row-1];
3591 	  sum3  = b[row-2];
3592 	  sum4  = b[row-3];
3593 	  v2    = a->a + diag[row-1] + 2;
3594 	  v3    = a->a + diag[row-2] + 3;
3595 	  v4    = a->a + diag[row-3] + 4;
3596 	  for(n = 0; n<sz-1; n+=2) {
3597 	    i1   = idx[0];
3598 	    i2   = idx[1];
3599 	    idx += 2;
3600 	    tmp0 = x[i1];
3601 	    tmp1 = x[i2];
3602 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3603 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3604 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3605 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3606 	  }
3607 
3608 	  if (n == sz-1){
3609 	    tmp0  = x[*idx];
3610 	    sum1 -= *v1*tmp0;
3611 	    sum2 -= *v2*tmp0;
3612 	    sum3 -= *v3*tmp0;
3613 	    sum4 -= *v4*tmp0;
3614 	  }
3615 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3616 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3617 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3618 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3619           row -= 4;
3620 	  break;
3621         case 5:
3622 
3623 	  sum1  = b[row];
3624 	  sum2  = b[row-1];
3625 	  sum3  = b[row-2];
3626 	  sum4  = b[row-3];
3627 	  sum5  = b[row-4];
3628 	  v2    = a->a + diag[row-1] + 2;
3629 	  v3    = a->a + diag[row-2] + 3;
3630 	  v4    = a->a + diag[row-3] + 4;
3631 	  v5    = a->a + diag[row-4] + 5;
3632 	  for(n = 0; n<sz-1; n+=2) {
3633 	    i1   = idx[0];
3634 	    i2   = idx[1];
3635 	    idx += 2;
3636 	    tmp0 = x[i1];
3637 	    tmp1 = x[i2];
3638 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3639 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3640 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3641 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3642 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3643 	  }
3644 
3645 	  if (n == sz-1){
3646 	    tmp0  = x[*idx];
3647 	    sum1 -= *v1*tmp0;
3648 	    sum2 -= *v2*tmp0;
3649 	    sum3 -= *v3*tmp0;
3650 	    sum4 -= *v4*tmp0;
3651 	    sum5 -= *v5*tmp0;
3652 	  }
3653 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3654 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3655 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3656 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3657 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3658           row -= 5;
3659 	  break;
3660         default:
3661 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3662       }
3663     }
3664     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3665 
3666     /*
3667            t = b - D x    where D is the block diagonal
3668     */
3669     cnt = 0;
3670     for (i=0, row=0; i<m; i++) {
3671       switch (sizes[i]){
3672         case 1:
3673 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3674 	  break;
3675         case 2:
3676 	  x1   = x[row]; x2 = x[row+1];
3677 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3678 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3679 	  t[row]   = b[row] - tmp1;
3680 	  t[row+1] = b[row+1] - tmp2; row += 2;
3681 	  cnt += 4;
3682 	  break;
3683         case 3:
3684 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3685 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3686 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3687 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3688 	  t[row] = b[row] - tmp1;
3689 	  t[row+1] = b[row+1] - tmp2;
3690 	  t[row+2] = b[row+2] - tmp3; row += 3;
3691 	  cnt += 9;
3692 	  break;
3693         case 4:
3694 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3695 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3696 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3697 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3698 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3699 	  t[row] = b[row] - tmp1;
3700 	  t[row+1] = b[row+1] - tmp2;
3701 	  t[row+2] = b[row+2] - tmp3;
3702 	  t[row+3] = b[row+3] - tmp4; row += 4;
3703 	  cnt += 16;
3704 	  break;
3705         case 5:
3706 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3707 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3708 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3709 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3710 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3711 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3712 	  t[row] = b[row] - tmp1;
3713 	  t[row+1] = b[row+1] - tmp2;
3714 	  t[row+2] = b[row+2] - tmp3;
3715 	  t[row+3] = b[row+3] - tmp4;
3716 	  t[row+4] = b[row+4] - tmp5;row += 5;
3717 	  cnt += 25;
3718 	  break;
3719         default:
3720 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3721       }
3722     }
3723     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3724 
3725 
3726 
3727     /*
3728           Apply (L + D)^-1 where D is the block diagonal
3729     */
3730     for (i=0, row=0; i<m; i++) {
3731       sz  = diag[row] - ii[row];
3732       v1  = a->a + ii[row];
3733       idx = a->j + ii[row];
3734       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3735       switch (sizes[i]){
3736         case 1:
3737 
3738 	  sum1  = t[row];
3739 	  for(n = 0; n<sz-1; n+=2) {
3740 	    i1   = idx[0];
3741 	    i2   = idx[1];
3742 	    idx += 2;
3743 	    tmp0 = t[i1];
3744 	    tmp1 = t[i2];
3745 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3746 	  }
3747 
3748 	  if (n == sz-1){
3749 	    tmp0  = t[*idx];
3750 	    sum1 -= *v1 * tmp0;
3751 	  }
3752 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3753 	  break;
3754         case 2:
3755 	  v2    = a->a + ii[row+1];
3756 	  sum1  = t[row];
3757 	  sum2  = t[row+1];
3758 	  for(n = 0; n<sz-1; n+=2) {
3759 	    i1   = idx[0];
3760 	    i2   = idx[1];
3761 	    idx += 2;
3762 	    tmp0 = t[i1];
3763 	    tmp1 = t[i2];
3764 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3765 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3766 	  }
3767 
3768 	  if (n == sz-1){
3769 	    tmp0  = t[*idx];
3770 	    sum1 -= v1[0] * tmp0;
3771 	    sum2 -= v2[0] * tmp0;
3772 	  }
3773 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3774 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3775 	  ibdiag  += 4; row += 2;
3776 	  break;
3777         case 3:
3778 	  v2    = a->a + ii[row+1];
3779 	  v3    = a->a + ii[row+2];
3780 	  sum1  = t[row];
3781 	  sum2  = t[row+1];
3782 	  sum3  = t[row+2];
3783 	  for(n = 0; n<sz-1; n+=2) {
3784 	    i1   = idx[0];
3785 	    i2   = idx[1];
3786 	    idx += 2;
3787 	    tmp0 = t[i1];
3788 	    tmp1 = t[i2];
3789 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3790 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3791 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3792 	  }
3793 
3794 	  if (n == sz-1){
3795 	    tmp0  = t[*idx];
3796 	    sum1 -= v1[0] * tmp0;
3797 	    sum2 -= v2[0] * tmp0;
3798 	    sum3 -= v3[0] * tmp0;
3799 	  }
3800 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3801 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3802 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3803 	  ibdiag  += 9; row += 3;
3804 	  break;
3805         case 4:
3806 	  v2    = a->a + ii[row+1];
3807 	  v3    = a->a + ii[row+2];
3808 	  v4    = a->a + ii[row+3];
3809 	  sum1  = t[row];
3810 	  sum2  = t[row+1];
3811 	  sum3  = t[row+2];
3812 	  sum4  = t[row+3];
3813 	  for(n = 0; n<sz-1; n+=2) {
3814 	    i1   = idx[0];
3815 	    i2   = idx[1];
3816 	    idx += 2;
3817 	    tmp0 = t[i1];
3818 	    tmp1 = t[i2];
3819 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3820 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3821 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3822 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3823 	  }
3824 
3825 	  if (n == sz-1){
3826 	    tmp0  = t[*idx];
3827 	    sum1 -= v1[0] * tmp0;
3828 	    sum2 -= v2[0] * tmp0;
3829 	    sum3 -= v3[0] * tmp0;
3830 	    sum4 -= v4[0] * tmp0;
3831 	  }
3832 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3833 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3834 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3835 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3836 	  ibdiag  += 16; row += 4;
3837 	  break;
3838         case 5:
3839 	  v2    = a->a + ii[row+1];
3840 	  v3    = a->a + ii[row+2];
3841 	  v4    = a->a + ii[row+3];
3842 	  v5    = a->a + ii[row+4];
3843 	  sum1  = t[row];
3844 	  sum2  = t[row+1];
3845 	  sum3  = t[row+2];
3846 	  sum4  = t[row+3];
3847 	  sum5  = t[row+4];
3848 	  for(n = 0; n<sz-1; n+=2) {
3849 	    i1   = idx[0];
3850 	    i2   = idx[1];
3851 	    idx += 2;
3852 	    tmp0 = t[i1];
3853 	    tmp1 = t[i2];
3854 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3855 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3856 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3857 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3858 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3859 	  }
3860 
3861 	  if (n == sz-1){
3862 	    tmp0  = t[*idx];
3863 	    sum1 -= v1[0] * tmp0;
3864 	    sum2 -= v2[0] * tmp0;
3865 	    sum3 -= v3[0] * tmp0;
3866 	    sum4 -= v4[0] * tmp0;
3867 	    sum5 -= v5[0] * tmp0;
3868 	  }
3869 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3870 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3871 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3872 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3873 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3874 	  ibdiag  += 25; row += 5;
3875 	  break;
3876         default:
3877 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3878       }
3879     }
3880     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3881   }
3882   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3883   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3884   CHKMEMQ;  PetscFunctionReturn(0);
3885 }
3886 
3887 #undef __FUNCT__
3888 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3889 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3890 {
3891   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3892   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3893   const MatScalar    *bdiag = a->inode.bdiag;
3894   const PetscScalar  *b;
3895   PetscErrorCode      ierr;
3896   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3897   const PetscInt      *sizes = a->inode.size;
3898 
3899   PetscFunctionBegin;
3900   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3901   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3902   cnt = 0;
3903   for (i=0, row=0; i<m; i++) {
3904     switch (sizes[i]){
3905       case 1:
3906 	x[row] = b[row]*bdiag[cnt++];row++;
3907 	break;
3908       case 2:
3909 	x1   = b[row]; x2 = b[row+1];
3910 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3911 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3912 	x[row++] = tmp1;
3913 	x[row++] = tmp2;
3914 	cnt += 4;
3915 	break;
3916       case 3:
3917 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3918 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3919 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3920 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3921 	x[row++] = tmp1;
3922 	x[row++] = tmp2;
3923 	x[row++] = tmp3;
3924 	cnt += 9;
3925 	break;
3926       case 4:
3927 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3928 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3929 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3930 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3931 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3932 	x[row++] = tmp1;
3933 	x[row++] = tmp2;
3934 	x[row++] = tmp3;
3935 	x[row++] = tmp4;
3936 	cnt += 16;
3937 	break;
3938       case 5:
3939 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3940 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3941 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3942 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3943 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3944 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3945 	x[row++] = tmp1;
3946 	x[row++] = tmp2;
3947 	x[row++] = tmp3;
3948 	x[row++] = tmp4;
3949 	x[row++] = tmp5;
3950 	cnt += 25;
3951 	break;
3952       default:
3953 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3954     }
3955   }
3956   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3957   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3958   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3959   PetscFunctionReturn(0);
3960 }
3961 
3962 /*
3963     samestructure indicates that the matrix has not changed its nonzero structure so we
3964     do not need to recompute the inodes
3965 */
3966 #undef __FUNCT__
3967 #define __FUNCT__ "Mat_CheckInode"
3968 PetscErrorCode Mat_CheckInode(Mat A,PetscBool  samestructure)
3969 {
3970   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3971   PetscErrorCode ierr;
3972   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3973   PetscBool      flag;
3974 
3975   PetscFunctionBegin;
3976   if (!a->inode.use)                     PetscFunctionReturn(0);
3977   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3978 
3979 
3980   m = A->rmap->n;
3981   if (a->inode.size) {ns = a->inode.size;}
3982   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3983 
3984   i          = 0;
3985   node_count = 0;
3986   idx        = a->j;
3987   ii         = a->i;
3988   while (i < m){                /* For each row */
3989     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3990     /* Limits the number of elements in a node to 'a->inode.limit' */
3991     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3992       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3993       if (nzy != nzx) break;
3994       idy  += nzx;             /* Same nonzero pattern */
3995       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3996       if (!flag) break;
3997     }
3998     ns[node_count++] = blk_size;
3999     idx += blk_size*nzx;
4000     i    = j;
4001   }
4002   /* If not enough inodes found,, do not use inode version of the routines */
4003   if (!m || node_count > .8*m) {
4004     ierr = PetscFree(ns);CHKERRQ(ierr);
4005     a->inode.node_count       = 0;
4006     a->inode.size             = PETSC_NULL;
4007     a->inode.use              = PETSC_FALSE;
4008     A->ops->mult              = MatMult_SeqAIJ;
4009     A->ops->sor               = MatSOR_SeqAIJ;
4010     A->ops->multadd           = MatMultAdd_SeqAIJ;
4011     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4012     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4013     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4014     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4015     A->ops->coloringpatch     = 0;
4016     A->ops->multdiagonalblock = 0;
4017     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4018   } else {
4019     if (!A->factortype) {
4020       A->ops->mult              = MatMult_SeqAIJ_Inode;
4021       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4022       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4023       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4024       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4025       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4026       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4027       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4028       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4029     } else {
4030       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
4031     }
4032     a->inode.node_count       = node_count;
4033     a->inode.size             = ns;
4034     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4035   }
4036   a->inode.checked = PETSC_TRUE;
4037   PetscFunctionReturn(0);
4038 }
4039 
4040 #undef __FUNCT__
4041 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4042 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4043 {
4044   Mat            B=*C;
4045   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4046   PetscErrorCode ierr;
4047   PetscInt       m=A->rmap->n;
4048 
4049   PetscFunctionBegin;
4050 
4051   c->inode.use          = a->inode.use;
4052   c->inode.limit        = a->inode.limit;
4053   c->inode.max_limit    = a->inode.max_limit;
4054   if (a->inode.size){
4055     ierr                = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr);
4056     c->inode.node_count = a->inode.node_count;
4057     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4058     /* note the table of functions below should match that in Mat_CheckInode() */
4059     if (!B->factortype) {
4060       B->ops->mult              = MatMult_SeqAIJ_Inode;
4061       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4062       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4063       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4064       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4065       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4066       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4067       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4068       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4069     } else {
4070       B->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
4071     }
4072   } else {
4073     c->inode.size       = 0;
4074     c->inode.node_count = 0;
4075   }
4076   c->inode.ibdiagvalid = PETSC_FALSE;
4077   c->inode.ibdiag      = 0;
4078   c->inode.bdiag       = 0;
4079   PetscFunctionReturn(0);
4080 }
4081 
4082 #undef __FUNCT__
4083 #define __FUNCT__ "MatGetRow_FactoredLU"
4084 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
4085 {
4086   PetscInt k, *vi;
4087   PetscFunctionBegin;
4088   vi = aj + ai[row];
4089   for(k=0;k<nzl;k++) cols[k] = vi[k];
4090   vi = aj + adiag[row];
4091   cols[nzl] = vi[0];
4092   vi = aj + adiag[row+1]+1;
4093   for(k=0;k<nzu;k++) cols[nzl+1+k] = vi[k];
4094   PetscFunctionReturn(0);
4095 }
4096 /*
4097    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4098    Modified from Mat_CheckInode().
4099 
4100    Input Parameters:
4101 +  Mat A - ILU or LU matrix factor
4102 -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
4103     do not need to recompute the inodes
4104 */
4105 #undef __FUNCT__
4106 #define __FUNCT__ "Mat_CheckInode_FactorLU"
4107 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool  samestructure)
4108 {
4109   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4110   PetscErrorCode ierr;
4111   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4112   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
4113   PetscBool      flag;
4114 
4115   PetscFunctionBegin;
4116   if (!a->inode.use)                     PetscFunctionReturn(0);
4117   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
4118 
4119   m = A->rmap->n;
4120   if (a->inode.size) {ns = a->inode.size;}
4121   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
4122 
4123   i          = 0;
4124   node_count = 0;
4125   while (i < m){                /* For each row */
4126     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4127     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4128     nzx  = nzl1 + nzu1 + 1;
4129     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
4130     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4131 
4132     /* Limits the number of elements in a node to 'a->inode.limit' */
4133     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4134       nzl2    = ai[j+1] - ai[j];
4135       nzu2    = adiag[j] - adiag[j+1] - 1;
4136       nzy     = nzl2 + nzu2 + 1;
4137       if( nzy != nzx) break;
4138       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
4139       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4140       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4141       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
4142       ierr = PetscFree(cols2);CHKERRQ(ierr);
4143     }
4144     ns[node_count++] = blk_size;
4145     ierr = PetscFree(cols1);CHKERRQ(ierr);
4146     i    = j;
4147   }
4148   /* If not enough inodes found,, do not use inode version of the routines */
4149   if (!m || node_count > .8*m) {
4150     ierr = PetscFree(ns);CHKERRQ(ierr);
4151     a->inode.node_count     = 0;
4152     a->inode.size           = PETSC_NULL;
4153     a->inode.use            = PETSC_FALSE;
4154     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4155   } else {
4156     A->ops->mult              = 0;
4157     A->ops->sor               = 0;
4158     A->ops->multadd           = 0;
4159     A->ops->getrowij          = 0;
4160     A->ops->restorerowij      = 0;
4161     A->ops->getcolumnij       = 0;
4162     A->ops->restorecolumnij   = 0;
4163     A->ops->coloringpatch     = 0;
4164     A->ops->multdiagonalblock = 0;
4165     A->ops->solve             = MatSolve_SeqAIJ_Inode;
4166     a->inode.node_count       = node_count;
4167     a->inode.size             = ns;
4168     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4169   }
4170   a->inode.checked = PETSC_TRUE;
4171   PetscFunctionReturn(0);
4172 }
4173 
4174 /*
4175      This is really ugly. if inodes are used this replaces the
4176   permutations with ones that correspond to rows/cols of the matrix
4177   rather then inode blocks
4178 */
4179 #undef __FUNCT__
4180 #define __FUNCT__ "MatInodeAdjustForInodes"
4181 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4182 {
4183   PetscErrorCode ierr;
4184 
4185   PetscFunctionBegin;
4186   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4187   PetscFunctionReturn(0);
4188 }
4189 
4190 EXTERN_C_BEGIN
4191 #undef __FUNCT__
4192 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4193 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4194 {
4195   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
4196   PetscErrorCode ierr;
4197   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4198   const PetscInt *ridx,*cidx;
4199   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4200   PetscInt       nslim_col,*ns_col;
4201   IS             ris = *rperm,cis = *cperm;
4202 
4203   PetscFunctionBegin;
4204   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4205   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4206 
4207   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4208   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
4209   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
4210 
4211   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4212   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4213 
4214   /* Form the inode structure for the rows of permuted matric using inv perm*/
4215   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4216 
4217   /* Construct the permutations for rows*/
4218   for (i=0,row = 0; i<nslim_row; ++i){
4219     indx      = ridx[i];
4220     start_val = tns[indx];
4221     end_val   = tns[indx + 1];
4222     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4223   }
4224 
4225   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4226   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4227 
4228  /* Construct permutations for columns */
4229   for (i=0,col=0; i<nslim_col; ++i){
4230     indx      = cidx[i];
4231     start_val = tns[indx];
4232     end_val   = tns[indx + 1];
4233     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4234   }
4235 
4236   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4237   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4238   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4239   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4240 
4241   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4242   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4243 
4244   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4245   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4246   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4247   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4248   ierr = PetscFree(tns);CHKERRQ(ierr);
4249   PetscFunctionReturn(0);
4250 }
4251 EXTERN_C_END
4252 
4253 #undef __FUNCT__
4254 #define __FUNCT__ "MatInodeGetInodeSizes"
4255 /*@C
4256    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4257 
4258    Not Collective
4259 
4260    Input Parameter:
4261 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4262 
4263    Output Parameter:
4264 +  node_count - no of inodes present in the matrix.
4265 .  sizes      - an array of size node_count,with sizes of each inode.
4266 -  limit      - the max size used to generate the inodes.
4267 
4268    Level: advanced
4269 
4270    Notes: This routine returns some internal storage information
4271    of the matrix, it is intended to be used by advanced users.
4272    It should be called after the matrix is assembled.
4273    The contents of the sizes[] array should not be changed.
4274    PETSC_NULL may be passed for information not requested.
4275 
4276 .keywords: matrix, seqaij, get, inode
4277 
4278 .seealso: MatGetInfo()
4279 @*/
4280 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4281 {
4282   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4283 
4284   PetscFunctionBegin;
4285   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4286   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
4287   if (f) {
4288     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4289   }
4290   PetscFunctionReturn(0);
4291 }
4292 
4293 EXTERN_C_BEGIN
4294 #undef __FUNCT__
4295 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4296 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4297 {
4298   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4299 
4300   PetscFunctionBegin;
4301   if (node_count) *node_count = a->inode.node_count;
4302   if (sizes)      *sizes      = a->inode.size;
4303   if (limit)      *limit      = a->inode.limit;
4304   PetscFunctionReturn(0);
4305 }
4306 EXTERN_C_END
4307