xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 030f984af8d8bb4c203755d35bded3c05b3d83ce)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9 {
10   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
11   PetscErrorCode ierr;
12   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
13 
14   PetscFunctionBegin;
15   n = A->cmap->n;
16   m = A->rmap->n;
17   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
18   ns_row = a->inode.size;
19 
20   min_mn = (m < n) ? m : n;
21   if (!ns) {
22     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
23     for (; count+1 < n; count++,i++) ;
24     if (count < n)  {
25       i++;
26     }
27     *size = i;
28     PetscFunctionReturn(0);
29   }
30   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
31 
32   /* Use the same row structure wherever feasible. */
33   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
34     ns_col[i] = ns_row[i];
35   }
36 
37   /* if m < n; pad up the remainder with inode_limit */
38   for (; count+1 < n; count++,i++) {
39     ns_col[i] = 1;
40   }
41   /* The last node is the odd ball. padd it up with the remaining rows; */
42   if (count < n) {
43     ns_col[i] = n - count;
44     i++;
45   } else if (count > n) {
46     /* Adjust for the over estimation */
47     ns_col[i-1] += n - count;
48   }
49   *size = i;
50   *ns   = ns_col;
51   PetscFunctionReturn(0);
52 }
53 
54 /*
55       This builds symmetric version of nonzero structure,
56 */
57 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
58 {
59   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
60   PetscErrorCode ierr;
61   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
62   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
63   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
64 
65   PetscFunctionBegin;
66   nslim_row = a->inode.node_count;
67   m         = A->rmap->n;
68   n         = A->cmap->n;
69   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
70   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
71 
72   /* Use the row_inode as column_inode */
73   nslim_col = nslim_row;
74   ns_col    = ns_row;
75 
76   /* allocate space for reformated inode structure */
77   ierr = PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);CHKERRQ(ierr);
78   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
79 
80   for (i1=0,col=0; i1<nslim_col; ++i1) {
81     nsz = ns_col[i1];
82     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
83   }
84   /* allocate space for row pointers */
85   ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
86   *iia = ia;
87   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
88 
89   /* determine the number of columns in each row */
90   ia[0] = oshift;
91   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
92 
93     j    = aj + ai[row] + ishift;
94     jmax = aj + ai[row+1] + ishift;
95     if (j==jmax) continue; /* empty row */
96     col  = *j++ + ishift;
97     i2   = tvc[col];
98     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
99       ia[i1+1]++;
100       ia[i2+1]++;
101       i2++;                     /* Start col of next node */
102       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
103       i2 = tvc[col];
104     }
105     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
106   }
107 
108   /* shift ia[i] to point to next row */
109   for (i1=1; i1<nslim_row+1; i1++) {
110     row        = ia[i1-1];
111     ia[i1]    += row;
112     work[i1-1] = row - oshift;
113   }
114 
115   /* allocate space for column pointers */
116   nz   = ia[nslim_row] + (!ishift);
117   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
118   *jja = ja;
119 
120   /* loop over lower triangular part putting into ja */
121   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
122     j    = aj + ai[row] + ishift;
123     jmax = aj + ai[row+1] + ishift;
124     if (j==jmax) continue; /* empty row */
125     col  = *j++ + ishift;
126     i2   = tvc[col];
127     while (i2<i1 && j<jmax) {
128       ja[work[i2]++] = i1 + oshift;
129       ja[work[i1]++] = i2 + oshift;
130       ++i2;
131       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
132       i2 = tvc[col];
133     }
134     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
135 
136   }
137   ierr = PetscFree(work);CHKERRQ(ierr);
138   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
139   PetscFunctionReturn(0);
140 }
141 
142 /*
143       This builds nonsymmetric version of nonzero structure,
144 */
145 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
146 {
147   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
148   PetscErrorCode ierr;
149   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
150   PetscInt       *tns,*tvc,nsz,i1,i2;
151   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
152 
153   PetscFunctionBegin;
154   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
155   nslim_row = a->inode.node_count;
156   n         = A->cmap->n;
157 
158   /* Create The column_inode for this matrix */
159   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
160 
161   /* allocate space for reformated column_inode structure */
162   ierr = PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);CHKERRQ(ierr);
163   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
164 
165   for (i1=0,col=0; i1<nslim_col; ++i1) {
166     nsz = ns_col[i1];
167     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
168   }
169   /* allocate space for row pointers */
170   ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
171   *iia = ia;
172   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
173 
174   /* determine the number of columns in each row */
175   ia[0] = oshift;
176   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
177     j   = aj + ai[row] + ishift;
178     nz  = ai[row+1] - ai[row];
179     if (!nz) continue; /* empty row */
180     col = *j++ + ishift;
181     i2  = tvc[col];
182     while (nz-- > 0) {           /* off-diagonal elemets */
183       ia[i1+1]++;
184       i2++;                     /* Start col of next node */
185       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
186       if (nz > 0) i2 = tvc[col];
187     }
188   }
189 
190   /* shift ia[i] to point to next row */
191   for (i1=1; i1<nslim_row+1; i1++) {
192     row        = ia[i1-1];
193     ia[i1]    += row;
194     work[i1-1] = row - oshift;
195   }
196 
197   /* allocate space for column pointers */
198   nz   = ia[nslim_row] + (!ishift);
199   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
200   *jja = ja;
201 
202   /* loop over matrix putting into ja */
203   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
204     j   = aj + ai[row] + ishift;
205     nz  = ai[row+1] - ai[row];
206     if (!nz) continue; /* empty row */
207     col = *j++ + ishift;
208     i2  = tvc[col];
209     while (nz-- > 0) {
210       ja[work[i1]++] = i2 + oshift;
211       ++i2;
212       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
213       if (nz > 0) i2 = tvc[col];
214     }
215   }
216   ierr = PetscFree(ns_col);CHKERRQ(ierr);
217   ierr = PetscFree(work);CHKERRQ(ierr);
218   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
219   PetscFunctionReturn(0);
220 }
221 
222 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
223 {
224   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
225   PetscErrorCode ierr;
226 
227   PetscFunctionBegin;
228   *n = a->inode.node_count;
229   if (!ia) PetscFunctionReturn(0);
230   if (!blockcompressed) {
231     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
232   } else if (symmetric) {
233     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
234   } else {
235     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
236   }
237   PetscFunctionReturn(0);
238 }
239 
240 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
241 {
242   PetscErrorCode ierr;
243 
244   PetscFunctionBegin;
245   if (!ia) PetscFunctionReturn(0);
246 
247   if (!blockcompressed) {
248     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
249   } else {
250     ierr = PetscFree(*ia);CHKERRQ(ierr);
251     ierr = PetscFree(*ja);CHKERRQ(ierr);
252   }
253   PetscFunctionReturn(0);
254 }
255 
256 /* ----------------------------------------------------------- */
257 
258 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
259 {
260   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
261   PetscErrorCode ierr;
262   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
263   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
264 
265   PetscFunctionBegin;
266   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
267   nslim_row = a->inode.node_count;
268   n         = A->cmap->n;
269 
270   /* Create The column_inode for this matrix */
271   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
272 
273   /* allocate space for reformated column_inode structure */
274   ierr = PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);CHKERRQ(ierr);
275   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
276 
277   for (i1=0,col=0; i1<nslim_col; ++i1) {
278     nsz = ns_col[i1];
279     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
280   }
281   /* allocate space for column pointers */
282   ierr = PetscCalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
283   *iia = ia;
284   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
285 
286   /* determine the number of columns in each row */
287   ia[0] = oshift;
288   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
289     j   = aj + ai[row] + ishift;
290     col = *j++ + ishift;
291     i2  = tvc[col];
292     nz  = ai[row+1] - ai[row];
293     while (nz-- > 0) {           /* off-diagonal elemets */
294       /* ia[i1+1]++; */
295       ia[i2+1]++;
296       i2++;
297       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
298       if (nz > 0) i2 = tvc[col];
299     }
300   }
301 
302   /* shift ia[i] to point to next col */
303   for (i1=1; i1<nslim_col+1; i1++) {
304     col        = ia[i1-1];
305     ia[i1]    += col;
306     work[i1-1] = col - oshift;
307   }
308 
309   /* allocate space for column pointers */
310   nz   = ia[nslim_col] + (!ishift);
311   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
312   *jja = ja;
313 
314   /* loop over matrix putting into ja */
315   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
316     j   = aj + ai[row] + ishift;
317     col = *j++ + ishift;
318     i2  = tvc[col];
319     nz  = ai[row+1] - ai[row];
320     while (nz-- > 0) {
321       /* ja[work[i1]++] = i2 + oshift; */
322       ja[work[i2]++] = i1 + oshift;
323       i2++;
324       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
325       if (nz > 0) i2 = tvc[col];
326     }
327   }
328   ierr = PetscFree(ns_col);CHKERRQ(ierr);
329   ierr = PetscFree(work);CHKERRQ(ierr);
330   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
331   PetscFunctionReturn(0);
332 }
333 
334 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
335 {
336   PetscErrorCode ierr;
337 
338   PetscFunctionBegin;
339   ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr);
340   if (!ia) PetscFunctionReturn(0);
341 
342   if (!blockcompressed) {
343     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
344   } else if (symmetric) {
345     /* Since the indices are symmetric it does'nt matter */
346     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
347   } else {
348     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
349   }
350   PetscFunctionReturn(0);
351 }
352 
353 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
354 {
355   PetscErrorCode ierr;
356 
357   PetscFunctionBegin;
358   if (!ia) PetscFunctionReturn(0);
359   if (!blockcompressed) {
360     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
361   } else {
362     ierr = PetscFree(*ia);CHKERRQ(ierr);
363     ierr = PetscFree(*ja);CHKERRQ(ierr);
364   }
365   PetscFunctionReturn(0);
366 }
367 
368 /* ----------------------------------------------------------- */
369 
370 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
371 {
372   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
373   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
374   PetscScalar       *y;
375   const PetscScalar *x;
376   const MatScalar   *v1,*v2,*v3,*v4,*v5;
377   PetscErrorCode    ierr;
378   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
379   const PetscInt    *idx,*ns,*ii;
380 
381 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
382 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
383 #endif
384 
385   PetscFunctionBegin;
386   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
387   node_max = a->inode.node_count;
388   ns       = a->inode.size;     /* Node Size array */
389   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
390   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
391   idx      = a->j;
392   v1       = a->a;
393   ii       = a->i;
394 
395   for (i = 0,row = 0; i< node_max; ++i) {
396     nsz         = ns[i];
397     n           = ii[1] - ii[0];
398     nonzerorow += (n>0)*nsz;
399     ii         += nsz;
400     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
401     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
402     sz = n;                     /* No of non zeros in this row */
403                                 /* Switch on the size of Node */
404     switch (nsz) {               /* Each loop in 'case' is unrolled */
405     case 1:
406       sum1 = 0.;
407 
408       for (n = 0; n< sz-1; n+=2) {
409         i1    = idx[0];         /* The instructions are ordered to */
410         i2    = idx[1];         /* make the compiler's job easy */
411         idx  += 2;
412         tmp0  = x[i1];
413         tmp1  = x[i2];
414         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
415       }
416 
417       if (n == sz-1) {          /* Take care of the last nonzero  */
418         tmp0  = x[*idx++];
419         sum1 += *v1++ *tmp0;
420       }
421       y[row++]=sum1;
422       break;
423     case 2:
424       sum1 = 0.;
425       sum2 = 0.;
426       v2   = v1 + n;
427 
428       for (n = 0; n< sz-1; n+=2) {
429         i1    = idx[0];
430         i2    = idx[1];
431         idx  += 2;
432         tmp0  = x[i1];
433         tmp1  = x[i2];
434         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
436       }
437       if (n == sz-1) {
438         tmp0  = x[*idx++];
439         sum1 += *v1++ * tmp0;
440         sum2 += *v2++ * tmp0;
441       }
442       y[row++]=sum1;
443       y[row++]=sum2;
444       v1      =v2;              /* Since the next block to be processed starts there*/
445       idx    +=sz;
446       break;
447     case 3:
448       sum1 = 0.;
449       sum2 = 0.;
450       sum3 = 0.;
451       v2   = v1 + n;
452       v3   = v2 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1    = idx[0];
456         i2    = idx[1];
457         idx  += 2;
458         tmp0  = x[i1];
459         tmp1  = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
463       }
464       if (n == sz-1) {
465         tmp0  = x[*idx++];
466         sum1 += *v1++ * tmp0;
467         sum2 += *v2++ * tmp0;
468         sum3 += *v3++ * tmp0;
469       }
470       y[row++]=sum1;
471       y[row++]=sum2;
472       y[row++]=sum3;
473       v1      =v3;              /* Since the next block to be processed starts there*/
474       idx    +=2*sz;
475       break;
476     case 4:
477       sum1 = 0.;
478       sum2 = 0.;
479       sum3 = 0.;
480       sum4 = 0.;
481       v2   = v1 + n;
482       v3   = v2 + n;
483       v4   = v3 + n;
484 
485       for (n = 0; n< sz-1; n+=2) {
486         i1    = idx[0];
487         i2    = idx[1];
488         idx  += 2;
489         tmp0  = x[i1];
490         tmp1  = x[i2];
491         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
492         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
493         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
494         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
495       }
496       if (n == sz-1) {
497         tmp0  = x[*idx++];
498         sum1 += *v1++ * tmp0;
499         sum2 += *v2++ * tmp0;
500         sum3 += *v3++ * tmp0;
501         sum4 += *v4++ * tmp0;
502       }
503       y[row++]=sum1;
504       y[row++]=sum2;
505       y[row++]=sum3;
506       y[row++]=sum4;
507       v1      =v4;              /* Since the next block to be processed starts there*/
508       idx    +=3*sz;
509       break;
510     case 5:
511       sum1 = 0.;
512       sum2 = 0.;
513       sum3 = 0.;
514       sum4 = 0.;
515       sum5 = 0.;
516       v2   = v1 + n;
517       v3   = v2 + n;
518       v4   = v3 + n;
519       v5   = v4 + n;
520 
521       for (n = 0; n<sz-1; n+=2) {
522         i1    = idx[0];
523         i2    = idx[1];
524         idx  += 2;
525         tmp0  = x[i1];
526         tmp1  = x[i2];
527         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
528         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
529         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
530         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
531         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
532       }
533       if (n == sz-1) {
534         tmp0  = x[*idx++];
535         sum1 += *v1++ * tmp0;
536         sum2 += *v2++ * tmp0;
537         sum3 += *v3++ * tmp0;
538         sum4 += *v4++ * tmp0;
539         sum5 += *v5++ * tmp0;
540       }
541       y[row++]=sum1;
542       y[row++]=sum2;
543       y[row++]=sum3;
544       y[row++]=sum4;
545       y[row++]=sum5;
546       v1      =v5;       /* Since the next block to be processed starts there */
547       idx    +=4*sz;
548       break;
549     default:
550       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
551     }
552   }
553   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
554   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
555   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
556   PetscFunctionReturn(0);
557 }
558 /* ----------------------------------------------------------- */
559 /* Almost same code as the MatMult_SeqAIJ_Inode() */
560 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
561 {
562   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
563   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
564   const MatScalar   *v1,*v2,*v3,*v4,*v5;
565   const PetscScalar *x;
566   PetscScalar       *y,*z,*zt;
567   PetscErrorCode    ierr;
568   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
569   const PetscInt    *idx,*ns,*ii;
570 
571   PetscFunctionBegin;
572   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
573   node_max = a->inode.node_count;
574   ns       = a->inode.size;     /* Node Size array */
575 
576   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
577   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
578   zt = z;
579 
580   idx = a->j;
581   v1  = a->a;
582   ii  = a->i;
583 
584   for (i = 0,row = 0; i< node_max; ++i) {
585     nsz = ns[i];
586     n   = ii[1] - ii[0];
587     ii += nsz;
588     sz  = n;                    /* No of non zeros in this row */
589                                 /* Switch on the size of Node */
590     switch (nsz) {               /* Each loop in 'case' is unrolled */
591     case 1:
592       sum1 = *zt++;
593 
594       for (n = 0; n< sz-1; n+=2) {
595         i1    = idx[0];         /* The instructions are ordered to */
596         i2    = idx[1];         /* make the compiler's job easy */
597         idx  += 2;
598         tmp0  = x[i1];
599         tmp1  = x[i2];
600         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
601       }
602 
603       if (n   == sz-1) {          /* Take care of the last nonzero  */
604         tmp0  = x[*idx++];
605         sum1 += *v1++ * tmp0;
606       }
607       y[row++]=sum1;
608       break;
609     case 2:
610       sum1 = *zt++;
611       sum2 = *zt++;
612       v2   = v1 + n;
613 
614       for (n = 0; n< sz-1; n+=2) {
615         i1    = idx[0];
616         i2    = idx[1];
617         idx  += 2;
618         tmp0  = x[i1];
619         tmp1  = x[i2];
620         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
621         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
622       }
623       if (n   == sz-1) {
624         tmp0  = x[*idx++];
625         sum1 += *v1++ * tmp0;
626         sum2 += *v2++ * tmp0;
627       }
628       y[row++]=sum1;
629       y[row++]=sum2;
630       v1      =v2;              /* Since the next block to be processed starts there*/
631       idx    +=sz;
632       break;
633     case 3:
634       sum1 = *zt++;
635       sum2 = *zt++;
636       sum3 = *zt++;
637       v2   = v1 + n;
638       v3   = v2 + n;
639 
640       for (n = 0; n< sz-1; n+=2) {
641         i1    = idx[0];
642         i2    = idx[1];
643         idx  += 2;
644         tmp0  = x[i1];
645         tmp1  = x[i2];
646         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
647         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
648         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
649       }
650       if (n == sz-1) {
651         tmp0  = x[*idx++];
652         sum1 += *v1++ * tmp0;
653         sum2 += *v2++ * tmp0;
654         sum3 += *v3++ * tmp0;
655       }
656       y[row++]=sum1;
657       y[row++]=sum2;
658       y[row++]=sum3;
659       v1      =v3;              /* Since the next block to be processed starts there*/
660       idx    +=2*sz;
661       break;
662     case 4:
663       sum1 = *zt++;
664       sum2 = *zt++;
665       sum3 = *zt++;
666       sum4 = *zt++;
667       v2   = v1 + n;
668       v3   = v2 + n;
669       v4   = v3 + n;
670 
671       for (n = 0; n< sz-1; n+=2) {
672         i1    = idx[0];
673         i2    = idx[1];
674         idx  += 2;
675         tmp0  = x[i1];
676         tmp1  = x[i2];
677         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
678         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
679         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
680         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
681       }
682       if (n == sz-1) {
683         tmp0  = x[*idx++];
684         sum1 += *v1++ * tmp0;
685         sum2 += *v2++ * tmp0;
686         sum3 += *v3++ * tmp0;
687         sum4 += *v4++ * tmp0;
688       }
689       y[row++]=sum1;
690       y[row++]=sum2;
691       y[row++]=sum3;
692       y[row++]=sum4;
693       v1      =v4;              /* Since the next block to be processed starts there*/
694       idx    +=3*sz;
695       break;
696     case 5:
697       sum1 = *zt++;
698       sum2 = *zt++;
699       sum3 = *zt++;
700       sum4 = *zt++;
701       sum5 = *zt++;
702       v2   = v1 + n;
703       v3   = v2 + n;
704       v4   = v3 + n;
705       v5   = v4 + n;
706 
707       for (n = 0; n<sz-1; n+=2) {
708         i1    = idx[0];
709         i2    = idx[1];
710         idx  += 2;
711         tmp0  = x[i1];
712         tmp1  = x[i2];
713         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
714         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
715         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
716         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
717         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
718       }
719       if (n   == sz-1) {
720         tmp0  = x[*idx++];
721         sum1 += *v1++ * tmp0;
722         sum2 += *v2++ * tmp0;
723         sum3 += *v3++ * tmp0;
724         sum4 += *v4++ * tmp0;
725         sum5 += *v5++ * tmp0;
726       }
727       y[row++]=sum1;
728       y[row++]=sum2;
729       y[row++]=sum3;
730       y[row++]=sum4;
731       y[row++]=sum5;
732       v1      =v5;       /* Since the next block to be processed starts there */
733       idx    +=4*sz;
734       break;
735     default:
736       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
737     }
738   }
739   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
740   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
741   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
742   PetscFunctionReturn(0);
743 }
744 
745 /* ----------------------------------------------------------- */
746 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
747 {
748   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
749   IS                iscol = a->col,isrow = a->row;
750   PetscErrorCode    ierr;
751   const PetscInt    *r,*c,*rout,*cout;
752   PetscInt          i,j,n = A->rmap->n,nz;
753   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
754   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
755   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
756   PetscScalar       sum1,sum2,sum3,sum4,sum5;
757   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
758   const PetscScalar *b;
759 
760   PetscFunctionBegin;
761   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
762   node_max = a->inode.node_count;
763   ns       = a->inode.size;     /* Node Size array */
764 
765   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
766   ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr);
767   tmp  = a->solve_work;
768 
769   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
770   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
771 
772   /* forward solve the lower triangular */
773   tmps = tmp;
774   aa   = a_a;
775   aj   = a_j;
776   ad   = a->diag;
777 
778   for (i = 0,row = 0; i< node_max; ++i) {
779     nsz = ns[i];
780     aii = ai[row];
781     v1  = aa + aii;
782     vi  = aj + aii;
783     nz  = ad[row]- aii;
784     if (i < node_max-1) {
785       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
786       * but our indexing to determine it's size could. */
787       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
788       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
789       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
790       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
791     }
792 
793     switch (nsz) {               /* Each loop in 'case' is unrolled */
794     case 1:
795       sum1 = b[*r++];
796       for (j=0; j<nz-1; j+=2) {
797         i0    = vi[0];
798         i1    = vi[1];
799         vi   +=2;
800         tmp0  = tmps[i0];
801         tmp1  = tmps[i1];
802         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
803       }
804       if (j == nz-1) {
805         tmp0  = tmps[*vi++];
806         sum1 -= *v1++ *tmp0;
807       }
808       tmp[row++]=sum1;
809       break;
810     case 2:
811       sum1 = b[*r++];
812       sum2 = b[*r++];
813       v2   = aa + ai[row+1];
814 
815       for (j=0; j<nz-1; j+=2) {
816         i0    = vi[0];
817         i1    = vi[1];
818         vi   +=2;
819         tmp0  = tmps[i0];
820         tmp1  = tmps[i1];
821         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
822         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
823       }
824       if (j == nz-1) {
825         tmp0  = tmps[*vi++];
826         sum1 -= *v1++ *tmp0;
827         sum2 -= *v2++ *tmp0;
828       }
829       sum2     -= *v2++ *sum1;
830       tmp[row++]=sum1;
831       tmp[row++]=sum2;
832       break;
833     case 3:
834       sum1 = b[*r++];
835       sum2 = b[*r++];
836       sum3 = b[*r++];
837       v2   = aa + ai[row+1];
838       v3   = aa + ai[row+2];
839 
840       for (j=0; j<nz-1; j+=2) {
841         i0    = vi[0];
842         i1    = vi[1];
843         vi   +=2;
844         tmp0  = tmps[i0];
845         tmp1  = tmps[i1];
846         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
847         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
848         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
849       }
850       if (j == nz-1) {
851         tmp0  = tmps[*vi++];
852         sum1 -= *v1++ *tmp0;
853         sum2 -= *v2++ *tmp0;
854         sum3 -= *v3++ *tmp0;
855       }
856       sum2 -= *v2++ * sum1;
857       sum3 -= *v3++ * sum1;
858       sum3 -= *v3++ * sum2;
859 
860       tmp[row++]=sum1;
861       tmp[row++]=sum2;
862       tmp[row++]=sum3;
863       break;
864 
865     case 4:
866       sum1 = b[*r++];
867       sum2 = b[*r++];
868       sum3 = b[*r++];
869       sum4 = b[*r++];
870       v2   = aa + ai[row+1];
871       v3   = aa + ai[row+2];
872       v4   = aa + ai[row+3];
873 
874       for (j=0; j<nz-1; j+=2) {
875         i0    = vi[0];
876         i1    = vi[1];
877         vi   +=2;
878         tmp0  = tmps[i0];
879         tmp1  = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
884       }
885       if (j == nz-1) {
886         tmp0  = tmps[*vi++];
887         sum1 -= *v1++ *tmp0;
888         sum2 -= *v2++ *tmp0;
889         sum3 -= *v3++ *tmp0;
890         sum4 -= *v4++ *tmp0;
891       }
892       sum2 -= *v2++ * sum1;
893       sum3 -= *v3++ * sum1;
894       sum4 -= *v4++ * sum1;
895       sum3 -= *v3++ * sum2;
896       sum4 -= *v4++ * sum2;
897       sum4 -= *v4++ * sum3;
898 
899       tmp[row++]=sum1;
900       tmp[row++]=sum2;
901       tmp[row++]=sum3;
902       tmp[row++]=sum4;
903       break;
904     case 5:
905       sum1 = b[*r++];
906       sum2 = b[*r++];
907       sum3 = b[*r++];
908       sum4 = b[*r++];
909       sum5 = b[*r++];
910       v2   = aa + ai[row+1];
911       v3   = aa + ai[row+2];
912       v4   = aa + ai[row+3];
913       v5   = aa + ai[row+4];
914 
915       for (j=0; j<nz-1; j+=2) {
916         i0    = vi[0];
917         i1    = vi[1];
918         vi   +=2;
919         tmp0  = tmps[i0];
920         tmp1  = tmps[i1];
921         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
922         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
923         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
924         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
925         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
926       }
927       if (j == nz-1) {
928         tmp0  = tmps[*vi++];
929         sum1 -= *v1++ *tmp0;
930         sum2 -= *v2++ *tmp0;
931         sum3 -= *v3++ *tmp0;
932         sum4 -= *v4++ *tmp0;
933         sum5 -= *v5++ *tmp0;
934       }
935 
936       sum2 -= *v2++ * sum1;
937       sum3 -= *v3++ * sum1;
938       sum4 -= *v4++ * sum1;
939       sum5 -= *v5++ * sum1;
940       sum3 -= *v3++ * sum2;
941       sum4 -= *v4++ * sum2;
942       sum5 -= *v5++ * sum2;
943       sum4 -= *v4++ * sum3;
944       sum5 -= *v5++ * sum3;
945       sum5 -= *v5++ * sum4;
946 
947       tmp[row++]=sum1;
948       tmp[row++]=sum2;
949       tmp[row++]=sum3;
950       tmp[row++]=sum4;
951       tmp[row++]=sum5;
952       break;
953     default:
954       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
955     }
956   }
957   /* backward solve the upper triangular */
958   for (i=node_max -1,row = n-1; i>=0; i--) {
959     nsz = ns[i];
960     aii = ai[row+1] -1;
961     v1  = aa + aii;
962     vi  = aj + aii;
963     nz  = aii- ad[row];
964     switch (nsz) {               /* Each loop in 'case' is unrolled */
965     case 1:
966       sum1 = tmp[row];
967 
968       for (j=nz; j>1; j-=2) {
969         vi   -=2;
970         i0    = vi[2];
971         i1    = vi[1];
972         tmp0  = tmps[i0];
973         tmp1  = tmps[i1];
974         v1   -= 2;
975         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
976       }
977       if (j==1) {
978         tmp0  = tmps[*vi--];
979         sum1 -= *v1-- * tmp0;
980       }
981       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
982       break;
983     case 2:
984       sum1 = tmp[row];
985       sum2 = tmp[row -1];
986       v2   = aa + ai[row]-1;
987       for (j=nz; j>1; j-=2) {
988         vi   -=2;
989         i0    = vi[2];
990         i1    = vi[1];
991         tmp0  = tmps[i0];
992         tmp1  = tmps[i1];
993         v1   -= 2;
994         v2   -= 2;
995         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
996         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
997       }
998       if (j==1) {
999         tmp0  = tmps[*vi--];
1000         sum1 -= *v1-- * tmp0;
1001         sum2 -= *v2-- * tmp0;
1002       }
1003 
1004       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1005       sum2   -= *v2-- * tmp0;
1006       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1007       break;
1008     case 3:
1009       sum1 = tmp[row];
1010       sum2 = tmp[row -1];
1011       sum3 = tmp[row -2];
1012       v2   = aa + ai[row]-1;
1013       v3   = aa + ai[row -1]-1;
1014       for (j=nz; j>1; j-=2) {
1015         vi   -=2;
1016         i0    = vi[2];
1017         i1    = vi[1];
1018         tmp0  = tmps[i0];
1019         tmp1  = tmps[i1];
1020         v1   -= 2;
1021         v2   -= 2;
1022         v3   -= 2;
1023         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1024         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1025         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1026       }
1027       if (j==1) {
1028         tmp0  = tmps[*vi--];
1029         sum1 -= *v1-- * tmp0;
1030         sum2 -= *v2-- * tmp0;
1031         sum3 -= *v3-- * tmp0;
1032       }
1033       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1034       sum2   -= *v2-- * tmp0;
1035       sum3   -= *v3-- * tmp0;
1036       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1037       sum3   -= *v3-- * tmp0;
1038       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1039 
1040       break;
1041     case 4:
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       sum4 = tmp[row -3];
1046       v2   = aa + ai[row]-1;
1047       v3   = aa + ai[row -1]-1;
1048       v4   = aa + ai[row -2]-1;
1049 
1050       for (j=nz; j>1; j-=2) {
1051         vi   -=2;
1052         i0    = vi[2];
1053         i1    = vi[1];
1054         tmp0  = tmps[i0];
1055         tmp1  = tmps[i1];
1056         v1   -= 2;
1057         v2   -= 2;
1058         v3   -= 2;
1059         v4   -= 2;
1060         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1061         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1062         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1063         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1064       }
1065       if (j==1) {
1066         tmp0  = tmps[*vi--];
1067         sum1 -= *v1-- * tmp0;
1068         sum2 -= *v2-- * tmp0;
1069         sum3 -= *v3-- * tmp0;
1070         sum4 -= *v4-- * tmp0;
1071       }
1072 
1073       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1074       sum2   -= *v2-- * tmp0;
1075       sum3   -= *v3-- * tmp0;
1076       sum4   -= *v4-- * tmp0;
1077       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1078       sum3   -= *v3-- * tmp0;
1079       sum4   -= *v4-- * tmp0;
1080       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1081       sum4   -= *v4-- * tmp0;
1082       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1083       break;
1084     case 5:
1085       sum1 = tmp[row];
1086       sum2 = tmp[row -1];
1087       sum3 = tmp[row -2];
1088       sum4 = tmp[row -3];
1089       sum5 = tmp[row -4];
1090       v2   = aa + ai[row]-1;
1091       v3   = aa + ai[row -1]-1;
1092       v4   = aa + ai[row -2]-1;
1093       v5   = aa + ai[row -3]-1;
1094       for (j=nz; j>1; j-=2) {
1095         vi   -= 2;
1096         i0    = vi[2];
1097         i1    = vi[1];
1098         tmp0  = tmps[i0];
1099         tmp1  = tmps[i1];
1100         v1   -= 2;
1101         v2   -= 2;
1102         v3   -= 2;
1103         v4   -= 2;
1104         v5   -= 2;
1105         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1106         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1107         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1108         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1109         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1110       }
1111       if (j==1) {
1112         tmp0  = tmps[*vi--];
1113         sum1 -= *v1-- * tmp0;
1114         sum2 -= *v2-- * tmp0;
1115         sum3 -= *v3-- * tmp0;
1116         sum4 -= *v4-- * tmp0;
1117         sum5 -= *v5-- * tmp0;
1118       }
1119 
1120       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1121       sum2   -= *v2-- * tmp0;
1122       sum3   -= *v3-- * tmp0;
1123       sum4   -= *v4-- * tmp0;
1124       sum5   -= *v5-- * tmp0;
1125       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1126       sum3   -= *v3-- * tmp0;
1127       sum4   -= *v4-- * tmp0;
1128       sum5   -= *v5-- * tmp0;
1129       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1130       sum4   -= *v4-- * tmp0;
1131       sum5   -= *v5-- * tmp0;
1132       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1133       sum5   -= *v5-- * tmp0;
1134       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1135       break;
1136     default:
1137       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1138     }
1139   }
1140   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1141   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1142   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1143   ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr);
1144   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1145   PetscFunctionReturn(0);
1146 }
1147 
1148 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1149 {
1150   Mat             C     =B;
1151   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1152   IS              isrow = b->row,isicol = b->icol;
1153   PetscErrorCode  ierr;
1154   const PetscInt  *r,*ic,*ics;
1155   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1156   PetscInt        i,j,k,nz,nzL,row,*pj;
1157   const PetscInt  *ajtmp,*bjtmp;
1158   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1159   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1160   FactorShiftCtx  sctx;
1161   const PetscInt  *ddiag;
1162   PetscReal       rs;
1163   MatScalar       d;
1164   PetscInt        inod,nodesz,node_max,col;
1165   const PetscInt  *ns;
1166   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1167 
1168   PetscFunctionBegin;
1169   /* MatPivotSetUp(): initialize shift context sctx */
1170   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1171 
1172   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1173     ddiag          = a->diag;
1174     sctx.shift_top = info->zeropivot;
1175     for (i=0; i<n; i++) {
1176       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1177       d  = (aa)[ddiag[i]];
1178       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1179       v  = aa+ai[i];
1180       nz = ai[i+1] - ai[i];
1181       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1182       if (rs>sctx.shift_top) sctx.shift_top = rs;
1183     }
1184     sctx.shift_top *= 1.1;
1185     sctx.nshift_max = 5;
1186     sctx.shift_lo   = 0.;
1187     sctx.shift_hi   = 1.;
1188   }
1189 
1190   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1191   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1192 
1193   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1194   ics   = ic;
1195 
1196   node_max = a->inode.node_count;
1197   ns       = a->inode.size;
1198   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1199 
1200   /* If max inode size > 4, split it into two inodes.*/
1201   /* also map the inode sizes according to the ordering */
1202   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1203   for (i=0,j=0; i<node_max; ++i,++j) {
1204     if (ns[i] > 4) {
1205       tmp_vec1[j] = 4;
1206       ++j;
1207       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1208     } else {
1209       tmp_vec1[j] = ns[i];
1210     }
1211   }
1212   /* Use the correct node_max */
1213   node_max = j;
1214 
1215   /* Now reorder the inode info based on mat re-ordering info */
1216   /* First create a row -> inode_size_array_index map */
1217   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1218   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1219   for (i=0,row=0; i<node_max; i++) {
1220     nodesz = tmp_vec1[i];
1221     for (j=0; j<nodesz; j++,row++) {
1222       nsmap[row] = i;
1223     }
1224   }
1225   /* Using nsmap, create a reordered ns structure */
1226   for (i=0,j=0; i< node_max; i++) {
1227     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1228     tmp_vec2[i] = nodesz;
1229     j          += nodesz;
1230   }
1231   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1232   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1233 
1234   /* Now use the correct ns */
1235   ns = tmp_vec2;
1236 
1237   do {
1238     sctx.newshift = PETSC_FALSE;
1239     /* Now loop over each block-row, and do the factorization */
1240     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1241       nodesz = ns[inod];
1242 
1243       switch (nodesz) {
1244       case 1:
1245         /*----------*/
1246         /* zero rtmp1 */
1247         /* L part */
1248         nz    = bi[i+1] - bi[i];
1249         bjtmp = bj + bi[i];
1250         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1251 
1252         /* U part */
1253         nz    = bdiag[i]-bdiag[i+1];
1254         bjtmp = bj + bdiag[i+1]+1;
1255         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1256 
1257         /* load in initial (unfactored row) */
1258         nz    = ai[r[i]+1] - ai[r[i]];
1259         ajtmp = aj + ai[r[i]];
1260         v     = aa + ai[r[i]];
1261         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1262 
1263         /* ZeropivotApply() */
1264         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1265 
1266         /* elimination */
1267         bjtmp = bj + bi[i];
1268         row   = *bjtmp++;
1269         nzL   = bi[i+1] - bi[i];
1270         for (k=0; k < nzL; k++) {
1271           pc = rtmp1 + row;
1272           if (*pc != 0.0) {
1273             pv   = b->a + bdiag[row];
1274             mul1 = *pc * (*pv);
1275             *pc  = mul1;
1276             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1277             pv   = b->a + bdiag[row+1]+1;
1278             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1279             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1280             ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1281           }
1282           row = *bjtmp++;
1283         }
1284 
1285         /* finished row so stick it into b->a */
1286         rs = 0.0;
1287         /* L part */
1288         pv = b->a + bi[i];
1289         pj = b->j + bi[i];
1290         nz = bi[i+1] - bi[i];
1291         for (j=0; j<nz; j++) {
1292           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1293         }
1294 
1295         /* U part */
1296         pv = b->a + bdiag[i+1]+1;
1297         pj = b->j + bdiag[i+1]+1;
1298         nz = bdiag[i] - bdiag[i+1]-1;
1299         for (j=0; j<nz; j++) {
1300           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1301         }
1302 
1303         /* Check zero pivot */
1304         sctx.rs = rs;
1305         sctx.pv = rtmp1[i];
1306         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1307         if (sctx.newshift) break;
1308 
1309         /* Mark diagonal and invert diagonal for simplier triangular solves */
1310         pv  = b->a + bdiag[i];
1311         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1312         break;
1313 
1314       case 2:
1315         /*----------*/
1316         /* zero rtmp1 and rtmp2 */
1317         /* L part */
1318         nz    = bi[i+1] - bi[i];
1319         bjtmp = bj + bi[i];
1320         for  (j=0; j<nz; j++) {
1321           col        = bjtmp[j];
1322           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1323         }
1324 
1325         /* U part */
1326         nz    = bdiag[i]-bdiag[i+1];
1327         bjtmp = bj + bdiag[i+1]+1;
1328         for  (j=0; j<nz; j++) {
1329           col        = bjtmp[j];
1330           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1331         }
1332 
1333         /* load in initial (unfactored row) */
1334         nz    = ai[r[i]+1] - ai[r[i]];
1335         ajtmp = aj + ai[r[i]];
1336         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1337         for (j=0; j<nz; j++) {
1338           col        = ics[ajtmp[j]];
1339           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1340         }
1341         /* ZeropivotApply(): shift the diagonal of the matrix  */
1342         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1343 
1344         /* elimination */
1345         bjtmp = bj + bi[i];
1346         row   = *bjtmp++; /* pivot row */
1347         nzL   = bi[i+1] - bi[i];
1348         for (k=0; k < nzL; k++) {
1349           pc1 = rtmp1 + row;
1350           pc2 = rtmp2 + row;
1351           if (*pc1 != 0.0 || *pc2 != 0.0) {
1352             pv   = b->a + bdiag[row];
1353             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1354             *pc1 = mul1;       *pc2 = mul2;
1355 
1356             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1357             pv = b->a + bdiag[row+1]+1;
1358             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1359             for (j=0; j<nz; j++) {
1360               col         = pj[j];
1361               rtmp1[col] -= mul1 * pv[j];
1362               rtmp2[col] -= mul2 * pv[j];
1363             }
1364             ierr = PetscLogFlops(2+4.0*nz);CHKERRQ(ierr);
1365           }
1366           row = *bjtmp++;
1367         }
1368 
1369         /* finished row i; check zero pivot, then stick row i into b->a */
1370         rs = 0.0;
1371         /* L part */
1372         pc1 = b->a + bi[i];
1373         pj  = b->j + bi[i];
1374         nz  = bi[i+1] - bi[i];
1375         for (j=0; j<nz; j++) {
1376           col    = pj[j];
1377           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1378         }
1379         /* U part */
1380         pc1 = b->a + bdiag[i+1]+1;
1381         pj  = b->j + bdiag[i+1]+1;
1382         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1383         for (j=0; j<nz; j++) {
1384           col    = pj[j];
1385           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1386         }
1387 
1388         sctx.rs = rs;
1389         sctx.pv = rtmp1[i];
1390         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1391         if (sctx.newshift) break;
1392         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1393         *pc1 = 1.0/sctx.pv;
1394 
1395         /* Now take care of diagonal 2x2 block. */
1396         pc2 = rtmp2 + i;
1397         if (*pc2 != 0.0) {
1398           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1399           *pc2 = mul1;          /* insert L entry */
1400           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1401           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1402           for (j=0; j<nz; j++) {
1403             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1404           }
1405           ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1406         }
1407 
1408         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1409         rs = 0.0;
1410         /* L part */
1411         pc2 = b->a + bi[i+1];
1412         pj  = b->j + bi[i+1];
1413         nz  = bi[i+2] - bi[i+1];
1414         for (j=0; j<nz; j++) {
1415           col    = pj[j];
1416           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1417         }
1418         /* U part */
1419         pc2 = b->a + bdiag[i+2]+1;
1420         pj  = b->j + bdiag[i+2]+1;
1421         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1422         for (j=0; j<nz; j++) {
1423           col    = pj[j];
1424           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1425         }
1426 
1427         sctx.rs = rs;
1428         sctx.pv = rtmp2[i+1];
1429         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1430         if (sctx.newshift) break;
1431         pc2  = b->a + bdiag[i+1];
1432         *pc2 = 1.0/sctx.pv;
1433         break;
1434 
1435       case 3:
1436         /*----------*/
1437         /* zero rtmp */
1438         /* L part */
1439         nz    = bi[i+1] - bi[i];
1440         bjtmp = bj + bi[i];
1441         for  (j=0; j<nz; j++) {
1442           col        = bjtmp[j];
1443           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1444         }
1445 
1446         /* U part */
1447         nz    = bdiag[i]-bdiag[i+1];
1448         bjtmp = bj + bdiag[i+1]+1;
1449         for  (j=0; j<nz; j++) {
1450           col        = bjtmp[j];
1451           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1452         }
1453 
1454         /* load in initial (unfactored row) */
1455         nz    = ai[r[i]+1] - ai[r[i]];
1456         ajtmp = aj + ai[r[i]];
1457         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1458         for (j=0; j<nz; j++) {
1459           col        = ics[ajtmp[j]];
1460           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1461         }
1462         /* ZeropivotApply(): shift the diagonal of the matrix  */
1463         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1464 
1465         /* elimination */
1466         bjtmp = bj + bi[i];
1467         row   = *bjtmp++; /* pivot row */
1468         nzL   = bi[i+1] - bi[i];
1469         for (k=0; k < nzL; k++) {
1470           pc1 = rtmp1 + row;
1471           pc2 = rtmp2 + row;
1472           pc3 = rtmp3 + row;
1473           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1474             pv   = b->a + bdiag[row];
1475             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1476             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1477 
1478             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1479             pv = b->a + bdiag[row+1]+1;
1480             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1481             for (j=0; j<nz; j++) {
1482               col         = pj[j];
1483               rtmp1[col] -= mul1 * pv[j];
1484               rtmp2[col] -= mul2 * pv[j];
1485               rtmp3[col] -= mul3 * pv[j];
1486             }
1487             ierr = PetscLogFlops(3+6.0*nz);CHKERRQ(ierr);
1488           }
1489           row = *bjtmp++;
1490         }
1491 
1492         /* finished row i; check zero pivot, then stick row i into b->a */
1493         rs = 0.0;
1494         /* L part */
1495         pc1 = b->a + bi[i];
1496         pj  = b->j + bi[i];
1497         nz  = bi[i+1] - bi[i];
1498         for (j=0; j<nz; j++) {
1499           col    = pj[j];
1500           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1501         }
1502         /* U part */
1503         pc1 = b->a + bdiag[i+1]+1;
1504         pj  = b->j + bdiag[i+1]+1;
1505         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1506         for (j=0; j<nz; j++) {
1507           col    = pj[j];
1508           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1509         }
1510 
1511         sctx.rs = rs;
1512         sctx.pv = rtmp1[i];
1513         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1514         if (sctx.newshift) break;
1515         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1516         *pc1 = 1.0/sctx.pv;
1517 
1518         /* Now take care of 1st column of diagonal 3x3 block. */
1519         pc2 = rtmp2 + i;
1520         pc3 = rtmp3 + i;
1521         if (*pc2 != 0.0 || *pc3 != 0.0) {
1522           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1523           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1524           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1525           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1526           for (j=0; j<nz; j++) {
1527             col         = pj[j];
1528             rtmp2[col] -= mul2 * rtmp1[col];
1529             rtmp3[col] -= mul3 * rtmp1[col];
1530           }
1531           ierr = PetscLogFlops(2+4.0*nz);CHKERRQ(ierr);
1532         }
1533 
1534         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1535         rs = 0.0;
1536         /* L part */
1537         pc2 = b->a + bi[i+1];
1538         pj  = b->j + bi[i+1];
1539         nz  = bi[i+2] - bi[i+1];
1540         for (j=0; j<nz; j++) {
1541           col    = pj[j];
1542           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1543         }
1544         /* U part */
1545         pc2 = b->a + bdiag[i+2]+1;
1546         pj  = b->j + bdiag[i+2]+1;
1547         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1548         for (j=0; j<nz; j++) {
1549           col    = pj[j];
1550           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1551         }
1552 
1553         sctx.rs = rs;
1554         sctx.pv = rtmp2[i+1];
1555         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1556         if (sctx.newshift) break;
1557         pc2  = b->a + bdiag[i+1];
1558         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1559 
1560         /* Now take care of 2nd column of diagonal 3x3 block. */
1561         pc3 = rtmp3 + i+1;
1562         if (*pc3 != 0.0) {
1563           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1564           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1565           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1566           for (j=0; j<nz; j++) {
1567             col         = pj[j];
1568             rtmp3[col] -= mul3 * rtmp2[col];
1569           }
1570           ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1571         }
1572 
1573         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1574         rs = 0.0;
1575         /* L part */
1576         pc3 = b->a + bi[i+2];
1577         pj  = b->j + bi[i+2];
1578         nz  = bi[i+3] - bi[i+2];
1579         for (j=0; j<nz; j++) {
1580           col    = pj[j];
1581           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1582         }
1583         /* U part */
1584         pc3 = b->a + bdiag[i+3]+1;
1585         pj  = b->j + bdiag[i+3]+1;
1586         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1587         for (j=0; j<nz; j++) {
1588           col    = pj[j];
1589           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1590         }
1591 
1592         sctx.rs = rs;
1593         sctx.pv = rtmp3[i+2];
1594         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1595         if (sctx.newshift) break;
1596         pc3  = b->a + bdiag[i+2];
1597         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1598         break;
1599       case 4:
1600         /*----------*/
1601         /* zero rtmp */
1602         /* L part */
1603         nz    = bi[i+1] - bi[i];
1604         bjtmp = bj + bi[i];
1605         for  (j=0; j<nz; j++) {
1606           col        = bjtmp[j];
1607           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1608         }
1609 
1610         /* U part */
1611         nz    = bdiag[i]-bdiag[i+1];
1612         bjtmp = bj + bdiag[i+1]+1;
1613         for  (j=0; j<nz; j++) {
1614           col        = bjtmp[j];
1615           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1616         }
1617 
1618         /* load in initial (unfactored row) */
1619         nz    = ai[r[i]+1] - ai[r[i]];
1620         ajtmp = aj + ai[r[i]];
1621         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1622         for (j=0; j<nz; j++) {
1623           col        = ics[ajtmp[j]];
1624           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1625         }
1626         /* ZeropivotApply(): shift the diagonal of the matrix  */
1627         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1628 
1629         /* elimination */
1630         bjtmp = bj + bi[i];
1631         row   = *bjtmp++; /* pivot row */
1632         nzL   = bi[i+1] - bi[i];
1633         for (k=0; k < nzL; k++) {
1634           pc1 = rtmp1 + row;
1635           pc2 = rtmp2 + row;
1636           pc3 = rtmp3 + row;
1637           pc4 = rtmp4 + row;
1638           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1639             pv   = b->a + bdiag[row];
1640             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1641             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1642 
1643             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1644             pv = b->a + bdiag[row+1]+1;
1645             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1646             for (j=0; j<nz; j++) {
1647               col         = pj[j];
1648               rtmp1[col] -= mul1 * pv[j];
1649               rtmp2[col] -= mul2 * pv[j];
1650               rtmp3[col] -= mul3 * pv[j];
1651               rtmp4[col] -= mul4 * pv[j];
1652             }
1653             ierr = PetscLogFlops(4+8.0*nz);CHKERRQ(ierr);
1654           }
1655           row = *bjtmp++;
1656         }
1657 
1658         /* finished row i; check zero pivot, then stick row i into b->a */
1659         rs = 0.0;
1660         /* L part */
1661         pc1 = b->a + bi[i];
1662         pj  = b->j + bi[i];
1663         nz  = bi[i+1] - bi[i];
1664         for (j=0; j<nz; j++) {
1665           col    = pj[j];
1666           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1667         }
1668         /* U part */
1669         pc1 = b->a + bdiag[i+1]+1;
1670         pj  = b->j + bdiag[i+1]+1;
1671         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1672         for (j=0; j<nz; j++) {
1673           col    = pj[j];
1674           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1675         }
1676 
1677         sctx.rs = rs;
1678         sctx.pv = rtmp1[i];
1679         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1680         if (sctx.newshift) break;
1681         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1682         *pc1 = 1.0/sctx.pv;
1683 
1684         /* Now take care of 1st column of diagonal 4x4 block. */
1685         pc2 = rtmp2 + i;
1686         pc3 = rtmp3 + i;
1687         pc4 = rtmp4 + i;
1688         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1689           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1690           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1691           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1692           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1693           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1694           for (j=0; j<nz; j++) {
1695             col         = pj[j];
1696             rtmp2[col] -= mul2 * rtmp1[col];
1697             rtmp3[col] -= mul3 * rtmp1[col];
1698             rtmp4[col] -= mul4 * rtmp1[col];
1699           }
1700           ierr = PetscLogFlops(3+6.0*nz);CHKERRQ(ierr);
1701         }
1702 
1703         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1704         rs = 0.0;
1705         /* L part */
1706         pc2 = b->a + bi[i+1];
1707         pj  = b->j + bi[i+1];
1708         nz  = bi[i+2] - bi[i+1];
1709         for (j=0; j<nz; j++) {
1710           col    = pj[j];
1711           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1712         }
1713         /* U part */
1714         pc2 = b->a + bdiag[i+2]+1;
1715         pj  = b->j + bdiag[i+2]+1;
1716         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1717         for (j=0; j<nz; j++) {
1718           col    = pj[j];
1719           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1720         }
1721 
1722         sctx.rs = rs;
1723         sctx.pv = rtmp2[i+1];
1724         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1725         if (sctx.newshift) break;
1726         pc2  = b->a + bdiag[i+1];
1727         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1728 
1729         /* Now take care of 2nd column of diagonal 4x4 block. */
1730         pc3 = rtmp3 + i+1;
1731         pc4 = rtmp4 + i+1;
1732         if (*pc3 != 0.0 || *pc4 != 0.0) {
1733           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1734           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1735           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1736           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1737           for (j=0; j<nz; j++) {
1738             col         = pj[j];
1739             rtmp3[col] -= mul3 * rtmp2[col];
1740             rtmp4[col] -= mul4 * rtmp2[col];
1741           }
1742           ierr = PetscLogFlops(4.0*nz);CHKERRQ(ierr);
1743         }
1744 
1745         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1746         rs = 0.0;
1747         /* L part */
1748         pc3 = b->a + bi[i+2];
1749         pj  = b->j + bi[i+2];
1750         nz  = bi[i+3] - bi[i+2];
1751         for (j=0; j<nz; j++) {
1752           col    = pj[j];
1753           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1754         }
1755         /* U part */
1756         pc3 = b->a + bdiag[i+3]+1;
1757         pj  = b->j + bdiag[i+3]+1;
1758         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1759         for (j=0; j<nz; j++) {
1760           col    = pj[j];
1761           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1762         }
1763 
1764         sctx.rs = rs;
1765         sctx.pv = rtmp3[i+2];
1766         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1767         if (sctx.newshift) break;
1768         pc3  = b->a + bdiag[i+2];
1769         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1770 
1771         /* Now take care of 3rd column of diagonal 4x4 block. */
1772         pc4 = rtmp4 + i+2;
1773         if (*pc4 != 0.0) {
1774           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1775           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1776           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1777           for (j=0; j<nz; j++) {
1778             col         = pj[j];
1779             rtmp4[col] -= mul4 * rtmp3[col];
1780           }
1781           ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1782         }
1783 
1784         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1785         rs = 0.0;
1786         /* L part */
1787         pc4 = b->a + bi[i+3];
1788         pj  = b->j + bi[i+3];
1789         nz  = bi[i+4] - bi[i+3];
1790         for (j=0; j<nz; j++) {
1791           col    = pj[j];
1792           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1793         }
1794         /* U part */
1795         pc4 = b->a + bdiag[i+4]+1;
1796         pj  = b->j + bdiag[i+4]+1;
1797         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1798         for (j=0; j<nz; j++) {
1799           col    = pj[j];
1800           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1801         }
1802 
1803         sctx.rs = rs;
1804         sctx.pv = rtmp4[i+3];
1805         ierr    = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1806         if (sctx.newshift) break;
1807         pc4  = b->a + bdiag[i+3];
1808         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1809         break;
1810 
1811       default:
1812         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1813       }
1814       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1815       i += nodesz;                 /* Update the row */
1816     }
1817 
1818     /* MatPivotRefine() */
1819     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1820       /*
1821        * if no shift in this attempt & shifting & started shifting & can refine,
1822        * then try lower shift
1823        */
1824       sctx.shift_hi       = sctx.shift_fraction;
1825       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1826       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1827       sctx.newshift       = PETSC_TRUE;
1828       sctx.nshift++;
1829     }
1830   } while (sctx.newshift);
1831 
1832   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1833   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1834   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1835   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1836 
1837   if (b->inode.size) {
1838     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1839   } else {
1840     C->ops->solve           = MatSolve_SeqAIJ;
1841   }
1842   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1843   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1844   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1845   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1846   C->assembled              = PETSC_TRUE;
1847   C->preallocated           = PETSC_TRUE;
1848 
1849   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1850 
1851   /* MatShiftView(A,info,&sctx) */
1852   if (sctx.nshift) {
1853     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1854       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1855     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1856       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1857     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1858       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1859     }
1860   }
1861   PetscFunctionReturn(0);
1862 }
1863 
1864 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1865 {
1866   Mat             C     = B;
1867   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1868   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1869   PetscErrorCode  ierr;
1870   const PetscInt  *r,*ic,*c,*ics;
1871   PetscInt        n   = A->rmap->n,*bi = b->i;
1872   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1873   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1874   PetscInt        *ai = a->i,*aj = a->j;
1875   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1876   PetscScalar     mul1,mul2,mul3,tmp;
1877   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1878   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1879   PetscReal       rs=0.0;
1880   FactorShiftCtx  sctx;
1881 
1882   PetscFunctionBegin;
1883   sctx.shift_top      = 0;
1884   sctx.nshift_max     = 0;
1885   sctx.shift_lo       = 0;
1886   sctx.shift_hi       = 0;
1887   sctx.shift_fraction = 0;
1888 
1889   /* if both shift schemes are chosen by user, only use info->shiftpd */
1890   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1891     sctx.shift_top = 0;
1892     for (i=0; i<n; i++) {
1893       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1894       rs    = 0.0;
1895       ajtmp = aj + ai[i];
1896       rtmp1 = aa + ai[i];
1897       nz    = ai[i+1] - ai[i];
1898       for (j=0; j<nz; j++) {
1899         if (*ajtmp != i) {
1900           rs += PetscAbsScalar(*rtmp1++);
1901         } else {
1902           rs -= PetscRealPart(*rtmp1++);
1903         }
1904         ajtmp++;
1905       }
1906       if (rs>sctx.shift_top) sctx.shift_top = rs;
1907     }
1908     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1909     sctx.shift_top *= 1.1;
1910     sctx.nshift_max = 5;
1911     sctx.shift_lo   = 0.;
1912     sctx.shift_hi   = 1.;
1913   }
1914   sctx.shift_amount = 0;
1915   sctx.nshift       = 0;
1916 
1917   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1918   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1919   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1920   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1921   ics    = ic;
1922 
1923   node_max = a->inode.node_count;
1924   ns       = a->inode.size;
1925   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1926 
1927   /* If max inode size > 3, split it into two inodes.*/
1928   /* also map the inode sizes according to the ordering */
1929   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1930   for (i=0,j=0; i<node_max; ++i,++j) {
1931     if (ns[i]>3) {
1932       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1933       ++j;
1934       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1935     } else {
1936       tmp_vec1[j] = ns[i];
1937     }
1938   }
1939   /* Use the correct node_max */
1940   node_max = j;
1941 
1942   /* Now reorder the inode info based on mat re-ordering info */
1943   /* First create a row -> inode_size_array_index map */
1944   ierr = PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);CHKERRQ(ierr);
1945   for (i=0,row=0; i<node_max; i++) {
1946     nodesz = tmp_vec1[i];
1947     for (j=0; j<nodesz; j++,row++) {
1948       nsmap[row] = i;
1949     }
1950   }
1951   /* Using nsmap, create a reordered ns structure */
1952   for (i=0,j=0; i< node_max; i++) {
1953     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1954     tmp_vec2[i] = nodesz;
1955     j          += nodesz;
1956   }
1957   ierr = PetscFree2(nsmap,tmp_vec1);CHKERRQ(ierr);
1958   /* Now use the correct ns */
1959   ns = tmp_vec2;
1960 
1961   do {
1962     sctx.newshift = PETSC_FALSE;
1963     /* Now loop over each block-row, and do the factorization */
1964     for (i=0,row=0; i<node_max; i++) {
1965       nodesz = ns[i];
1966       nz     = bi[row+1] - bi[row];
1967       bjtmp  = bj + bi[row];
1968 
1969       switch (nodesz) {
1970       case 1:
1971         for  (j=0; j<nz; j++) {
1972           idx         = bjtmp[j];
1973           rtmp11[idx] = 0.0;
1974         }
1975 
1976         /* load in initial (unfactored row) */
1977         idx    = r[row];
1978         nz_tmp = ai[idx+1] - ai[idx];
1979         ajtmp  = aj + ai[idx];
1980         v1     = aa + ai[idx];
1981 
1982         for (j=0; j<nz_tmp; j++) {
1983           idx         = ics[ajtmp[j]];
1984           rtmp11[idx] = v1[j];
1985         }
1986         rtmp11[ics[r[row]]] += sctx.shift_amount;
1987 
1988         prow = *bjtmp++;
1989         while (prow < row) {
1990           pc1 = rtmp11 + prow;
1991           if (*pc1 != 0.0) {
1992             pv     = ba + bd[prow];
1993             pj     = nbj + bd[prow];
1994             mul1   = *pc1 * *pv++;
1995             *pc1   = mul1;
1996             nz_tmp = bi[prow+1] - bd[prow] - 1;
1997             ierr   = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr);
1998             for (j=0; j<nz_tmp; j++) {
1999               tmp          = pv[j];
2000               idx          = pj[j];
2001               rtmp11[idx] -= mul1 * tmp;
2002             }
2003           }
2004           prow = *bjtmp++;
2005         }
2006         pj  = bj + bi[row];
2007         pc1 = ba + bi[row];
2008 
2009         sctx.pv     = rtmp11[row];
2010         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2011         rs          = 0.0;
2012         for (j=0; j<nz; j++) {
2013           idx    = pj[j];
2014           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2015           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2016         }
2017         sctx.rs = rs;
2018         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2019         if (sctx.newshift) goto endofwhile;
2020         break;
2021 
2022       case 2:
2023         for (j=0; j<nz; j++) {
2024           idx         = bjtmp[j];
2025           rtmp11[idx] = 0.0;
2026           rtmp22[idx] = 0.0;
2027         }
2028 
2029         /* load in initial (unfactored row) */
2030         idx    = r[row];
2031         nz_tmp = ai[idx+1] - ai[idx];
2032         ajtmp  = aj + ai[idx];
2033         v1     = aa + ai[idx];
2034         v2     = aa + ai[idx+1];
2035         for (j=0; j<nz_tmp; j++) {
2036           idx         = ics[ajtmp[j]];
2037           rtmp11[idx] = v1[j];
2038           rtmp22[idx] = v2[j];
2039         }
2040         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2041         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2042 
2043         prow = *bjtmp++;
2044         while (prow < row) {
2045           pc1 = rtmp11 + prow;
2046           pc2 = rtmp22 + prow;
2047           if (*pc1 != 0.0 || *pc2 != 0.0) {
2048             pv   = ba + bd[prow];
2049             pj   = nbj + bd[prow];
2050             mul1 = *pc1 * *pv;
2051             mul2 = *pc2 * *pv;
2052             ++pv;
2053             *pc1 = mul1;
2054             *pc2 = mul2;
2055 
2056             nz_tmp = bi[prow+1] - bd[prow] - 1;
2057             for (j=0; j<nz_tmp; j++) {
2058               tmp          = pv[j];
2059               idx          = pj[j];
2060               rtmp11[idx] -= mul1 * tmp;
2061               rtmp22[idx] -= mul2 * tmp;
2062             }
2063             ierr = PetscLogFlops(2+4.0*nz_tmp);CHKERRQ(ierr);
2064           }
2065           prow = *bjtmp++;
2066         }
2067 
2068         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2069         pc1 = rtmp11 + prow;
2070         pc2 = rtmp22 + prow;
2071 
2072         sctx.pv = *pc1;
2073         pj      = bj + bi[prow];
2074         rs      = 0.0;
2075         for (j=0; j<nz; j++) {
2076           idx = pj[j];
2077           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2078         }
2079         sctx.rs = rs;
2080         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2081         if (sctx.newshift) goto endofwhile;
2082 
2083         if (*pc2 != 0.0) {
2084           pj     = nbj + bd[prow];
2085           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2086           *pc2   = mul2;
2087           nz_tmp = bi[prow+1] - bd[prow] - 1;
2088           for (j=0; j<nz_tmp; j++) {
2089             idx          = pj[j];
2090             tmp          = rtmp11[idx];
2091             rtmp22[idx] -= mul2 * tmp;
2092           }
2093           ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr);
2094         }
2095 
2096         pj  = bj + bi[row];
2097         pc1 = ba + bi[row];
2098         pc2 = ba + bi[row+1];
2099 
2100         sctx.pv       = rtmp22[row+1];
2101         rs            = 0.0;
2102         rtmp11[row]   = 1.0/rtmp11[row];
2103         rtmp22[row+1] = 1.0/rtmp22[row+1];
2104         /* copy row entries from dense representation to sparse */
2105         for (j=0; j<nz; j++) {
2106           idx    = pj[j];
2107           pc1[j] = rtmp11[idx];
2108           pc2[j] = rtmp22[idx];
2109           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2110         }
2111         sctx.rs = rs;
2112         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2113         if (sctx.newshift) goto endofwhile;
2114         break;
2115 
2116       case 3:
2117         for  (j=0; j<nz; j++) {
2118           idx         = bjtmp[j];
2119           rtmp11[idx] = 0.0;
2120           rtmp22[idx] = 0.0;
2121           rtmp33[idx] = 0.0;
2122         }
2123         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2124         idx    = r[row];
2125         nz_tmp = ai[idx+1] - ai[idx];
2126         ajtmp  = aj + ai[idx];
2127         v1     = aa + ai[idx];
2128         v2     = aa + ai[idx+1];
2129         v3     = aa + ai[idx+2];
2130         for (j=0; j<nz_tmp; j++) {
2131           idx         = ics[ajtmp[j]];
2132           rtmp11[idx] = v1[j];
2133           rtmp22[idx] = v2[j];
2134           rtmp33[idx] = v3[j];
2135         }
2136         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2137         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2138         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2139 
2140         /* loop over all pivot row blocks above this row block */
2141         prow = *bjtmp++;
2142         while (prow < row) {
2143           pc1 = rtmp11 + prow;
2144           pc2 = rtmp22 + prow;
2145           pc3 = rtmp33 + prow;
2146           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2147             pv   = ba  + bd[prow];
2148             pj   = nbj + bd[prow];
2149             mul1 = *pc1 * *pv;
2150             mul2 = *pc2 * *pv;
2151             mul3 = *pc3 * *pv;
2152             ++pv;
2153             *pc1 = mul1;
2154             *pc2 = mul2;
2155             *pc3 = mul3;
2156 
2157             nz_tmp = bi[prow+1] - bd[prow] - 1;
2158             /* update this row based on pivot row */
2159             for (j=0; j<nz_tmp; j++) {
2160               tmp          = pv[j];
2161               idx          = pj[j];
2162               rtmp11[idx] -= mul1 * tmp;
2163               rtmp22[idx] -= mul2 * tmp;
2164               rtmp33[idx] -= mul3 * tmp;
2165             }
2166             ierr = PetscLogFlops(3+6.0*nz_tmp);CHKERRQ(ierr);
2167           }
2168           prow = *bjtmp++;
2169         }
2170 
2171         /* Now take care of diagonal 3x3 block in this set of rows */
2172         /* note: prow = row here */
2173         pc1 = rtmp11 + prow;
2174         pc2 = rtmp22 + prow;
2175         pc3 = rtmp33 + prow;
2176 
2177         sctx.pv = *pc1;
2178         pj      = bj + bi[prow];
2179         rs      = 0.0;
2180         for (j=0; j<nz; j++) {
2181           idx = pj[j];
2182           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2183         }
2184         sctx.rs = rs;
2185         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2186         if (sctx.newshift) goto endofwhile;
2187 
2188         if (*pc2 != 0.0 || *pc3 != 0.0) {
2189           mul2   = (*pc2)/(*pc1);
2190           mul3   = (*pc3)/(*pc1);
2191           *pc2   = mul2;
2192           *pc3   = mul3;
2193           nz_tmp = bi[prow+1] - bd[prow] - 1;
2194           pj     = nbj + bd[prow];
2195           for (j=0; j<nz_tmp; j++) {
2196             idx          = pj[j];
2197             tmp          = rtmp11[idx];
2198             rtmp22[idx] -= mul2 * tmp;
2199             rtmp33[idx] -= mul3 * tmp;
2200           }
2201           ierr = PetscLogFlops(2+4.0*nz_tmp);CHKERRQ(ierr);
2202         }
2203         ++prow;
2204 
2205         pc2     = rtmp22 + prow;
2206         pc3     = rtmp33 + prow;
2207         sctx.pv = *pc2;
2208         pj      = bj + bi[prow];
2209         rs      = 0.0;
2210         for (j=0; j<nz; j++) {
2211           idx = pj[j];
2212           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2213         }
2214         sctx.rs = rs;
2215         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2216         if (sctx.newshift) goto endofwhile;
2217 
2218         if (*pc3 != 0.0) {
2219           mul3   = (*pc3)/(*pc2);
2220           *pc3   = mul3;
2221           pj     = nbj + bd[prow];
2222           nz_tmp = bi[prow+1] - bd[prow] - 1;
2223           for (j=0; j<nz_tmp; j++) {
2224             idx          = pj[j];
2225             tmp          = rtmp22[idx];
2226             rtmp33[idx] -= mul3 * tmp;
2227           }
2228           ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr);
2229         }
2230 
2231         pj  = bj + bi[row];
2232         pc1 = ba + bi[row];
2233         pc2 = ba + bi[row+1];
2234         pc3 = ba + bi[row+2];
2235 
2236         sctx.pv       = rtmp33[row+2];
2237         rs            = 0.0;
2238         rtmp11[row]   = 1.0/rtmp11[row];
2239         rtmp22[row+1] = 1.0/rtmp22[row+1];
2240         rtmp33[row+2] = 1.0/rtmp33[row+2];
2241         /* copy row entries from dense representation to sparse */
2242         for (j=0; j<nz; j++) {
2243           idx    = pj[j];
2244           pc1[j] = rtmp11[idx];
2245           pc2[j] = rtmp22[idx];
2246           pc3[j] = rtmp33[idx];
2247           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2248         }
2249 
2250         sctx.rs = rs;
2251         ierr    = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2252         if (sctx.newshift) goto endofwhile;
2253         break;
2254 
2255       default:
2256         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2257       }
2258       row += nodesz;                 /* Update the row */
2259     }
2260 endofwhile:;
2261   } while (sctx.newshift);
2262   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2263   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2264   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2265   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2266   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2267 
2268   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2269   /* do not set solve add, since MatSolve_Inode + Add is faster */
2270   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2271   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2272   C->assembled              = PETSC_TRUE;
2273   C->preallocated           = PETSC_TRUE;
2274   if (sctx.nshift) {
2275     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2276       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2277     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2278       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2279     }
2280   }
2281   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2282   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2283   PetscFunctionReturn(0);
2284 }
2285 
2286 /* ----------------------------------------------------------- */
2287 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2288 {
2289   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2290   IS                iscol = a->col,isrow = a->row;
2291   PetscErrorCode    ierr;
2292   const PetscInt    *r,*c,*rout,*cout;
2293   PetscInt          i,j,n = A->rmap->n;
2294   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2295   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2296   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2297   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2298   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2299   const PetscScalar *b;
2300 
2301   PetscFunctionBegin;
2302   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2303   node_max = a->inode.node_count;
2304   ns       = a->inode.size;     /* Node Size array */
2305 
2306   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2307   ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr);
2308   tmp  = a->solve_work;
2309 
2310   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2311   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2312 
2313   /* forward solve the lower triangular */
2314   tmps = tmp;
2315   aa   = a_a;
2316   aj   = a_j;
2317   ad   = a->diag;
2318 
2319   for (i = 0,row = 0; i< node_max; ++i) {
2320     nsz = ns[i];
2321     aii = ai[row];
2322     v1  = aa + aii;
2323     vi  = aj + aii;
2324     nz  = ai[row+1]- ai[row];
2325 
2326     if (i < node_max-1) {
2327       /* Prefetch the indices for the next block */
2328       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2329       /* Prefetch the data for the next block */
2330       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2331     }
2332 
2333     switch (nsz) {               /* Each loop in 'case' is unrolled */
2334     case 1:
2335       sum1 = b[r[row]];
2336       for (j=0; j<nz-1; j+=2) {
2337         i0    = vi[j];
2338         i1    = vi[j+1];
2339         tmp0  = tmps[i0];
2340         tmp1  = tmps[i1];
2341         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2342       }
2343       if (j == nz-1) {
2344         tmp0  = tmps[vi[j]];
2345         sum1 -= v1[j]*tmp0;
2346       }
2347       tmp[row++]=sum1;
2348       break;
2349     case 2:
2350       sum1 = b[r[row]];
2351       sum2 = b[r[row+1]];
2352       v2   = aa + ai[row+1];
2353 
2354       for (j=0; j<nz-1; j+=2) {
2355         i0    = vi[j];
2356         i1    = vi[j+1];
2357         tmp0  = tmps[i0];
2358         tmp1  = tmps[i1];
2359         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2360         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2361       }
2362       if (j == nz-1) {
2363         tmp0  = tmps[vi[j]];
2364         sum1 -= v1[j] *tmp0;
2365         sum2 -= v2[j] *tmp0;
2366       }
2367       sum2     -= v2[nz] * sum1;
2368       tmp[row++]=sum1;
2369       tmp[row++]=sum2;
2370       break;
2371     case 3:
2372       sum1 = b[r[row]];
2373       sum2 = b[r[row+1]];
2374       sum3 = b[r[row+2]];
2375       v2   = aa + ai[row+1];
2376       v3   = aa + ai[row+2];
2377 
2378       for (j=0; j<nz-1; j+=2) {
2379         i0    = vi[j];
2380         i1    = vi[j+1];
2381         tmp0  = tmps[i0];
2382         tmp1  = tmps[i1];
2383         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2384         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2385         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2386       }
2387       if (j == nz-1) {
2388         tmp0  = tmps[vi[j]];
2389         sum1 -= v1[j] *tmp0;
2390         sum2 -= v2[j] *tmp0;
2391         sum3 -= v3[j] *tmp0;
2392       }
2393       sum2     -= v2[nz] * sum1;
2394       sum3     -= v3[nz] * sum1;
2395       sum3     -= v3[nz+1] * sum2;
2396       tmp[row++]=sum1;
2397       tmp[row++]=sum2;
2398       tmp[row++]=sum3;
2399       break;
2400 
2401     case 4:
2402       sum1 = b[r[row]];
2403       sum2 = b[r[row+1]];
2404       sum3 = b[r[row+2]];
2405       sum4 = b[r[row+3]];
2406       v2   = aa + ai[row+1];
2407       v3   = aa + ai[row+2];
2408       v4   = aa + ai[row+3];
2409 
2410       for (j=0; j<nz-1; j+=2) {
2411         i0    = vi[j];
2412         i1    = vi[j+1];
2413         tmp0  = tmps[i0];
2414         tmp1  = tmps[i1];
2415         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2416         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2417         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2418         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2419       }
2420       if (j == nz-1) {
2421         tmp0  = tmps[vi[j]];
2422         sum1 -= v1[j] *tmp0;
2423         sum2 -= v2[j] *tmp0;
2424         sum3 -= v3[j] *tmp0;
2425         sum4 -= v4[j] *tmp0;
2426       }
2427       sum2 -= v2[nz] * sum1;
2428       sum3 -= v3[nz] * sum1;
2429       sum4 -= v4[nz] * sum1;
2430       sum3 -= v3[nz+1] * sum2;
2431       sum4 -= v4[nz+1] * sum2;
2432       sum4 -= v4[nz+2] * sum3;
2433 
2434       tmp[row++]=sum1;
2435       tmp[row++]=sum2;
2436       tmp[row++]=sum3;
2437       tmp[row++]=sum4;
2438       break;
2439     case 5:
2440       sum1 = b[r[row]];
2441       sum2 = b[r[row+1]];
2442       sum3 = b[r[row+2]];
2443       sum4 = b[r[row+3]];
2444       sum5 = b[r[row+4]];
2445       v2   = aa + ai[row+1];
2446       v3   = aa + ai[row+2];
2447       v4   = aa + ai[row+3];
2448       v5   = aa + ai[row+4];
2449 
2450       for (j=0; j<nz-1; j+=2) {
2451         i0    = vi[j];
2452         i1    = vi[j+1];
2453         tmp0  = tmps[i0];
2454         tmp1  = tmps[i1];
2455         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2456         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2457         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2458         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2459         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2460       }
2461       if (j == nz-1) {
2462         tmp0  = tmps[vi[j]];
2463         sum1 -= v1[j] *tmp0;
2464         sum2 -= v2[j] *tmp0;
2465         sum3 -= v3[j] *tmp0;
2466         sum4 -= v4[j] *tmp0;
2467         sum5 -= v5[j] *tmp0;
2468       }
2469 
2470       sum2 -= v2[nz] * sum1;
2471       sum3 -= v3[nz] * sum1;
2472       sum4 -= v4[nz] * sum1;
2473       sum5 -= v5[nz] * sum1;
2474       sum3 -= v3[nz+1] * sum2;
2475       sum4 -= v4[nz+1] * sum2;
2476       sum5 -= v5[nz+1] * sum2;
2477       sum4 -= v4[nz+2] * sum3;
2478       sum5 -= v5[nz+2] * sum3;
2479       sum5 -= v5[nz+3] * sum4;
2480 
2481       tmp[row++]=sum1;
2482       tmp[row++]=sum2;
2483       tmp[row++]=sum3;
2484       tmp[row++]=sum4;
2485       tmp[row++]=sum5;
2486       break;
2487     default:
2488       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2489     }
2490   }
2491   /* backward solve the upper triangular */
2492   for (i=node_max -1,row = n-1; i>=0; i--) {
2493     nsz = ns[i];
2494     aii = ad[row+1] + 1;
2495     v1  = aa + aii;
2496     vi  = aj + aii;
2497     nz  = ad[row]- ad[row+1] - 1;
2498 
2499     if (i > 0) {
2500       /* Prefetch the indices for the next block */
2501       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2502       /* Prefetch the data for the next block */
2503       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2504     }
2505 
2506     switch (nsz) {               /* Each loop in 'case' is unrolled */
2507     case 1:
2508       sum1 = tmp[row];
2509 
2510       for (j=0; j<nz-1; j+=2) {
2511         i0    = vi[j];
2512         i1    = vi[j+1];
2513         tmp0  = tmps[i0];
2514         tmp1  = tmps[i1];
2515         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2516       }
2517       if (j == nz-1) {
2518         tmp0  = tmps[vi[j]];
2519         sum1 -= v1[j]*tmp0;
2520       }
2521       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2522       break;
2523     case 2:
2524       sum1 = tmp[row];
2525       sum2 = tmp[row-1];
2526       v2   = aa + ad[row] + 1;
2527       for (j=0; j<nz-1; j+=2) {
2528         i0    = vi[j];
2529         i1    = vi[j+1];
2530         tmp0  = tmps[i0];
2531         tmp1  = tmps[i1];
2532         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2533         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2534       }
2535       if (j == nz-1) {
2536         tmp0  = tmps[vi[j]];
2537         sum1 -= v1[j]* tmp0;
2538         sum2 -= v2[j+1]* tmp0;
2539       }
2540 
2541       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2542       sum2     -= v2[0] * tmp0;
2543       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2544       break;
2545     case 3:
2546       sum1 = tmp[row];
2547       sum2 = tmp[row -1];
2548       sum3 = tmp[row -2];
2549       v2   = aa + ad[row] + 1;
2550       v3   = aa + ad[row -1] + 1;
2551       for (j=0; j<nz-1; j+=2) {
2552         i0    = vi[j];
2553         i1    = vi[j+1];
2554         tmp0  = tmps[i0];
2555         tmp1  = tmps[i1];
2556         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2557         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2558         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2559       }
2560       if (j== nz-1) {
2561         tmp0  = tmps[vi[j]];
2562         sum1 -= v1[j] * tmp0;
2563         sum2 -= v2[j+1] * tmp0;
2564         sum3 -= v3[j+2] * tmp0;
2565       }
2566       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2567       sum2     -= v2[0]* tmp0;
2568       sum3     -= v3[1] * tmp0;
2569       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2570       sum3     -= v3[0]* tmp0;
2571       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2572 
2573       break;
2574     case 4:
2575       sum1 = tmp[row];
2576       sum2 = tmp[row -1];
2577       sum3 = tmp[row -2];
2578       sum4 = tmp[row -3];
2579       v2   = aa + ad[row]+1;
2580       v3   = aa + ad[row -1]+1;
2581       v4   = aa + ad[row -2]+1;
2582 
2583       for (j=0; j<nz-1; j+=2) {
2584         i0    = vi[j];
2585         i1    = vi[j+1];
2586         tmp0  = tmps[i0];
2587         tmp1  = tmps[i1];
2588         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2589         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2590         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2591         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2592       }
2593       if (j== nz-1) {
2594         tmp0  = tmps[vi[j]];
2595         sum1 -= v1[j] * tmp0;
2596         sum2 -= v2[j+1] * tmp0;
2597         sum3 -= v3[j+2] * tmp0;
2598         sum4 -= v4[j+3] * tmp0;
2599       }
2600 
2601       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2602       sum2     -= v2[0] * tmp0;
2603       sum3     -= v3[1] * tmp0;
2604       sum4     -= v4[2] * tmp0;
2605       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2606       sum3     -= v3[0] * tmp0;
2607       sum4     -= v4[1] * tmp0;
2608       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2609       sum4     -= v4[0] * tmp0;
2610       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2611       break;
2612     case 5:
2613       sum1 = tmp[row];
2614       sum2 = tmp[row -1];
2615       sum3 = tmp[row -2];
2616       sum4 = tmp[row -3];
2617       sum5 = tmp[row -4];
2618       v2   = aa + ad[row]+1;
2619       v3   = aa + ad[row -1]+1;
2620       v4   = aa + ad[row -2]+1;
2621       v5   = aa + ad[row -3]+1;
2622       for (j=0; j<nz-1; j+=2) {
2623         i0    = vi[j];
2624         i1    = vi[j+1];
2625         tmp0  = tmps[i0];
2626         tmp1  = tmps[i1];
2627         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2628         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2629         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2630         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2631         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2632       }
2633       if (j==nz-1) {
2634         tmp0  = tmps[vi[j]];
2635         sum1 -= v1[j] * tmp0;
2636         sum2 -= v2[j+1] * tmp0;
2637         sum3 -= v3[j+2] * tmp0;
2638         sum4 -= v4[j+3] * tmp0;
2639         sum5 -= v5[j+4] * tmp0;
2640       }
2641 
2642       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2643       sum2     -= v2[0] * tmp0;
2644       sum3     -= v3[1] * tmp0;
2645       sum4     -= v4[2] * tmp0;
2646       sum5     -= v5[3] * tmp0;
2647       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2648       sum3     -= v3[0] * tmp0;
2649       sum4     -= v4[1] * tmp0;
2650       sum5     -= v5[2] * tmp0;
2651       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2652       sum4     -= v4[0] * tmp0;
2653       sum5     -= v5[1] * tmp0;
2654       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2655       sum5     -= v5[0] * tmp0;
2656       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2657       break;
2658     default:
2659       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2660     }
2661   }
2662   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2663   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2664   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2665   ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr);
2666   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2667   PetscFunctionReturn(0);
2668 }
2669 
2670 /*
2671      Makes a longer coloring[] array and calls the usual code with that
2672 */
2673 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2674 {
2675   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2676   PetscErrorCode  ierr;
2677   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2678   PetscInt        *colorused,i;
2679   ISColoringValue *newcolor;
2680 
2681   PetscFunctionBegin;
2682   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2683   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2684   /* loop over inodes, marking a color for each column*/
2685   row = 0;
2686   for (i=0; i<m; i++) {
2687     for (j=0; j<ns[i]; j++) {
2688       newcolor[row++] = coloring[i] + j*ncolors;
2689     }
2690   }
2691 
2692   /* eliminate unneeded colors */
2693   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2694   for (i=0; i<n; i++) {
2695     colorused[newcolor[i]] = 1;
2696   }
2697 
2698   for (i=1; i<5*ncolors; i++) {
2699     colorused[i] += colorused[i-1];
2700   }
2701   ncolors = colorused[5*ncolors-1];
2702   for (i=0; i<n; i++) {
2703     newcolor[i] = colorused[newcolor[i]]-1;
2704   }
2705   ierr = PetscFree(colorused);CHKERRQ(ierr);
2706   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2707   ierr = PetscFree(coloring);CHKERRQ(ierr);
2708   PetscFunctionReturn(0);
2709 }
2710 
2711 #include <petsc/private/kernels/blockinvert.h>
2712 
2713 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2714 {
2715   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2716   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2717   MatScalar         *ibdiag,*bdiag,work[25],*t;
2718   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2719   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2720   const PetscScalar *xb, *b;
2721   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2722   PetscErrorCode    ierr;
2723   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2724   PetscInt          sz,k,ipvt[5];
2725   PetscBool         allowzeropivot,zeropivotdetected;
2726   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2727 
2728   PetscFunctionBegin;
2729   allowzeropivot = PetscNot(A->erroriffailure);
2730   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2731   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2732   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2733 
2734   if (!a->inode.ibdiagvalid) {
2735     if (!a->inode.ibdiag) {
2736       /* calculate space needed for diagonal blocks */
2737       for (i=0; i<m; i++) {
2738         cnt += sizes[i]*sizes[i];
2739       }
2740       a->inode.bdiagsize = cnt;
2741 
2742       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2743     }
2744 
2745     /* copy over the diagonal blocks and invert them */
2746     ibdiag = a->inode.ibdiag;
2747     bdiag  = a->inode.bdiag;
2748     cnt    = 0;
2749     for (i=0, row = 0; i<m; i++) {
2750       for (j=0; j<sizes[i]; j++) {
2751         for (k=0; k<sizes[i]; k++) {
2752           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2753         }
2754       }
2755       ierr = PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);CHKERRQ(ierr);
2756 
2757       switch (sizes[i]) {
2758       case 1:
2759         /* Create matrix data structure */
2760         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2761           if (allowzeropivot) {
2762             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2763             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2764             A->factorerror_zeropivot_row   = row;
2765             ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr);
2766           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2767         }
2768         ibdiag[cnt] = 1.0/ibdiag[cnt];
2769         break;
2770       case 2:
2771         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2772         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2773         break;
2774       case 3:
2775         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2776         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2777         break;
2778       case 4:
2779         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2780         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2781         break;
2782       case 5:
2783         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2784         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2785         break;
2786       default:
2787         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2788       }
2789       cnt += sizes[i]*sizes[i];
2790       row += sizes[i];
2791     }
2792     a->inode.ibdiagvalid = PETSC_TRUE;
2793   }
2794   ibdiag = a->inode.ibdiag;
2795   bdiag  = a->inode.bdiag;
2796   t      = a->inode.ssor_work;
2797 
2798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2799   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2800   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2801   if (flag & SOR_ZERO_INITIAL_GUESS) {
2802     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2803 
2804       for (i=0, row=0; i<m; i++) {
2805         sz  = diag[row] - ii[row];
2806         v1  = a->a + ii[row];
2807         idx = a->j + ii[row];
2808 
2809         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2810         switch (sizes[i]) {
2811         case 1:
2812 
2813           sum1 = b[row];
2814           for (n = 0; n<sz-1; n+=2) {
2815             i1    = idx[0];
2816             i2    = idx[1];
2817             idx  += 2;
2818             tmp0  = x[i1];
2819             tmp1  = x[i2];
2820             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2821           }
2822 
2823           if (n == sz-1) {
2824             tmp0  = x[*idx];
2825             sum1 -= *v1 * tmp0;
2826           }
2827           t[row]   = sum1;
2828           x[row++] = sum1*(*ibdiag++);
2829           break;
2830         case 2:
2831           v2   = a->a + ii[row+1];
2832           sum1 = b[row];
2833           sum2 = b[row+1];
2834           for (n = 0; n<sz-1; n+=2) {
2835             i1    = idx[0];
2836             i2    = idx[1];
2837             idx  += 2;
2838             tmp0  = x[i1];
2839             tmp1  = x[i2];
2840             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2841             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2842           }
2843 
2844           if (n == sz-1) {
2845             tmp0  = x[*idx];
2846             sum1 -= v1[0] * tmp0;
2847             sum2 -= v2[0] * tmp0;
2848           }
2849           t[row]   = sum1;
2850           t[row+1] = sum2;
2851           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2852           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2853           ibdiag  += 4;
2854           break;
2855         case 3:
2856           v2   = a->a + ii[row+1];
2857           v3   = a->a + ii[row+2];
2858           sum1 = b[row];
2859           sum2 = b[row+1];
2860           sum3 = b[row+2];
2861           for (n = 0; n<sz-1; n+=2) {
2862             i1    = idx[0];
2863             i2    = idx[1];
2864             idx  += 2;
2865             tmp0  = x[i1];
2866             tmp1  = x[i2];
2867             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2868             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2869             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2870           }
2871 
2872           if (n == sz-1) {
2873             tmp0  = x[*idx];
2874             sum1 -= v1[0] * tmp0;
2875             sum2 -= v2[0] * tmp0;
2876             sum3 -= v3[0] * tmp0;
2877           }
2878           t[row]   = sum1;
2879           t[row+1] = sum2;
2880           t[row+2] = sum3;
2881           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2882           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2883           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2884           ibdiag  += 9;
2885           break;
2886         case 4:
2887           v2   = a->a + ii[row+1];
2888           v3   = a->a + ii[row+2];
2889           v4   = a->a + ii[row+3];
2890           sum1 = b[row];
2891           sum2 = b[row+1];
2892           sum3 = b[row+2];
2893           sum4 = b[row+3];
2894           for (n = 0; n<sz-1; n+=2) {
2895             i1    = idx[0];
2896             i2    = idx[1];
2897             idx  += 2;
2898             tmp0  = x[i1];
2899             tmp1  = x[i2];
2900             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2901             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2902             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2903             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2904           }
2905 
2906           if (n == sz-1) {
2907             tmp0  = x[*idx];
2908             sum1 -= v1[0] * tmp0;
2909             sum2 -= v2[0] * tmp0;
2910             sum3 -= v3[0] * tmp0;
2911             sum4 -= v4[0] * tmp0;
2912           }
2913           t[row]   = sum1;
2914           t[row+1] = sum2;
2915           t[row+2] = sum3;
2916           t[row+3] = sum4;
2917           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2918           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2919           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2920           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2921           ibdiag  += 16;
2922           break;
2923         case 5:
2924           v2   = a->a + ii[row+1];
2925           v3   = a->a + ii[row+2];
2926           v4   = a->a + ii[row+3];
2927           v5   = a->a + ii[row+4];
2928           sum1 = b[row];
2929           sum2 = b[row+1];
2930           sum3 = b[row+2];
2931           sum4 = b[row+3];
2932           sum5 = b[row+4];
2933           for (n = 0; n<sz-1; n+=2) {
2934             i1    = idx[0];
2935             i2    = idx[1];
2936             idx  += 2;
2937             tmp0  = x[i1];
2938             tmp1  = x[i2];
2939             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2940             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2941             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2942             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2943             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2944           }
2945 
2946           if (n == sz-1) {
2947             tmp0  = x[*idx];
2948             sum1 -= v1[0] * tmp0;
2949             sum2 -= v2[0] * tmp0;
2950             sum3 -= v3[0] * tmp0;
2951             sum4 -= v4[0] * tmp0;
2952             sum5 -= v5[0] * tmp0;
2953           }
2954           t[row]   = sum1;
2955           t[row+1] = sum2;
2956           t[row+2] = sum3;
2957           t[row+3] = sum4;
2958           t[row+4] = sum5;
2959           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2960           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2961           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2962           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2963           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2964           ibdiag  += 25;
2965           break;
2966         default:
2967           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2968         }
2969       }
2970 
2971       xb   = t;
2972       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2973     } else xb = b;
2974     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2975 
2976       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2977       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2978         ibdiag -= sizes[i]*sizes[i];
2979         sz      = ii[row+1] - diag[row] - 1;
2980         v1      = a->a + diag[row] + 1;
2981         idx     = a->j + diag[row] + 1;
2982 
2983         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2984         switch (sizes[i]) {
2985         case 1:
2986 
2987           sum1 = xb[row];
2988           for (n = 0; n<sz-1; n+=2) {
2989             i1    = idx[0];
2990             i2    = idx[1];
2991             idx  += 2;
2992             tmp0  = x[i1];
2993             tmp1  = x[i2];
2994             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2995           }
2996 
2997           if (n == sz-1) {
2998             tmp0  = x[*idx];
2999             sum1 -= *v1*tmp0;
3000           }
3001           x[row--] = sum1*(*ibdiag);
3002           break;
3003 
3004         case 2:
3005 
3006           sum1 = xb[row];
3007           sum2 = xb[row-1];
3008           /* note that sum1 is associated with the second of the two rows */
3009           v2 = a->a + diag[row-1] + 2;
3010           for (n = 0; n<sz-1; n+=2) {
3011             i1    = idx[0];
3012             i2    = idx[1];
3013             idx  += 2;
3014             tmp0  = x[i1];
3015             tmp1  = x[i2];
3016             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3017             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3018           }
3019 
3020           if (n == sz-1) {
3021             tmp0  = x[*idx];
3022             sum1 -= *v1*tmp0;
3023             sum2 -= *v2*tmp0;
3024           }
3025           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3026           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3027           break;
3028         case 3:
3029 
3030           sum1 = xb[row];
3031           sum2 = xb[row-1];
3032           sum3 = xb[row-2];
3033           v2   = a->a + diag[row-1] + 2;
3034           v3   = a->a + diag[row-2] + 3;
3035           for (n = 0; n<sz-1; n+=2) {
3036             i1    = idx[0];
3037             i2    = idx[1];
3038             idx  += 2;
3039             tmp0  = x[i1];
3040             tmp1  = x[i2];
3041             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3042             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3043             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3044           }
3045 
3046           if (n == sz-1) {
3047             tmp0  = x[*idx];
3048             sum1 -= *v1*tmp0;
3049             sum2 -= *v2*tmp0;
3050             sum3 -= *v3*tmp0;
3051           }
3052           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3053           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3054           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3055           break;
3056         case 4:
3057 
3058           sum1 = xb[row];
3059           sum2 = xb[row-1];
3060           sum3 = xb[row-2];
3061           sum4 = xb[row-3];
3062           v2   = a->a + diag[row-1] + 2;
3063           v3   = a->a + diag[row-2] + 3;
3064           v4   = a->a + diag[row-3] + 4;
3065           for (n = 0; n<sz-1; n+=2) {
3066             i1    = idx[0];
3067             i2    = idx[1];
3068             idx  += 2;
3069             tmp0  = x[i1];
3070             tmp1  = x[i2];
3071             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3072             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3073             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3074             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3075           }
3076 
3077           if (n == sz-1) {
3078             tmp0  = x[*idx];
3079             sum1 -= *v1*tmp0;
3080             sum2 -= *v2*tmp0;
3081             sum3 -= *v3*tmp0;
3082             sum4 -= *v4*tmp0;
3083           }
3084           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3085           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3086           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3087           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3088           break;
3089         case 5:
3090 
3091           sum1 = xb[row];
3092           sum2 = xb[row-1];
3093           sum3 = xb[row-2];
3094           sum4 = xb[row-3];
3095           sum5 = xb[row-4];
3096           v2   = a->a + diag[row-1] + 2;
3097           v3   = a->a + diag[row-2] + 3;
3098           v4   = a->a + diag[row-3] + 4;
3099           v5   = a->a + diag[row-4] + 5;
3100           for (n = 0; n<sz-1; n+=2) {
3101             i1    = idx[0];
3102             i2    = idx[1];
3103             idx  += 2;
3104             tmp0  = x[i1];
3105             tmp1  = x[i2];
3106             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3107             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3108             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3109             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3110             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3111           }
3112 
3113           if (n == sz-1) {
3114             tmp0  = x[*idx];
3115             sum1 -= *v1*tmp0;
3116             sum2 -= *v2*tmp0;
3117             sum3 -= *v3*tmp0;
3118             sum4 -= *v4*tmp0;
3119             sum5 -= *v5*tmp0;
3120           }
3121           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3122           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3123           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3124           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3125           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3126           break;
3127         default:
3128           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3129         }
3130       }
3131 
3132       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3133     }
3134     its--;
3135   }
3136   while (its--) {
3137 
3138     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3139       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3140            i<m;
3141            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3142 
3143         sz  = diag[row] - ii[row];
3144         v1  = a->a + ii[row];
3145         idx = a->j + ii[row];
3146         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3147         switch (sizes[i]) {
3148         case 1:
3149           sum1 = b[row];
3150           for (n = 0; n<sz-1; n+=2) {
3151             i1    = idx[0];
3152             i2    = idx[1];
3153             idx  += 2;
3154             tmp0  = x[i1];
3155             tmp1  = x[i2];
3156             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3157           }
3158           if (n == sz-1) {
3159             tmp0  = x[*idx++];
3160             sum1 -= *v1 * tmp0;
3161             v1++;
3162           }
3163           t[row]   = sum1;
3164           sz      = ii[row+1] - diag[row] - 1;
3165           idx     = a->j + diag[row] + 1;
3166           v1 += 1;
3167           for (n = 0; n<sz-1; n+=2) {
3168             i1    = idx[0];
3169             i2    = idx[1];
3170             idx  += 2;
3171             tmp0  = x[i1];
3172             tmp1  = x[i2];
3173             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3174           }
3175           if (n == sz-1) {
3176             tmp0  = x[*idx++];
3177             sum1 -= *v1 * tmp0;
3178           }
3179           /* in MatSOR_SeqAIJ this line would be
3180            *
3181            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3182            *
3183            * but omega == 1, so this becomes
3184            *
3185            * x[row] = sum1*(*ibdiag++);
3186            *
3187            */
3188           x[row] = sum1*(*ibdiag);
3189           break;
3190         case 2:
3191           v2   = a->a + ii[row+1];
3192           sum1 = b[row];
3193           sum2 = b[row+1];
3194           for (n = 0; n<sz-1; n+=2) {
3195             i1    = idx[0];
3196             i2    = idx[1];
3197             idx  += 2;
3198             tmp0  = x[i1];
3199             tmp1  = x[i2];
3200             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3201             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3202           }
3203           if (n == sz-1) {
3204             tmp0  = x[*idx++];
3205             sum1 -= v1[0] * tmp0;
3206             sum2 -= v2[0] * tmp0;
3207             v1++; v2++;
3208           }
3209           t[row]   = sum1;
3210           t[row+1] = sum2;
3211           sz      = ii[row+1] - diag[row] - 2;
3212           idx     = a->j + diag[row] + 2;
3213           v1 += 2;
3214           v2 += 2;
3215           for (n = 0; n<sz-1; n+=2) {
3216             i1    = idx[0];
3217             i2    = idx[1];
3218             idx  += 2;
3219             tmp0  = x[i1];
3220             tmp1  = x[i2];
3221             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3222             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3223           }
3224           if (n == sz-1) {
3225             tmp0  = x[*idx];
3226             sum1 -= v1[0] * tmp0;
3227             sum2 -= v2[0] * tmp0;
3228           }
3229           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3230           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3231           break;
3232         case 3:
3233           v2   = a->a + ii[row+1];
3234           v3   = a->a + ii[row+2];
3235           sum1 = b[row];
3236           sum2 = b[row+1];
3237           sum3 = b[row+2];
3238           for (n = 0; n<sz-1; n+=2) {
3239             i1    = idx[0];
3240             i2    = idx[1];
3241             idx  += 2;
3242             tmp0  = x[i1];
3243             tmp1  = x[i2];
3244             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3245             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3246             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3247           }
3248           if (n == sz-1) {
3249             tmp0  = x[*idx++];
3250             sum1 -= v1[0] * tmp0;
3251             sum2 -= v2[0] * tmp0;
3252             sum3 -= v3[0] * tmp0;
3253             v1++; v2++; v3++;
3254           }
3255           t[row]   = sum1;
3256           t[row+1] = sum2;
3257           t[row+2] = sum3;
3258           sz      = ii[row+1] - diag[row] - 3;
3259           idx     = a->j + diag[row] + 3;
3260           v1 += 3;
3261           v2 += 3;
3262           v3 += 3;
3263           for (n = 0; n<sz-1; n+=2) {
3264             i1    = idx[0];
3265             i2    = idx[1];
3266             idx  += 2;
3267             tmp0  = x[i1];
3268             tmp1  = x[i2];
3269             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3270             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3271             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3272           }
3273           if (n == sz-1) {
3274             tmp0  = x[*idx];
3275             sum1 -= v1[0] * tmp0;
3276             sum2 -= v2[0] * tmp0;
3277             sum3 -= v3[0] * tmp0;
3278           }
3279           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3280           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3281           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3282           break;
3283         case 4:
3284           v2   = a->a + ii[row+1];
3285           v3   = a->a + ii[row+2];
3286           v4   = a->a + ii[row+3];
3287           sum1 = b[row];
3288           sum2 = b[row+1];
3289           sum3 = b[row+2];
3290           sum4 = b[row+3];
3291           for (n = 0; n<sz-1; n+=2) {
3292             i1    = idx[0];
3293             i2    = idx[1];
3294             idx  += 2;
3295             tmp0  = x[i1];
3296             tmp1  = x[i2];
3297             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3298             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3299             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3300             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3301           }
3302           if (n == sz-1) {
3303             tmp0  = x[*idx++];
3304             sum1 -= v1[0] * tmp0;
3305             sum2 -= v2[0] * tmp0;
3306             sum3 -= v3[0] * tmp0;
3307             sum4 -= v4[0] * tmp0;
3308             v1++; v2++; v3++; v4++;
3309           }
3310           t[row]   = sum1;
3311           t[row+1] = sum2;
3312           t[row+2] = sum3;
3313           t[row+3] = sum4;
3314           sz      = ii[row+1] - diag[row] - 4;
3315           idx     = a->j + diag[row] + 4;
3316           v1 += 4;
3317           v2 += 4;
3318           v3 += 4;
3319           v4 += 4;
3320           for (n = 0; n<sz-1; n+=2) {
3321             i1    = idx[0];
3322             i2    = idx[1];
3323             idx  += 2;
3324             tmp0  = x[i1];
3325             tmp1  = x[i2];
3326             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3327             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3328             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3329             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3330           }
3331           if (n == sz-1) {
3332             tmp0  = x[*idx];
3333             sum1 -= v1[0] * tmp0;
3334             sum2 -= v2[0] * tmp0;
3335             sum3 -= v3[0] * tmp0;
3336             sum4 -= v4[0] * tmp0;
3337           }
3338           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3339           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3340           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3341           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3342           break;
3343         case 5:
3344           v2   = a->a + ii[row+1];
3345           v3   = a->a + ii[row+2];
3346           v4   = a->a + ii[row+3];
3347           v5   = a->a + ii[row+4];
3348           sum1 = b[row];
3349           sum2 = b[row+1];
3350           sum3 = b[row+2];
3351           sum4 = b[row+3];
3352           sum5 = b[row+4];
3353           for (n = 0; n<sz-1; n+=2) {
3354             i1    = idx[0];
3355             i2    = idx[1];
3356             idx  += 2;
3357             tmp0  = x[i1];
3358             tmp1  = x[i2];
3359             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3360             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3361             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3362             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3363             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3364           }
3365           if (n == sz-1) {
3366             tmp0  = x[*idx++];
3367             sum1 -= v1[0] * tmp0;
3368             sum2 -= v2[0] * tmp0;
3369             sum3 -= v3[0] * tmp0;
3370             sum4 -= v4[0] * tmp0;
3371             sum5 -= v5[0] * tmp0;
3372             v1++; v2++; v3++; v4++; v5++;
3373           }
3374           t[row]   = sum1;
3375           t[row+1] = sum2;
3376           t[row+2] = sum3;
3377           t[row+3] = sum4;
3378           t[row+4] = sum5;
3379           sz      = ii[row+1] - diag[row] - 5;
3380           idx     = a->j + diag[row] + 5;
3381           v1 += 5;
3382           v2 += 5;
3383           v3 += 5;
3384           v4 += 5;
3385           v5 += 5;
3386           for (n = 0; n<sz-1; n+=2) {
3387             i1    = idx[0];
3388             i2    = idx[1];
3389             idx  += 2;
3390             tmp0  = x[i1];
3391             tmp1  = x[i2];
3392             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3393             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3394             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3395             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3396             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3397           }
3398           if (n == sz-1) {
3399             tmp0  = x[*idx];
3400             sum1 -= v1[0] * tmp0;
3401             sum2 -= v2[0] * tmp0;
3402             sum3 -= v3[0] * tmp0;
3403             sum4 -= v4[0] * tmp0;
3404             sum5 -= v5[0] * tmp0;
3405           }
3406           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3407           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3408           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3409           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3410           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3411           break;
3412         default:
3413           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3414         }
3415       }
3416       xb   = t;
3417       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3418     } else xb = b;
3419 
3420     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3421 
3422       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3423       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3424         ibdiag -= sizes[i]*sizes[i];
3425 
3426         /* set RHS */
3427         if (xb == b) {
3428           /* whole (old way) */
3429           sz      = ii[row+1] - ii[row];
3430           idx     = a->j + ii[row];
3431           switch (sizes[i]) {
3432           case 5:
3433             v5      = a->a + ii[row-4];
3434           case 4: /* fall through */
3435             v4      = a->a + ii[row-3];
3436           case 3:
3437             v3      = a->a + ii[row-2];
3438           case 2:
3439             v2      = a->a + ii[row-1];
3440           case 1:
3441             v1      = a->a + ii[row];
3442             break;
3443           default:
3444             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3445           }
3446         } else {
3447           /* upper, no diag */
3448           sz      = ii[row+1] - diag[row] - 1;
3449           idx     = a->j + diag[row] + 1;
3450           switch (sizes[i]) {
3451           case 5:
3452             v5      = a->a + diag[row-4] + 5;
3453           case 4: /* fall through */
3454             v4      = a->a + diag[row-3] + 4;
3455           case 3:
3456             v3      = a->a + diag[row-2] + 3;
3457           case 2:
3458             v2      = a->a + diag[row-1] + 2;
3459           case 1:
3460             v1      = a->a + diag[row] + 1;
3461           }
3462         }
3463         /* set sum */
3464         switch (sizes[i]) {
3465         case 5:
3466           sum5 = xb[row-4];
3467         case 4: /* fall through */
3468           sum4 = xb[row-3];
3469         case 3:
3470           sum3 = xb[row-2];
3471         case 2:
3472           sum2 = xb[row-1];
3473         case 1:
3474           /* note that sum1 is associated with the last row */
3475           sum1 = xb[row];
3476         }
3477         /* do sums */
3478         for (n = 0; n<sz-1; n+=2) {
3479             i1    = idx[0];
3480             i2    = idx[1];
3481             idx  += 2;
3482             tmp0  = x[i1];
3483             tmp1  = x[i2];
3484             switch (sizes[i]) {
3485             case 5:
3486               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3487             case 4: /* fall through */
3488               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3489             case 3:
3490               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3491             case 2:
3492               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3493             case 1:
3494               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3495             }
3496         }
3497         /* ragged edge */
3498         if (n == sz-1) {
3499           tmp0  = x[*idx];
3500           switch (sizes[i]) {
3501           case 5:
3502             sum5 -= *v5*tmp0;
3503           case 4: /* fall through */
3504             sum4 -= *v4*tmp0;
3505           case 3:
3506             sum3 -= *v3*tmp0;
3507           case 2:
3508             sum2 -= *v2*tmp0;
3509           case 1:
3510             sum1 -= *v1*tmp0;
3511           }
3512         }
3513         /* update */
3514         if (xb == b) {
3515           /* whole (old way) w/ diag */
3516           switch (sizes[i]) {
3517           case 5:
3518             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3519             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3520             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3521             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3522             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3523             break;
3524           case 4:
3525             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3526             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3527             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3528             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3529             break;
3530           case 3:
3531             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3532             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3533             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3534             break;
3535           case 2:
3536             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3537             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3538             break;
3539           case 1:
3540             x[row--] += sum1*(*ibdiag);
3541             break;
3542           }
3543         } else {
3544           /* no diag so set =  */
3545           switch (sizes[i]) {
3546           case 5:
3547             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3548             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3549             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3550             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3551             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3552             break;
3553           case 4:
3554             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3555             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3556             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3557             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3558             break;
3559           case 3:
3560             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3561             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3562             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3563             break;
3564           case 2:
3565             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3566             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3567             break;
3568           case 1:
3569             x[row--] = sum1*(*ibdiag);
3570             break;
3571           }
3572         }
3573       }
3574       if (xb == b) {
3575         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3576       } else {
3577         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3578       }
3579     }
3580   }
3581   if (flag & SOR_EISENSTAT) {
3582     /*
3583           Apply  (U + D)^-1  where D is now the block diagonal
3584     */
3585     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3586     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3587       ibdiag -= sizes[i]*sizes[i];
3588       sz      = ii[row+1] - diag[row] - 1;
3589       v1      = a->a + diag[row] + 1;
3590       idx     = a->j + diag[row] + 1;
3591       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3592       switch (sizes[i]) {
3593       case 1:
3594 
3595         sum1 = b[row];
3596         for (n = 0; n<sz-1; n+=2) {
3597           i1    = idx[0];
3598           i2    = idx[1];
3599           idx  += 2;
3600           tmp0  = x[i1];
3601           tmp1  = x[i2];
3602           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3603         }
3604 
3605         if (n == sz-1) {
3606           tmp0  = x[*idx];
3607           sum1 -= *v1*tmp0;
3608         }
3609         x[row] = sum1*(*ibdiag);row--;
3610         break;
3611 
3612       case 2:
3613 
3614         sum1 = b[row];
3615         sum2 = b[row-1];
3616         /* note that sum1 is associated with the second of the two rows */
3617         v2 = a->a + diag[row-1] + 2;
3618         for (n = 0; n<sz-1; n+=2) {
3619           i1    = idx[0];
3620           i2    = idx[1];
3621           idx  += 2;
3622           tmp0  = x[i1];
3623           tmp1  = x[i2];
3624           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3625           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3626         }
3627 
3628         if (n == sz-1) {
3629           tmp0  = x[*idx];
3630           sum1 -= *v1*tmp0;
3631           sum2 -= *v2*tmp0;
3632         }
3633         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3634         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3635         row     -= 2;
3636         break;
3637       case 3:
3638 
3639         sum1 = b[row];
3640         sum2 = b[row-1];
3641         sum3 = b[row-2];
3642         v2   = a->a + diag[row-1] + 2;
3643         v3   = a->a + diag[row-2] + 3;
3644         for (n = 0; n<sz-1; n+=2) {
3645           i1    = idx[0];
3646           i2    = idx[1];
3647           idx  += 2;
3648           tmp0  = x[i1];
3649           tmp1  = x[i2];
3650           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3651           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3652           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3653         }
3654 
3655         if (n == sz-1) {
3656           tmp0  = x[*idx];
3657           sum1 -= *v1*tmp0;
3658           sum2 -= *v2*tmp0;
3659           sum3 -= *v3*tmp0;
3660         }
3661         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3662         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3663         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3664         row     -= 3;
3665         break;
3666       case 4:
3667 
3668         sum1 = b[row];
3669         sum2 = b[row-1];
3670         sum3 = b[row-2];
3671         sum4 = b[row-3];
3672         v2   = a->a + diag[row-1] + 2;
3673         v3   = a->a + diag[row-2] + 3;
3674         v4   = a->a + diag[row-3] + 4;
3675         for (n = 0; n<sz-1; n+=2) {
3676           i1    = idx[0];
3677           i2    = idx[1];
3678           idx  += 2;
3679           tmp0  = x[i1];
3680           tmp1  = x[i2];
3681           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3682           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3683           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3684           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3685         }
3686 
3687         if (n == sz-1) {
3688           tmp0  = x[*idx];
3689           sum1 -= *v1*tmp0;
3690           sum2 -= *v2*tmp0;
3691           sum3 -= *v3*tmp0;
3692           sum4 -= *v4*tmp0;
3693         }
3694         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3695         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3696         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3697         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3698         row     -= 4;
3699         break;
3700       case 5:
3701 
3702         sum1 = b[row];
3703         sum2 = b[row-1];
3704         sum3 = b[row-2];
3705         sum4 = b[row-3];
3706         sum5 = b[row-4];
3707         v2   = a->a + diag[row-1] + 2;
3708         v3   = a->a + diag[row-2] + 3;
3709         v4   = a->a + diag[row-3] + 4;
3710         v5   = a->a + diag[row-4] + 5;
3711         for (n = 0; n<sz-1; n+=2) {
3712           i1    = idx[0];
3713           i2    = idx[1];
3714           idx  += 2;
3715           tmp0  = x[i1];
3716           tmp1  = x[i2];
3717           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3718           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3719           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3720           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3721           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3722         }
3723 
3724         if (n == sz-1) {
3725           tmp0  = x[*idx];
3726           sum1 -= *v1*tmp0;
3727           sum2 -= *v2*tmp0;
3728           sum3 -= *v3*tmp0;
3729           sum4 -= *v4*tmp0;
3730           sum5 -= *v5*tmp0;
3731         }
3732         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3733         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3734         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3735         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3736         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3737         row     -= 5;
3738         break;
3739       default:
3740         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3741       }
3742     }
3743     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3744 
3745     /*
3746            t = b - D x    where D is the block diagonal
3747     */
3748     cnt = 0;
3749     for (i=0, row=0; i<m; i++) {
3750       switch (sizes[i]) {
3751       case 1:
3752         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3753         break;
3754       case 2:
3755         x1       = x[row]; x2 = x[row+1];
3756         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3757         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3758         t[row]   = b[row] - tmp1;
3759         t[row+1] = b[row+1] - tmp2; row += 2;
3760         cnt     += 4;
3761         break;
3762       case 3:
3763         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3764         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3765         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3766         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3767         t[row]   = b[row] - tmp1;
3768         t[row+1] = b[row+1] - tmp2;
3769         t[row+2] = b[row+2] - tmp3; row += 3;
3770         cnt     += 9;
3771         break;
3772       case 4:
3773         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3774         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3775         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3776         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3777         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3778         t[row]   = b[row] - tmp1;
3779         t[row+1] = b[row+1] - tmp2;
3780         t[row+2] = b[row+2] - tmp3;
3781         t[row+3] = b[row+3] - tmp4; row += 4;
3782         cnt     += 16;
3783         break;
3784       case 5:
3785         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3786         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3787         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3788         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3789         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3790         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3791         t[row]   = b[row] - tmp1;
3792         t[row+1] = b[row+1] - tmp2;
3793         t[row+2] = b[row+2] - tmp3;
3794         t[row+3] = b[row+3] - tmp4;
3795         t[row+4] = b[row+4] - tmp5;row += 5;
3796         cnt     += 25;
3797         break;
3798       default:
3799         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3800       }
3801     }
3802     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3803 
3804     /*
3805           Apply (L + D)^-1 where D is the block diagonal
3806     */
3807     for (i=0, row=0; i<m; i++) {
3808       sz  = diag[row] - ii[row];
3809       v1  = a->a + ii[row];
3810       idx = a->j + ii[row];
3811       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3812       switch (sizes[i]) {
3813       case 1:
3814 
3815         sum1 = t[row];
3816         for (n = 0; n<sz-1; n+=2) {
3817           i1    = idx[0];
3818           i2    = idx[1];
3819           idx  += 2;
3820           tmp0  = t[i1];
3821           tmp1  = t[i2];
3822           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3823         }
3824 
3825         if (n == sz-1) {
3826           tmp0  = t[*idx];
3827           sum1 -= *v1 * tmp0;
3828         }
3829         x[row] += t[row] = sum1*(*ibdiag++); row++;
3830         break;
3831       case 2:
3832         v2   = a->a + ii[row+1];
3833         sum1 = t[row];
3834         sum2 = t[row+1];
3835         for (n = 0; n<sz-1; n+=2) {
3836           i1    = idx[0];
3837           i2    = idx[1];
3838           idx  += 2;
3839           tmp0  = t[i1];
3840           tmp1  = t[i2];
3841           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3842           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3843         }
3844 
3845         if (n == sz-1) {
3846           tmp0  = t[*idx];
3847           sum1 -= v1[0] * tmp0;
3848           sum2 -= v2[0] * tmp0;
3849         }
3850         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3851         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3852         ibdiag   += 4; row += 2;
3853         break;
3854       case 3:
3855         v2   = a->a + ii[row+1];
3856         v3   = a->a + ii[row+2];
3857         sum1 = t[row];
3858         sum2 = t[row+1];
3859         sum3 = t[row+2];
3860         for (n = 0; n<sz-1; n+=2) {
3861           i1    = idx[0];
3862           i2    = idx[1];
3863           idx  += 2;
3864           tmp0  = t[i1];
3865           tmp1  = t[i2];
3866           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3867           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3868           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3869         }
3870 
3871         if (n == sz-1) {
3872           tmp0  = t[*idx];
3873           sum1 -= v1[0] * tmp0;
3874           sum2 -= v2[0] * tmp0;
3875           sum3 -= v3[0] * tmp0;
3876         }
3877         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3878         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3879         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3880         ibdiag   += 9; row += 3;
3881         break;
3882       case 4:
3883         v2   = a->a + ii[row+1];
3884         v3   = a->a + ii[row+2];
3885         v4   = a->a + ii[row+3];
3886         sum1 = t[row];
3887         sum2 = t[row+1];
3888         sum3 = t[row+2];
3889         sum4 = t[row+3];
3890         for (n = 0; n<sz-1; n+=2) {
3891           i1    = idx[0];
3892           i2    = idx[1];
3893           idx  += 2;
3894           tmp0  = t[i1];
3895           tmp1  = t[i2];
3896           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3897           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3898           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3899           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3900         }
3901 
3902         if (n == sz-1) {
3903           tmp0  = t[*idx];
3904           sum1 -= v1[0] * tmp0;
3905           sum2 -= v2[0] * tmp0;
3906           sum3 -= v3[0] * tmp0;
3907           sum4 -= v4[0] * tmp0;
3908         }
3909         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3910         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3911         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3912         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3913         ibdiag   += 16; row += 4;
3914         break;
3915       case 5:
3916         v2   = a->a + ii[row+1];
3917         v3   = a->a + ii[row+2];
3918         v4   = a->a + ii[row+3];
3919         v5   = a->a + ii[row+4];
3920         sum1 = t[row];
3921         sum2 = t[row+1];
3922         sum3 = t[row+2];
3923         sum4 = t[row+3];
3924         sum5 = t[row+4];
3925         for (n = 0; n<sz-1; n+=2) {
3926           i1    = idx[0];
3927           i2    = idx[1];
3928           idx  += 2;
3929           tmp0  = t[i1];
3930           tmp1  = t[i2];
3931           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3932           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3933           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3934           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3935           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3936         }
3937 
3938         if (n == sz-1) {
3939           tmp0  = t[*idx];
3940           sum1 -= v1[0] * tmp0;
3941           sum2 -= v2[0] * tmp0;
3942           sum3 -= v3[0] * tmp0;
3943           sum4 -= v4[0] * tmp0;
3944           sum5 -= v5[0] * tmp0;
3945         }
3946         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3947         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3948         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3949         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3950         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3951         ibdiag   += 25; row += 5;
3952         break;
3953       default:
3954         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3955       }
3956     }
3957     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3958   }
3959   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3960   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3961   PetscFunctionReturn(0);
3962 }
3963 
3964 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3965 {
3966   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3967   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3968   const MatScalar   *bdiag = a->inode.bdiag;
3969   const PetscScalar *b;
3970   PetscErrorCode    ierr;
3971   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3972   const PetscInt    *sizes = a->inode.size;
3973 
3974   PetscFunctionBegin;
3975   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
3976   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3977   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3978   cnt  = 0;
3979   for (i=0, row=0; i<m; i++) {
3980     switch (sizes[i]) {
3981     case 1:
3982       x[row] = b[row]*bdiag[cnt++];row++;
3983       break;
3984     case 2:
3985       x1       = b[row]; x2 = b[row+1];
3986       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3987       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3988       x[row++] = tmp1;
3989       x[row++] = tmp2;
3990       cnt     += 4;
3991       break;
3992     case 3:
3993       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3994       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3995       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3996       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3997       x[row++] = tmp1;
3998       x[row++] = tmp2;
3999       x[row++] = tmp3;
4000       cnt     += 9;
4001       break;
4002     case 4:
4003       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4004       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4005       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4006       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4007       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4008       x[row++] = tmp1;
4009       x[row++] = tmp2;
4010       x[row++] = tmp3;
4011       x[row++] = tmp4;
4012       cnt     += 16;
4013       break;
4014     case 5:
4015       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4016       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4017       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4018       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4019       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4020       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4021       x[row++] = tmp1;
4022       x[row++] = tmp2;
4023       x[row++] = tmp3;
4024       x[row++] = tmp4;
4025       x[row++] = tmp5;
4026       cnt     += 25;
4027       break;
4028     default:
4029       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4030     }
4031   }
4032   ierr = PetscLogFlops(2.0*cnt);CHKERRQ(ierr);
4033   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4034   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4035   PetscFunctionReturn(0);
4036 }
4037 
4038 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4039 {
4040   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4041 
4042   PetscFunctionBegin;
4043   a->inode.node_count       = 0;
4044   a->inode.use              = PETSC_FALSE;
4045   a->inode.checked          = PETSC_FALSE;
4046   a->inode.mat_nonzerostate = -1;
4047   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4048   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4049   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4050   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4051   A->ops->coloringpatch     = NULL;
4052   A->ops->multdiagonalblock = NULL;
4053   if (A->factortype) {
4054     A->ops->solve = MatSolve_SeqAIJ_inplace;
4055   }
4056   PetscFunctionReturn(0);
4057 }
4058 
4059 /*
4060     samestructure indicates that the matrix has not changed its nonzero structure so we
4061     do not need to recompute the inodes
4062 */
4063 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4064 {
4065   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4066   PetscErrorCode ierr;
4067   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4068   PetscBool      flag;
4069   const PetscInt *idx,*idy,*ii;
4070 
4071   PetscFunctionBegin;
4072   if (!a->inode.use) {
4073     ierr = MatSeqAIJ_Inode_ResetOps(A);CHKERRQ(ierr);
4074     ierr = PetscFree(a->inode.size);CHKERRQ(ierr);
4075     PetscFunctionReturn(0);
4076   }
4077   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4078 
4079   m = A->rmap->n;
4080   if (!a->inode.size) { ierr = PetscMalloc1(m+1,&a->inode.size);CHKERRQ(ierr); }
4081   ns = a->inode.size;
4082 
4083   i          = 0;
4084   node_count = 0;
4085   idx        = a->j;
4086   ii         = a->i;
4087   while (i < m) {                /* For each row */
4088     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4089     /* Limits the number of elements in a node to 'a->inode.limit' */
4090     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4091       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4092       if (nzy != nzx) break;
4093       idy += nzx;              /* Same nonzero pattern */
4094       ierr = PetscArraycmp(idx,idy,nzx,&flag);CHKERRQ(ierr);
4095       if (!flag) break;
4096     }
4097     ns[node_count++] = blk_size;
4098     idx             += blk_size*nzx;
4099     i                = j;
4100   }
4101 
4102   /* If not enough inodes found,, do not use inode version of the routines */
4103   if (!m || node_count > .8*m) {
4104     ierr = MatSeqAIJ_Inode_ResetOps(A);CHKERRQ(ierr);
4105     ierr = PetscFree(a->inode.size);CHKERRQ(ierr);
4106     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4107   } else {
4108     if (!A->factortype) {
4109       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4110       if (A->rmap->n == A->cmap->n) {
4111         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4112         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4113         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4114         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4115         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4116       }
4117     } else {
4118       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4119     }
4120     a->inode.node_count = node_count;
4121     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4122   }
4123   a->inode.checked          = PETSC_TRUE;
4124   a->inode.mat_nonzerostate = A->nonzerostate;
4125   PetscFunctionReturn(0);
4126 }
4127 
4128 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4129 {
4130   Mat            B =*C;
4131   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4132   PetscErrorCode ierr;
4133   PetscInt       m=A->rmap->n;
4134 
4135   PetscFunctionBegin;
4136   c->inode.use              = a->inode.use;
4137   c->inode.limit            = a->inode.limit;
4138   c->inode.max_limit        = a->inode.max_limit;
4139   c->inode.checked          = PETSC_FALSE;
4140   c->inode.size             = NULL;
4141   c->inode.node_count       = 0;
4142   c->inode.ibdiagvalid      = PETSC_FALSE;
4143   c->inode.ibdiag           = NULL;
4144   c->inode.bdiag            = NULL;
4145   c->inode.mat_nonzerostate = -1;
4146   if (a->inode.use) {
4147     if (a->inode.checked && a->inode.size) {
4148       ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4149       ierr = PetscArraycpy(c->inode.size,a->inode.size,m+1);CHKERRQ(ierr);
4150 
4151       c->inode.checked          = PETSC_TRUE;
4152       c->inode.node_count       = a->inode.node_count;
4153       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4154     }
4155     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4156     if (!B->factortype) {
4157       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4158       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4159       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4160       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4161       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4162       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4163     } else {
4164       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4165     }
4166   }
4167   PetscFunctionReturn(0);
4168 }
4169 
4170 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4171 {
4172   PetscInt       k;
4173   const PetscInt *vi;
4174 
4175   PetscFunctionBegin;
4176   vi = aj + ai[row];
4177   for (k=0; k<nzl; k++) cols[k] = vi[k];
4178   vi        = aj + adiag[row];
4179   cols[nzl] = vi[0];
4180   vi        = aj + adiag[row+1]+1;
4181   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4182   PetscFunctionReturn(0);
4183 }
4184 /*
4185    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4186    Modified from MatSeqAIJCheckInode().
4187 
4188    Input Parameters:
4189 .  Mat A - ILU or LU matrix factor
4190 
4191 */
4192 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4193 {
4194   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4195   PetscErrorCode ierr;
4196   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4197   PetscInt       *cols1,*cols2,*ns;
4198   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4199   PetscBool      flag;
4200 
4201   PetscFunctionBegin;
4202   if (!a->inode.use)    PetscFunctionReturn(0);
4203   if (a->inode.checked) PetscFunctionReturn(0);
4204 
4205   m = A->rmap->n;
4206   if (a->inode.size) ns = a->inode.size;
4207   else {
4208     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4209   }
4210 
4211   i          = 0;
4212   node_count = 0;
4213   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4214   while (i < m) {                /* For each row */
4215     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4216     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4217     nzx  = nzl1 + nzu1 + 1;
4218     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4219 
4220     /* Limits the number of elements in a node to 'a->inode.limit' */
4221     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4222       nzl2 = ai[j+1] - ai[j];
4223       nzu2 = adiag[j] - adiag[j+1] - 1;
4224       nzy  = nzl2 + nzu2 + 1;
4225       if (nzy != nzx) break;
4226       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4227       ierr = PetscArraycmp(cols1,cols2,nzx,&flag);CHKERRQ(ierr);
4228       if (!flag) break;
4229     }
4230     ns[node_count++] = blk_size;
4231     i                = j;
4232   }
4233   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4234   /* If not enough inodes found,, do not use inode version of the routines */
4235   if (!m || node_count > .8*m) {
4236     ierr = PetscFree(ns);CHKERRQ(ierr);
4237 
4238     a->inode.node_count = 0;
4239     a->inode.size       = NULL;
4240     a->inode.use        = PETSC_FALSE;
4241 
4242     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4243   } else {
4244     A->ops->mult              = NULL;
4245     A->ops->sor               = NULL;
4246     A->ops->multadd           = NULL;
4247     A->ops->getrowij          = NULL;
4248     A->ops->restorerowij      = NULL;
4249     A->ops->getcolumnij       = NULL;
4250     A->ops->restorecolumnij   = NULL;
4251     A->ops->coloringpatch     = NULL;
4252     A->ops->multdiagonalblock = NULL;
4253     a->inode.node_count       = node_count;
4254     a->inode.size             = ns;
4255 
4256     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4257   }
4258   a->inode.checked = PETSC_TRUE;
4259   PetscFunctionReturn(0);
4260 }
4261 
4262 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4263 {
4264   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4265 
4266   PetscFunctionBegin;
4267   a->inode.ibdiagvalid = PETSC_FALSE;
4268   PetscFunctionReturn(0);
4269 }
4270 
4271 /*
4272      This is really ugly. if inodes are used this replaces the
4273   permutations with ones that correspond to rows/cols of the matrix
4274   rather then inode blocks
4275 */
4276 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4277 {
4278   PetscErrorCode ierr;
4279 
4280   PetscFunctionBegin;
4281   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4282   PetscFunctionReturn(0);
4283 }
4284 
4285 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4286 {
4287   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4288   PetscErrorCode ierr;
4289   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4290   const PetscInt *ridx,*cidx;
4291   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4292   PetscInt       nslim_col,*ns_col;
4293   IS             ris = *rperm,cis = *cperm;
4294 
4295   PetscFunctionBegin;
4296   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4297   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4298 
4299   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4300   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4301   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4302 
4303   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4304   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4305 
4306   /* Form the inode structure for the rows of permuted matric using inv perm*/
4307   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4308 
4309   /* Construct the permutations for rows*/
4310   for (i=0,row = 0; i<nslim_row; ++i) {
4311     indx      = ridx[i];
4312     start_val = tns[indx];
4313     end_val   = tns[indx + 1];
4314     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4315   }
4316 
4317   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4318   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4319 
4320   /* Construct permutations for columns */
4321   for (i=0,col=0; i<nslim_col; ++i) {
4322     indx      = cidx[i];
4323     start_val = tns[indx];
4324     end_val   = tns[indx + 1];
4325     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4326   }
4327 
4328   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4329   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4330   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4331   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4332 
4333   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4334   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4335 
4336   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4337   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4338   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4339   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4340   ierr = PetscFree(tns);CHKERRQ(ierr);
4341   PetscFunctionReturn(0);
4342 }
4343 
4344 /*@C
4345    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4346 
4347    Not Collective
4348 
4349    Input Parameter:
4350 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4351 
4352    Output Parameter:
4353 +  node_count - no of inodes present in the matrix.
4354 .  sizes      - an array of size node_count,with sizes of each inode.
4355 -  limit      - the max size used to generate the inodes.
4356 
4357    Level: advanced
4358 
4359    Notes:
4360     This routine returns some internal storage information
4361    of the matrix, it is intended to be used by advanced users.
4362    It should be called after the matrix is assembled.
4363    The contents of the sizes[] array should not be changed.
4364    NULL may be passed for information not requested.
4365 
4366 .seealso: MatGetInfo()
4367 @*/
4368 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4369 {
4370   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4371 
4372   PetscFunctionBegin;
4373   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4374   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4375   if (f) {
4376     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4377   }
4378   PetscFunctionReturn(0);
4379 }
4380 
4381 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4382 {
4383   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4384 
4385   PetscFunctionBegin;
4386   if (node_count) *node_count = a->inode.node_count;
4387   if (sizes)      *sizes      = a->inode.size;
4388   if (limit)      *limit      = a->inode.limit;
4389   PetscFunctionReturn(0);
4390 }
4391