xref: /petsc/src/mat/impls/aij/seq/inode.c (revision eca87e8d15a60b86ea90ec43a90a0e7530e65b09)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *pc,*pc1,*pc2,*pc3,mul1,mul2,mul3,*pv,*rtmp1,*rtmp2,*rtmp3;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3;
1195   FactorShiftCtx   sctx;
1196   const PetscInt   *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,node_max,col;
1200   const PetscInt   *ns;
1201   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1202 
1203   PetscFunctionBegin;
1204   /* MatPivotSetUp(): initialize shift context sctx */
1205   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1206 
1207   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1208     ddiag          = a->diag;
1209     sctx.shift_top = info->zeropivot;
1210     for (i=0; i<n; i++) {
1211       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1212       d  = (aa)[ddiag[i]];
1213       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1214       v  = aa+ai[i];
1215       nz = ai[i+1] - ai[i];
1216       for (j=0; j<nz; j++)
1217 	rs += PetscAbsScalar(v[j]);
1218       if (rs>sctx.shift_top) sctx.shift_top = rs;
1219     }
1220     sctx.shift_top   *= 1.1;
1221     sctx.nshift_max   = 5;
1222     sctx.shift_lo     = 0.;
1223     sctx.shift_hi     = 1.;
1224   }
1225 
1226   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1227   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1228 
1229   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1230   ierr  = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1231   rtmp2 = rtmp1 + n;
1232   rtmp3 = rtmp2 + n;
1233   ics   = ic;
1234 
1235   node_max = a->inode.node_count;
1236   ns       = a->inode.size;
1237   if (!ns){
1238     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1239   }
1240 
1241   /* If max inode size > 3, split it into two inodes.*/
1242   /* also map the inode sizes according to the ordering */
1243   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1244   for (i=0,j=0; i<node_max; ++i,++j){
1245     if (ns[i]>3) {
1246       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1247       ++j;
1248       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1249     } else {
1250       tmp_vec1[j] = ns[i];
1251     }
1252   }
1253   /* Use the correct node_max */
1254   node_max = j;
1255 
1256   /* Now reorder the inode info based on mat re-ordering info */
1257   /* First create a row -> inode_size_array_index map */
1258   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1259   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1260   for (i=0,row=0; i<node_max; i++) {
1261     nodesz = tmp_vec1[i];
1262     for (j=0; j<nodesz; j++,row++) {
1263       nsmap[row] = i;
1264     }
1265   }
1266   /* Using nsmap, create a reordered ns structure */
1267   for (i=0,j=0; i< node_max; i++) {
1268     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1269     tmp_vec2[i]  = nodesz;
1270     j           += nodesz;
1271   }
1272   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1273   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1274 
1275   /* Now use the correct ns */
1276   ns = tmp_vec2;
1277 
1278   do {
1279     sctx.useshift = PETSC_FALSE;
1280     /* Now loop over each block-row, and do the factorization */
1281     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1282       nodesz = ns[inod];
1283 
1284       switch (nodesz){
1285       case 1:
1286       /*----------*/
1287         /* zero rtmp1 */
1288         /* L part */
1289         nz    = bi[i+1] - bi[i];
1290         bjtmp = bj + bi[i];
1291         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1292 
1293         /* U part */
1294         nz = bdiag[i]-bdiag[i+1];
1295         bjtmp = bj + bdiag[i+1]+1;
1296         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1297 
1298         /* load in initial (unfactored row) */
1299         nz    = ai[r[i]+1] - ai[r[i]];
1300         ajtmp = aj + ai[r[i]];
1301         v     = aa + ai[r[i]];
1302         for (j=0; j<nz; j++) {
1303           rtmp1[ics[ajtmp[j]]] = v[j];
1304         }
1305         /* ZeropivotApply() */
1306         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1307 
1308         /* elimination */
1309         bjtmp = bj + bi[i];
1310         row   = *bjtmp++;
1311         nzL   = bi[i+1] - bi[i];
1312         for(k=0; k < nzL;k++) {
1313           pc = rtmp1 + row;
1314           if (*pc != 0.0) {
1315             pv   = b->a + bdiag[row];
1316             mul1 = *pc * (*pv);
1317             *pc  = mul1;
1318             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1319             pv = b->a + bdiag[row+1]+1;
1320             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1321             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1322             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1323           }
1324           row = *bjtmp++;
1325         }
1326 
1327         /* finished row so stick it into b->a */
1328         rs = 0.0;
1329         /* L part */
1330         pv = b->a + bi[i] ;
1331         pj = b->j + bi[i] ;
1332         nz = bi[i+1] - bi[i];
1333         for (j=0; j<nz; j++) {
1334           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1335         }
1336 
1337         /* U part */
1338         pv = b->a + bdiag[i+1]+1;
1339         pj = b->j + bdiag[i+1]+1;
1340         nz = bdiag[i] - bdiag[i+1]-1;
1341         for (j=0; j<nz; j++) {
1342           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1343         }
1344 
1345         /* Check zero pivot */
1346         sctx.rs = rs;
1347         sctx.pv = rtmp1[i];
1348         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1349 
1350         /* Mark diagonal and invert diagonal for simplier triangular solves */
1351         pv  = b->a + bdiag[i];
1352         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1353         break;
1354 
1355       case 2:
1356       /*----------*/
1357         /* zero rtmp1 and rtmp2 */
1358         /* L part */
1359         nz    = bi[i+1] - bi[i];
1360         bjtmp = bj + bi[i];
1361         for  (j=0; j<nz; j++) {
1362           col = bjtmp[j];
1363           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1364         }
1365 
1366         /* U part */
1367         nz = bdiag[i]-bdiag[i+1];
1368         bjtmp = bj + bdiag[i+1]+1;
1369         for  (j=0; j<nz; j++) {
1370           col = bjtmp[j];
1371           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1372         }
1373 
1374         /* load in initial (unfactored row) */
1375         nz    = ai[r[i]+1] - ai[r[i]];
1376         ajtmp = aj + ai[r[i]];
1377         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1378         for (j=0; j<nz; j++) {
1379           col = ics[ajtmp[j]];
1380           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1381         }
1382         /* ZeropivotApply(): shift the diagonal of the matrix  */
1383         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1384 
1385         /* elimination */
1386         bjtmp = bj + bi[i];
1387         row   = *bjtmp++; /* pivot row */
1388         nzL   = bi[i+1] - bi[i];
1389         for(k=0; k < nzL;k++) {
1390           pc1 = rtmp1 + row;
1391           pc2 = rtmp2 + row;
1392           if (*pc1 != 0.0 || *pc2 != 0.0) {
1393             pv   = b->a + bdiag[row];
1394             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1395             *pc1 = mul1;       *pc2 = mul2;
1396 
1397             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1398             pv = b->a + bdiag[row+1]+1;
1399             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1400             for (j=0; j<nz; j++){
1401               col = pj[j];
1402               rtmp1[col] -= mul1 * pv[j];
1403               rtmp2[col] -= mul2 * pv[j];
1404             }
1405             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1406           }
1407           row = *bjtmp++;
1408         }
1409 
1410         /* finished row i; check zero pivot, then stick row i into b->a */
1411         rs  = 0.0;
1412         /* L part */
1413         pc1 = b->a + bi[i];
1414         pj  = b->j + bi[i] ;
1415         nz  = bi[i+1] - bi[i];
1416         for (j=0; j<nz; j++) {
1417           col = pj[j];
1418           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1419         }
1420         /* U part */
1421         pc1 = b->a + bdiag[i+1]+1;
1422         pj  = b->j + bdiag[i+1]+1;
1423         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1424         for (j=0; j<nz; j++) {
1425           col = pj[j];
1426           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1427         }
1428 
1429         sctx.rs  = rs;
1430         sctx.pv  = rtmp1[i];
1431         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1432         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1433         *pc1 = 1.0/sctx.pv;
1434 
1435         /* Now take care of diagonal 2x2 block. */
1436         pc2 = rtmp2 + i;
1437         if (*pc2 != 0.0){
1438           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1439           *pc2 = mul1;          /* insert L entry */
1440           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1441           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1442           for (j=0; j<nz; j++) {
1443             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1444           }
1445           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1446         }
1447 
1448         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1449         rs = 0.0;
1450         /* L part */
1451         pc2 = b->a + bi[i+1];
1452         pj  = b->j + bi[i+1] ;
1453         nz  = bi[i+2] - bi[i+1];
1454         for (j=0; j<nz; j++) {
1455           col = pj[j];
1456           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457         }
1458         /* U part */
1459         pc2 = b->a + bdiag[i+2]+1;
1460         pj  = b->j + bdiag[i+2]+1;
1461         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1462         for (j=0; j<nz; j++) {
1463           col = pj[j];
1464           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1465         }
1466 
1467         sctx.rs  = rs;
1468         sctx.pv  = rtmp2[i+1];
1469         ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr);
1470         pc2  = b->a + bdiag[i+1];
1471         *pc2 = 1.0/sctx.pv;
1472         break;
1473 
1474       case 3:
1475       /*----------*/
1476         /* zero rtmp */
1477         /* L part */
1478         nz    = bi[i+1] - bi[i];
1479         bjtmp = bj + bi[i];
1480         for  (j=0; j<nz; j++) {
1481           col = bjtmp[j];
1482           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1483         }
1484 
1485         /* U part */
1486         nz = bdiag[i]-bdiag[i+1];
1487         bjtmp = bj + bdiag[i+1]+1;
1488         for  (j=0; j<nz; j++) {
1489           col = bjtmp[j];
1490           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1491         }
1492 
1493         /* load in initial (unfactored row) */
1494         nz    = ai[r[i]+1] - ai[r[i]];
1495         ajtmp = aj + ai[r[i]];
1496         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1497         for (j=0; j<nz; j++) {
1498           col = ics[ajtmp[j]];
1499           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1500         }
1501         /* ZeropivotApply(): shift the diagonal of the matrix  */
1502         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1503 
1504         /* elimination */
1505         bjtmp = bj + bi[i];
1506         row   = *bjtmp++; /* pivot row */
1507         nzL   = bi[i+1] - bi[i];
1508         for(k=0; k < nzL;k++) {
1509           pc1 = rtmp1 + row;
1510           pc2 = rtmp2 + row;
1511           pc3 = rtmp3 + row;
1512           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1513             pv  = b->a + bdiag[row];
1514             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1515             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1516 
1517             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1518             pv = b->a + bdiag[row+1]+1;
1519             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1520             for (j=0; j<nz; j++){
1521               col = pj[j];
1522               rtmp1[col] -= mul1 * pv[j];
1523               rtmp2[col] -= mul2 * pv[j];
1524               rtmp3[col] -= mul3 * pv[j];
1525             }
1526             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1527           }
1528           row = *bjtmp++;
1529         }
1530 
1531         /* finished row i; check zero pivot, then stick row i into b->a */
1532         rs  = 0.0;
1533         /* L part */
1534         pc1 = b->a + bi[i];
1535         pj  = b->j + bi[i] ;
1536         nz  = bi[i+1] - bi[i];
1537         for (j=0; j<nz; j++) {
1538           col = pj[j];
1539           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1540         }
1541         /* U part */
1542         pc1 = b->a + bdiag[i+1]+1;
1543         pj  = b->j + bdiag[i+1]+1;
1544         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1545         for (j=0; j<nz; j++) {
1546           col = pj[j];
1547           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1548         }
1549 
1550         sctx.rs  = rs;
1551         sctx.pv  = rtmp1[i];
1552         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1553         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1554         *pc1 = 1.0/sctx.pv;
1555 
1556         /* Now take care of 1st column of diagonal 3x3 block. */
1557         pc2 = rtmp2 + i;
1558         pc3 = rtmp3 + i;
1559         if (*pc2 != 0.0 || *pc3 != 0.0){
1560           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1561           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1562           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1563           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1564           for (j=0; j<nz; j++) {
1565             col = pj[j];
1566             rtmp2[col] -= mul2 * rtmp1[col];
1567             rtmp3[col] -= mul3 * rtmp1[col];
1568           }
1569           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1570         }
1571 
1572         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1573         rs = 0.0;
1574         /* L part */
1575         pc2 = b->a + bi[i+1];
1576         pj  = b->j + bi[i+1] ;
1577         nz  = bi[i+2] - bi[i+1];
1578         for (j=0; j<nz; j++) {
1579           col = pj[j];
1580           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1581         }
1582         /* U part */
1583         pc2 = b->a + bdiag[i+2]+1;
1584         pj  = b->j + bdiag[i+2]+1;
1585         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1586         for (j=0; j<nz; j++) {
1587           col = pj[j];
1588           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1589         }
1590 
1591         sctx.rs  = rs;
1592         sctx.pv  = rtmp2[i+1];
1593         ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr);
1594         pc2  = b->a + bdiag[i+1];
1595         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1596 
1597         /* Now take care of 2nd column of diagonal 3x3 block. */
1598         pc3 = rtmp3 + i+1;
1599         if (*pc3 != 0.0){
1600           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1601           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1602           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1603           for (j=0; j<nz; j++) {
1604             col = pj[j];
1605             rtmp3[col] -= mul3 * rtmp2[col];
1606           }
1607           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1608         }
1609 
1610         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1611         rs = 0.0;
1612         /* L part */
1613         pc3 = b->a + bi[i+2];
1614         pj  = b->j + bi[i+2] ;
1615         nz  = bi[i+3] - bi[i+2];
1616         for (j=0; j<nz; j++) {
1617           col = pj[j];
1618           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1619         }
1620         /* U part */
1621         pc3 = b->a + bdiag[i+3]+1;
1622         pj  = b->j + bdiag[i+3]+1;
1623         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1624         for (j=0; j<nz; j++) {
1625           col = pj[j];
1626           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1627         }
1628 
1629         sctx.rs  = rs;
1630         sctx.pv  = rtmp3[i+2];
1631         ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr);
1632         pc3  = b->a + bdiag[i+2];
1633         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1634         break;
1635 
1636       default:
1637         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
1638       }
1639       i += nodesz;                 /* Update the row */
1640     }
1641 
1642     /* MatPivotRefine() */
1643     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1644       /*
1645        * if no shift in this attempt & shifting & started shifting & can refine,
1646        * then try lower shift
1647        */
1648       sctx.shift_hi       = sctx.shift_fraction;
1649       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1650       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1651       sctx.useshift       = PETSC_TRUE;
1652       sctx.nshift++;
1653     }
1654   } while (sctx.useshift);
1655 
1656   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1657   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1658   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1659   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1660 
1661   C->ops->solve              = MatSolve_SeqAIJ;
1662   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1663   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1664   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1665   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1666   C->assembled    = PETSC_TRUE;
1667   C->preallocated = PETSC_TRUE;
1668   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1669 
1670   /* MatShiftView(A,info,&sctx) */
1671   if (sctx.nshift){
1672     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1673       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1674     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1675       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1676     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1677       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1678     }
1679   }
1680   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1681   PetscFunctionReturn(0);
1682 }
1683 
1684 #undef __FUNCT__
1685 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1686 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1687 {
1688   Mat               C = B;
1689   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1690   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1691   PetscErrorCode    ierr;
1692   const PetscInt    *r,*ic,*c,*ics;
1693   PetscInt          n = A->rmap->n,*bi = b->i;
1694   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1695   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1696   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1697   PetscScalar       mul1,mul2,mul3,tmp;
1698   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1699   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1700   PetscReal         rs=0.0;
1701   FactorShiftCtx    sctx;
1702   PetscInt          newshift;
1703 
1704   PetscFunctionBegin;
1705   sctx.shift_top      = 0;
1706   sctx.nshift_max     = 0;
1707   sctx.shift_lo       = 0;
1708   sctx.shift_hi       = 0;
1709   sctx.shift_fraction = 0;
1710 
1711   /* if both shift schemes are chosen by user, only use info->shiftpd */
1712   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1713     sctx.shift_top = 0;
1714     for (i=0; i<n; i++) {
1715       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1716       rs    = 0.0;
1717       ajtmp = aj + ai[i];
1718       rtmp1 = aa + ai[i];
1719       nz = ai[i+1] - ai[i];
1720       for (j=0; j<nz; j++){
1721         if (*ajtmp != i){
1722           rs += PetscAbsScalar(*rtmp1++);
1723         } else {
1724           rs -= PetscRealPart(*rtmp1++);
1725         }
1726         ajtmp++;
1727       }
1728       if (rs>sctx.shift_top) sctx.shift_top = rs;
1729     }
1730     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1731     sctx.shift_top *= 1.1;
1732     sctx.nshift_max = 5;
1733     sctx.shift_lo   = 0.;
1734     sctx.shift_hi   = 1.;
1735   }
1736   sctx.shift_amount = 0;
1737   sctx.nshift       = 0;
1738 
1739   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1740   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1741   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1742   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1743   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1744   ics   = ic ;
1745   rtmp22 = rtmp11 + n;
1746   rtmp33 = rtmp22 + n;
1747 
1748   node_max = a->inode.node_count;
1749   ns       = a->inode.size;
1750   if (!ns){
1751     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1752   }
1753 
1754   /* If max inode size > 3, split it into two inodes.*/
1755   /* also map the inode sizes according to the ordering */
1756   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1757   for (i=0,j=0; i<node_max; ++i,++j){
1758     if (ns[i]>3) {
1759       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1760       ++j;
1761       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1762     } else {
1763       tmp_vec1[j] = ns[i];
1764     }
1765   }
1766   /* Use the correct node_max */
1767   node_max = j;
1768 
1769   /* Now reorder the inode info based on mat re-ordering info */
1770   /* First create a row -> inode_size_array_index map */
1771   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1772   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1773   for (i=0,row=0; i<node_max; i++) {
1774     nodesz = tmp_vec1[i];
1775     for (j=0; j<nodesz; j++,row++) {
1776       nsmap[row] = i;
1777     }
1778   }
1779   /* Using nsmap, create a reordered ns structure */
1780   for (i=0,j=0; i< node_max; i++) {
1781     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1782     tmp_vec2[i]  = nodesz;
1783     j           += nodesz;
1784   }
1785   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1786   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1787   /* Now use the correct ns */
1788   ns = tmp_vec2;
1789 
1790   do {
1791     sctx.useshift = PETSC_FALSE;
1792     /* Now loop over each block-row, and do the factorization */
1793     for (i=0,row=0; i<node_max; i++) {
1794       nodesz = ns[i];
1795       nz     = bi[row+1] - bi[row];
1796       bjtmp  = bj + bi[row];
1797 
1798       switch (nodesz){
1799       case 1:
1800         for  (j=0; j<nz; j++){
1801           idx        = bjtmp[j];
1802           rtmp11[idx] = 0.0;
1803         }
1804 
1805         /* load in initial (unfactored row) */
1806         idx    = r[row];
1807         nz_tmp = ai[idx+1] - ai[idx];
1808         ajtmp  = aj + ai[idx];
1809         v1     = aa + ai[idx];
1810 
1811         for (j=0; j<nz_tmp; j++) {
1812           idx        = ics[ajtmp[j]];
1813           rtmp11[idx] = v1[j];
1814         }
1815         rtmp11[ics[r[row]]] += sctx.shift_amount;
1816 
1817         prow = *bjtmp++ ;
1818         while (prow < row) {
1819           pc1 = rtmp11 + prow;
1820           if (*pc1 != 0.0){
1821             pv   = ba + bd[prow];
1822             pj   = nbj + bd[prow];
1823             mul1 = *pc1 * *pv++;
1824             *pc1 = mul1;
1825             nz_tmp = bi[prow+1] - bd[prow] - 1;
1826             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1827             for (j=0; j<nz_tmp; j++) {
1828               tmp = pv[j];
1829               idx = pj[j];
1830               rtmp11[idx] -= mul1 * tmp;
1831             }
1832           }
1833           prow = *bjtmp++ ;
1834         }
1835         pj  = bj + bi[row];
1836         pc1 = ba + bi[row];
1837 
1838         sctx.pv    = rtmp11[row];
1839         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1840         rs         = 0.0;
1841         for (j=0; j<nz; j++) {
1842           idx    = pj[j];
1843           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1844           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1845         }
1846         sctx.rs  = rs;
1847         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1848         if (newshift == 1) goto endofwhile;
1849         break;
1850 
1851       case 2:
1852         for (j=0; j<nz; j++) {
1853           idx        = bjtmp[j];
1854           rtmp11[idx] = 0.0;
1855           rtmp22[idx] = 0.0;
1856         }
1857 
1858         /* load in initial (unfactored row) */
1859         idx    = r[row];
1860         nz_tmp = ai[idx+1] - ai[idx];
1861         ajtmp  = aj + ai[idx];
1862         v1     = aa + ai[idx];
1863         v2     = aa + ai[idx+1];
1864         for (j=0; j<nz_tmp; j++) {
1865           idx        = ics[ajtmp[j]];
1866           rtmp11[idx] = v1[j];
1867           rtmp22[idx] = v2[j];
1868         }
1869         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1870         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1871 
1872         prow = *bjtmp++ ;
1873         while (prow < row) {
1874           pc1 = rtmp11 + prow;
1875           pc2 = rtmp22 + prow;
1876           if (*pc1 != 0.0 || *pc2 != 0.0){
1877             pv   = ba + bd[prow];
1878             pj   = nbj + bd[prow];
1879             mul1 = *pc1 * *pv;
1880             mul2 = *pc2 * *pv;
1881             ++pv;
1882             *pc1 = mul1;
1883             *pc2 = mul2;
1884 
1885             nz_tmp = bi[prow+1] - bd[prow] - 1;
1886             for (j=0; j<nz_tmp; j++) {
1887               tmp = pv[j];
1888               idx = pj[j];
1889               rtmp11[idx] -= mul1 * tmp;
1890               rtmp22[idx] -= mul2 * tmp;
1891             }
1892             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1893           }
1894           prow = *bjtmp++ ;
1895         }
1896 
1897         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1898         pc1 = rtmp11 + prow;
1899         pc2 = rtmp22 + prow;
1900 
1901         sctx.pv = *pc1;
1902         pj      = bj + bi[prow];
1903         rs      = 0.0;
1904         for (j=0; j<nz; j++){
1905           idx = pj[j];
1906           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1907         }
1908         sctx.rs = rs;
1909         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1910         if (newshift == 1) goto endofwhile;
1911 
1912         if (*pc2 != 0.0){
1913           pj     = nbj + bd[prow];
1914           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1915           *pc2   = mul2;
1916           nz_tmp = bi[prow+1] - bd[prow] - 1;
1917           for (j=0; j<nz_tmp; j++) {
1918             idx = pj[j] ;
1919             tmp = rtmp11[idx];
1920             rtmp22[idx] -= mul2 * tmp;
1921           }
1922           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1923         }
1924 
1925         pj  = bj + bi[row];
1926         pc1 = ba + bi[row];
1927         pc2 = ba + bi[row+1];
1928 
1929         sctx.pv = rtmp22[row+1];
1930         rs = 0.0;
1931         rtmp11[row]   = 1.0/rtmp11[row];
1932         rtmp22[row+1] = 1.0/rtmp22[row+1];
1933         /* copy row entries from dense representation to sparse */
1934         for (j=0; j<nz; j++) {
1935           idx    = pj[j];
1936           pc1[j] = rtmp11[idx];
1937           pc2[j] = rtmp22[idx];
1938           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1939         }
1940         sctx.rs = rs;
1941         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1942         if (newshift == 1) goto endofwhile;
1943         break;
1944 
1945       case 3:
1946         for  (j=0; j<nz; j++) {
1947           idx        = bjtmp[j];
1948           rtmp11[idx] = 0.0;
1949           rtmp22[idx] = 0.0;
1950           rtmp33[idx] = 0.0;
1951         }
1952         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1953         idx    = r[row];
1954         nz_tmp = ai[idx+1] - ai[idx];
1955         ajtmp = aj + ai[idx];
1956         v1    = aa + ai[idx];
1957         v2    = aa + ai[idx+1];
1958         v3    = aa + ai[idx+2];
1959         for (j=0; j<nz_tmp; j++) {
1960           idx        = ics[ajtmp[j]];
1961           rtmp11[idx] = v1[j];
1962           rtmp22[idx] = v2[j];
1963           rtmp33[idx] = v3[j];
1964         }
1965         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1966         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1967         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1968 
1969         /* loop over all pivot row blocks above this row block */
1970         prow = *bjtmp++ ;
1971         while (prow < row) {
1972           pc1 = rtmp11 + prow;
1973           pc2 = rtmp22 + prow;
1974           pc3 = rtmp33 + prow;
1975           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1976             pv   = ba  + bd[prow];
1977             pj   = nbj + bd[prow];
1978             mul1 = *pc1 * *pv;
1979             mul2 = *pc2 * *pv;
1980             mul3 = *pc3 * *pv;
1981             ++pv;
1982             *pc1 = mul1;
1983             *pc2 = mul2;
1984             *pc3 = mul3;
1985 
1986             nz_tmp = bi[prow+1] - bd[prow] - 1;
1987             /* update this row based on pivot row */
1988             for (j=0; j<nz_tmp; j++) {
1989               tmp = pv[j];
1990               idx = pj[j];
1991               rtmp11[idx] -= mul1 * tmp;
1992               rtmp22[idx] -= mul2 * tmp;
1993               rtmp33[idx] -= mul3 * tmp;
1994             }
1995             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1996           }
1997           prow = *bjtmp++ ;
1998         }
1999 
2000         /* Now take care of diagonal 3x3 block in this set of rows */
2001         /* note: prow = row here */
2002         pc1 = rtmp11 + prow;
2003         pc2 = rtmp22 + prow;
2004         pc3 = rtmp33 + prow;
2005 
2006         sctx.pv = *pc1;
2007         pj      = bj + bi[prow];
2008         rs      = 0.0;
2009         for (j=0; j<nz; j++){
2010           idx = pj[j];
2011           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2012         }
2013         sctx.rs = rs;
2014         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2015         if (newshift == 1) goto endofwhile;
2016 
2017         if (*pc2 != 0.0 || *pc3 != 0.0){
2018           mul2 = (*pc2)/(*pc1);
2019           mul3 = (*pc3)/(*pc1);
2020           *pc2 = mul2;
2021           *pc3 = mul3;
2022           nz_tmp = bi[prow+1] - bd[prow] - 1;
2023           pj     = nbj + bd[prow];
2024           for (j=0; j<nz_tmp; j++) {
2025             idx = pj[j] ;
2026             tmp = rtmp11[idx];
2027             rtmp22[idx] -= mul2 * tmp;
2028             rtmp33[idx] -= mul3 * tmp;
2029           }
2030           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2031         }
2032         ++prow;
2033 
2034         pc2 = rtmp22 + prow;
2035         pc3 = rtmp33 + prow;
2036         sctx.pv = *pc2;
2037         pj      = bj + bi[prow];
2038         rs      = 0.0;
2039         for (j=0; j<nz; j++){
2040           idx = pj[j];
2041           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2042         }
2043         sctx.rs = rs;
2044         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
2045         if (newshift == 1) goto endofwhile;
2046 
2047         if (*pc3 != 0.0){
2048           mul3   = (*pc3)/(*pc2);
2049           *pc3   = mul3;
2050           pj     = nbj + bd[prow];
2051           nz_tmp = bi[prow+1] - bd[prow] - 1;
2052           for (j=0; j<nz_tmp; j++) {
2053             idx = pj[j] ;
2054             tmp = rtmp22[idx];
2055             rtmp33[idx] -= mul3 * tmp;
2056           }
2057           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2058         }
2059 
2060         pj  = bj + bi[row];
2061         pc1 = ba + bi[row];
2062         pc2 = ba + bi[row+1];
2063         pc3 = ba + bi[row+2];
2064 
2065         sctx.pv = rtmp33[row+2];
2066         rs = 0.0;
2067         rtmp11[row]   = 1.0/rtmp11[row];
2068         rtmp22[row+1] = 1.0/rtmp22[row+1];
2069         rtmp33[row+2] = 1.0/rtmp33[row+2];
2070         /* copy row entries from dense representation to sparse */
2071         for (j=0; j<nz; j++) {
2072           idx    = pj[j];
2073           pc1[j] = rtmp11[idx];
2074           pc2[j] = rtmp22[idx];
2075           pc3[j] = rtmp33[idx];
2076           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2077         }
2078 
2079         sctx.rs = rs;
2080         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
2081         if (newshift == 1) goto endofwhile;
2082         break;
2083 
2084       default:
2085         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
2086       }
2087       row += nodesz;                 /* Update the row */
2088     }
2089     endofwhile:;
2090   } while (sctx.useshift);
2091   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2092   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2093   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2094   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2095   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2096   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2097   /* do not set solve add, since MatSolve_Inode + Add is faster */
2098   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2099   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2100   C->assembled   = PETSC_TRUE;
2101   C->preallocated = PETSC_TRUE;
2102   if (sctx.nshift) {
2103     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2104       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2105     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2106       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2107     }
2108   }
2109   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2110   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2111   PetscFunctionReturn(0);
2112 }
2113 
2114 
2115 /* ----------------------------------------------------------- */
2116 #undef __FUNCT__
2117 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2118 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2119 {
2120   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2121   IS                iscol = a->col,isrow = a->row;
2122   PetscErrorCode    ierr;
2123   const PetscInt    *r,*c,*rout,*cout;
2124   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2125   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2126   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2127   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2128   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2129   const PetscScalar *b;
2130 
2131   PetscFunctionBegin;
2132   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
2133   node_max = a->inode.node_count;
2134   ns       = a->inode.size;     /* Node Size array */
2135 
2136   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2137   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2138   tmp  = a->solve_work;
2139 
2140   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2141   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2142 
2143   /* forward solve the lower triangular */
2144   tmps = tmp ;
2145   aa   = a_a ;
2146   aj   = a_j ;
2147   ad   = a->diag;
2148 
2149   for (i = 0,row = 0; i< node_max; ++i){
2150     nsz = ns[i];
2151     aii = ai[row];
2152     v1  = aa + aii;
2153     vi  = aj + aii;
2154     nz  = ai[row+1]- ai[row];
2155 
2156     if (i < node_max-1) {
2157       /* Prefetch the indices for the next block */
2158       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2159       /* Prefetch the data for the next block */
2160       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2161     }
2162 
2163     switch (nsz){               /* Each loop in 'case' is unrolled */
2164     case 1 :
2165       sum1 = b[r[row]];
2166       for(j=0; j<nz-1; j+=2){
2167         i0   = vi[j];
2168         i1   = vi[j+1];
2169         tmp0 = tmps[i0];
2170         tmp1 = tmps[i1];
2171         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2172       }
2173       if(j == nz-1){
2174         tmp0 = tmps[vi[j]];
2175         sum1 -= v1[j]*tmp0;
2176       }
2177       tmp[row++]=sum1;
2178       break;
2179     case 2:
2180       sum1 = b[r[row]];
2181       sum2 = b[r[row+1]];
2182       v2   = aa + ai[row+1];
2183 
2184       for(j=0; j<nz-1; j+=2){
2185         i0   = vi[j];
2186         i1   = vi[j+1];
2187         tmp0 = tmps[i0];
2188         tmp1 = tmps[i1];
2189         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2190         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2191       }
2192       if(j == nz-1){
2193         tmp0 = tmps[vi[j]];
2194         sum1 -= v1[j] *tmp0;
2195         sum2 -= v2[j] *tmp0;
2196       }
2197       sum2 -= v2[nz] * sum1;
2198       tmp[row ++]=sum1;
2199       tmp[row ++]=sum2;
2200       break;
2201     case 3:
2202       sum1 = b[r[row]];
2203       sum2 = b[r[row+1]];
2204       sum3 = b[r[row+2]];
2205       v2   = aa + ai[row+1];
2206       v3   = aa + ai[row+2];
2207 
2208       for (j=0; j<nz-1; j+=2){
2209         i0   = vi[j];
2210         i1   = vi[j+1];
2211         tmp0 = tmps[i0];
2212         tmp1 = tmps[i1];
2213         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2214         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2215         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2216       }
2217       if (j == nz-1){
2218         tmp0 = tmps[vi[j]];
2219         sum1 -= v1[j] *tmp0;
2220         sum2 -= v2[j] *tmp0;
2221         sum3 -= v3[j] *tmp0;
2222       }
2223       sum2 -= v2[nz] * sum1;
2224       sum3 -= v3[nz] * sum1;
2225       sum3 -= v3[nz+1] * sum2;
2226       tmp[row ++]=sum1;
2227       tmp[row ++]=sum2;
2228       tmp[row ++]=sum3;
2229       break;
2230 
2231     case 4:
2232       sum1 = b[r[row]];
2233       sum2 = b[r[row+1]];
2234       sum3 = b[r[row+2]];
2235       sum4 = b[r[row+3]];
2236       v2   = aa + ai[row+1];
2237       v3   = aa + ai[row+2];
2238       v4   = aa + ai[row+3];
2239 
2240       for (j=0; j<nz-1; j+=2){
2241         i0   = vi[j];
2242         i1   = vi[j+1];
2243         tmp0 = tmps[i0];
2244         tmp1 = tmps[i1];
2245         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2246         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2247         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2248         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2249       }
2250       if (j == nz-1){
2251         tmp0 = tmps[vi[j]];
2252         sum1 -= v1[j] *tmp0;
2253         sum2 -= v2[j] *tmp0;
2254         sum3 -= v3[j] *tmp0;
2255         sum4 -= v4[j] *tmp0;
2256       }
2257       sum2 -= v2[nz] * sum1;
2258       sum3 -= v3[nz] * sum1;
2259       sum4 -= v4[nz] * sum1;
2260       sum3 -= v3[nz+1] * sum2;
2261       sum4 -= v4[nz+1] * sum2;
2262       sum4 -= v4[nz+2] * sum3;
2263 
2264       tmp[row ++]=sum1;
2265       tmp[row ++]=sum2;
2266       tmp[row ++]=sum3;
2267       tmp[row ++]=sum4;
2268       break;
2269     case 5:
2270       sum1 = b[r[row]];
2271       sum2 = b[r[row+1]];
2272       sum3 = b[r[row+2]];
2273       sum4 = b[r[row+3]];
2274       sum5 = b[r[row+4]];
2275       v2   = aa + ai[row+1];
2276       v3   = aa + ai[row+2];
2277       v4   = aa + ai[row+3];
2278       v5   = aa + ai[row+4];
2279 
2280       for (j=0; j<nz-1; j+=2){
2281         i0   = vi[j];
2282         i1   = vi[j+1];
2283         tmp0 = tmps[i0];
2284         tmp1 = tmps[i1];
2285         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2286         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2287         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2288         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2289         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2290       }
2291       if (j == nz-1){
2292         tmp0 = tmps[vi[j]];
2293         sum1 -= v1[j] *tmp0;
2294         sum2 -= v2[j] *tmp0;
2295         sum3 -= v3[j] *tmp0;
2296         sum4 -= v4[j] *tmp0;
2297         sum5 -= v5[j] *tmp0;
2298       }
2299 
2300       sum2 -= v2[nz] * sum1;
2301       sum3 -= v3[nz] * sum1;
2302       sum4 -= v4[nz] * sum1;
2303       sum5 -= v5[nz] * sum1;
2304       sum3 -= v3[nz+1] * sum2;
2305       sum4 -= v4[nz+1] * sum2;
2306       sum5 -= v5[nz+1] * sum2;
2307       sum4 -= v4[nz+2] * sum3;
2308       sum5 -= v5[nz+2] * sum3;
2309       sum5 -= v5[nz+3] * sum4;
2310 
2311       tmp[row ++]=sum1;
2312       tmp[row ++]=sum2;
2313       tmp[row ++]=sum3;
2314       tmp[row ++]=sum4;
2315       tmp[row ++]=sum5;
2316       break;
2317     default:
2318       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2319     }
2320   }
2321   /* backward solve the upper triangular */
2322   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2323     nsz = ns[i];
2324     aii = ad[row+1] + 1;
2325     v1  = aa + aii;
2326     vi  = aj + aii;
2327     nz  = ad[row]- ad[row+1] - 1;
2328 
2329     if (i > 0) {
2330       /* Prefetch the indices for the next block */
2331       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2332       /* Prefetch the data for the next block */
2333       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2334     }
2335 
2336     switch (nsz){               /* Each loop in 'case' is unrolled */
2337     case 1 :
2338       sum1 = tmp[row];
2339 
2340       for(j=0 ; j<nz-1; j+=2){
2341         i0   = vi[j];
2342         i1   = vi[j+1];
2343         tmp0 = tmps[i0];
2344         tmp1 = tmps[i1];
2345         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2346       }
2347       if (j == nz-1){
2348         tmp0  = tmps[vi[j]];
2349         sum1 -= v1[j]*tmp0;
2350       }
2351       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2352       break;
2353     case 2 :
2354       sum1 = tmp[row];
2355       sum2 = tmp[row-1];
2356       v2   = aa + ad[row] + 1;
2357       for (j=0 ; j<nz-1; j+=2){
2358         i0   = vi[j];
2359         i1   = vi[j+1];
2360         tmp0 = tmps[i0];
2361         tmp1 = tmps[i1];
2362         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2363         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2364       }
2365       if (j == nz-1){
2366         tmp0  = tmps[vi[j]];
2367         sum1 -= v1[j]* tmp0;
2368         sum2 -= v2[j+1]* tmp0;
2369       }
2370 
2371       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2372       sum2   -= v2[0] * tmp0;
2373       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2374       break;
2375     case 3 :
2376       sum1 = tmp[row];
2377       sum2 = tmp[row -1];
2378       sum3 = tmp[row -2];
2379       v2   = aa + ad[row] + 1;
2380       v3   = aa + ad[row -1] + 1;
2381       for (j=0 ; j<nz-1; j+=2){
2382         i0   = vi[j];
2383         i1   = vi[j+1];
2384         tmp0 = tmps[i0];
2385         tmp1 = tmps[i1];
2386         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2387         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2388         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2389       }
2390       if (j== nz-1){
2391         tmp0  = tmps[vi[j]];
2392         sum1 -= v1[j] * tmp0;
2393         sum2 -= v2[j+1] * tmp0;
2394         sum3 -= v3[j+2] * tmp0;
2395       }
2396       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2397       sum2   -= v2[0]* tmp0;
2398       sum3   -= v3[1] * tmp0;
2399       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2400       sum3   -= v3[0]* tmp0;
2401       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2402 
2403       break;
2404     case 4 :
2405       sum1 = tmp[row];
2406       sum2 = tmp[row -1];
2407       sum3 = tmp[row -2];
2408       sum4 = tmp[row -3];
2409       v2   = aa + ad[row]+1;
2410       v3   = aa + ad[row -1]+1;
2411       v4   = aa + ad[row -2]+1;
2412 
2413       for (j=0 ; j<nz-1; j+=2){
2414         i0   = vi[j];
2415         i1   = vi[j+1];
2416         tmp0 = tmps[i0];
2417         tmp1 = tmps[i1];
2418         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2419         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2420         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2421         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2422       }
2423       if (j== nz-1){
2424         tmp0  = tmps[vi[j]];
2425         sum1 -= v1[j] * tmp0;
2426         sum2 -= v2[j+1] * tmp0;
2427         sum3 -= v3[j+2] * tmp0;
2428         sum4 -= v4[j+3] * tmp0;
2429       }
2430 
2431       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2432       sum2   -= v2[0] * tmp0;
2433       sum3   -= v3[1] * tmp0;
2434       sum4   -= v4[2] * tmp0;
2435       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2436       sum3   -= v3[0] * tmp0;
2437       sum4   -= v4[1] * tmp0;
2438       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2439       sum4   -= v4[0] * tmp0;
2440       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2441       break;
2442     case 5 :
2443       sum1 = tmp[row];
2444       sum2 = tmp[row -1];
2445       sum3 = tmp[row -2];
2446       sum4 = tmp[row -3];
2447       sum5 = tmp[row -4];
2448       v2   = aa + ad[row]+1;
2449       v3   = aa + ad[row -1]+1;
2450       v4   = aa + ad[row -2]+1;
2451       v5   = aa + ad[row -3]+1;
2452       for (j=0 ; j<nz-1; j+=2){
2453         i0   = vi[j];
2454         i1   = vi[j+1];
2455         tmp0 = tmps[i0];
2456         tmp1 = tmps[i1];
2457         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2458         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2459         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2460         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2461         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2462       }
2463       if (j==nz-1){
2464         tmp0  = tmps[vi[j]];
2465         sum1 -= v1[j] * tmp0;
2466         sum2 -= v2[j+1] * tmp0;
2467         sum3 -= v3[j+2] * tmp0;
2468         sum4 -= v4[j+3] * tmp0;
2469         sum5 -= v5[j+4] * tmp0;
2470       }
2471 
2472       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2473       sum2   -= v2[0] * tmp0;
2474       sum3   -= v3[1] * tmp0;
2475       sum4   -= v4[2] * tmp0;
2476       sum5   -= v5[3] * tmp0;
2477       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2478       sum3   -= v3[0] * tmp0;
2479       sum4   -= v4[1] * tmp0;
2480       sum5   -= v5[2] * tmp0;
2481       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2482       sum4   -= v4[0] * tmp0;
2483       sum5   -= v5[1] * tmp0;
2484       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2485       sum5   -= v5[0] * tmp0;
2486       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2487       break;
2488     default:
2489       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2490     }
2491   }
2492   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2493   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2494   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2495   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2496   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2497   PetscFunctionReturn(0);
2498 }
2499 
2500 
2501 /*
2502      Makes a longer coloring[] array and calls the usual code with that
2503 */
2504 #undef __FUNCT__
2505 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2506 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2507 {
2508   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2509   PetscErrorCode  ierr;
2510   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2511   PetscInt        *colorused,i;
2512   ISColoringValue *newcolor;
2513 
2514   PetscFunctionBegin;
2515   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2516   /* loop over inodes, marking a color for each column*/
2517   row = 0;
2518   for (i=0; i<m; i++){
2519     for (j=0; j<ns[i]; j++) {
2520       newcolor[row++] = coloring[i] + j*ncolors;
2521     }
2522   }
2523 
2524   /* eliminate unneeded colors */
2525   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2526   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2527   for (i=0; i<n; i++) {
2528     colorused[newcolor[i]] = 1;
2529   }
2530 
2531   for (i=1; i<5*ncolors; i++) {
2532     colorused[i] += colorused[i-1];
2533   }
2534   ncolors = colorused[5*ncolors-1];
2535   for (i=0; i<n; i++) {
2536     newcolor[i] = colorused[newcolor[i]]-1;
2537   }
2538   ierr = PetscFree(colorused);CHKERRQ(ierr);
2539   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2540   ierr = PetscFree(coloring);CHKERRQ(ierr);
2541   PetscFunctionReturn(0);
2542 }
2543 
2544 #include "../src/mat/blockinvert.h"
2545 
2546 #undef __FUNCT__
2547 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2548 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2549 {
2550   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2551   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2552   MatScalar          *ibdiag,*bdiag,work[25];
2553   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2554   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2555   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2556   PetscErrorCode     ierr;
2557   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2558   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2559 
2560   PetscFunctionBegin;
2561   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2562   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2563   if (its > 1) {
2564     /* switch to non-inode version */
2565     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2566     PetscFunctionReturn(0);
2567   }
2568 
2569   if (!a->inode.ibdiagvalid) {
2570     if (!a->inode.ibdiag) {
2571       /* calculate space needed for diagonal blocks */
2572       for (i=0; i<m; i++) {
2573 	cnt += sizes[i]*sizes[i];
2574       }
2575       a->inode.bdiagsize = cnt;
2576       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2577     }
2578 
2579     /* copy over the diagonal blocks and invert them */
2580     ibdiag = a->inode.ibdiag;
2581     bdiag  = a->inode.bdiag;
2582     cnt = 0;
2583     for (i=0, row = 0; i<m; i++) {
2584       for (j=0; j<sizes[i]; j++) {
2585         for (k=0; k<sizes[i]; k++) {
2586           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2587         }
2588       }
2589       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2590 
2591       switch(sizes[i]) {
2592         case 1:
2593           /* Create matrix data structure */
2594           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2595           ibdiag[cnt] = 1.0/ibdiag[cnt];
2596           break;
2597         case 2:
2598           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2599           break;
2600         case 3:
2601           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2602           break;
2603         case 4:
2604           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2605           break;
2606         case 5:
2607           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2608           break;
2609        default:
2610 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2611       }
2612       cnt += sizes[i]*sizes[i];
2613       row += sizes[i];
2614     }
2615     a->inode.ibdiagvalid = PETSC_TRUE;
2616   }
2617   ibdiag = a->inode.ibdiag;
2618   bdiag  = a->inode.bdiag;
2619 
2620 
2621   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2622   if (flag & SOR_ZERO_INITIAL_GUESS) {
2623     PetscScalar *b;
2624     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2625     if (xx != bb) {
2626       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2627     } else {
2628       b = x;
2629     }
2630     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2631 
2632       for (i=0, row=0; i<m; i++) {
2633         sz  = diag[row] - ii[row];
2634         v1  = a->a + ii[row];
2635         idx = a->j + ii[row];
2636 
2637         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2638         switch (sizes[i]){
2639           case 1:
2640 
2641             sum1  = b[row];
2642             for(n = 0; n<sz-1; n+=2) {
2643               i1   = idx[0];
2644               i2   = idx[1];
2645               idx += 2;
2646               tmp0 = x[i1];
2647               tmp1 = x[i2];
2648               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2649             }
2650 
2651             if (n == sz-1){
2652               tmp0  = x[*idx];
2653               sum1 -= *v1 * tmp0;
2654             }
2655             x[row++] = sum1*(*ibdiag++);
2656             break;
2657           case 2:
2658             v2    = a->a + ii[row+1];
2659             sum1  = b[row];
2660             sum2  = b[row+1];
2661             for(n = 0; n<sz-1; n+=2) {
2662               i1   = idx[0];
2663               i2   = idx[1];
2664               idx += 2;
2665               tmp0 = x[i1];
2666               tmp1 = x[i2];
2667               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2668               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2669             }
2670 
2671             if (n == sz-1){
2672               tmp0  = x[*idx];
2673               sum1 -= v1[0] * tmp0;
2674               sum2 -= v2[0] * tmp0;
2675             }
2676             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2677             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2678             ibdiag  += 4;
2679             break;
2680           case 3:
2681             v2    = a->a + ii[row+1];
2682             v3    = a->a + ii[row+2];
2683             sum1  = b[row];
2684             sum2  = b[row+1];
2685             sum3  = b[row+2];
2686             for(n = 0; n<sz-1; n+=2) {
2687               i1   = idx[0];
2688               i2   = idx[1];
2689               idx += 2;
2690               tmp0 = x[i1];
2691               tmp1 = x[i2];
2692               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2693               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2694               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2695             }
2696 
2697             if (n == sz-1){
2698               tmp0  = x[*idx];
2699               sum1 -= v1[0] * tmp0;
2700               sum2 -= v2[0] * tmp0;
2701               sum3 -= v3[0] * tmp0;
2702             }
2703             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2704             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2705             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2706             ibdiag  += 9;
2707             break;
2708           case 4:
2709             v2    = a->a + ii[row+1];
2710             v3    = a->a + ii[row+2];
2711             v4    = a->a + ii[row+3];
2712             sum1  = b[row];
2713             sum2  = b[row+1];
2714             sum3  = b[row+2];
2715             sum4  = b[row+3];
2716             for(n = 0; n<sz-1; n+=2) {
2717               i1   = idx[0];
2718               i2   = idx[1];
2719               idx += 2;
2720               tmp0 = x[i1];
2721               tmp1 = x[i2];
2722               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2723               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2724               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2725               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2726             }
2727 
2728             if (n == sz-1){
2729               tmp0  = x[*idx];
2730               sum1 -= v1[0] * tmp0;
2731               sum2 -= v2[0] * tmp0;
2732               sum3 -= v3[0] * tmp0;
2733               sum4 -= v4[0] * tmp0;
2734             }
2735             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2736             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2737             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2738             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2739             ibdiag  += 16;
2740             break;
2741           case 5:
2742             v2    = a->a + ii[row+1];
2743             v3    = a->a + ii[row+2];
2744             v4    = a->a + ii[row+3];
2745             v5    = a->a + ii[row+4];
2746             sum1  = b[row];
2747             sum2  = b[row+1];
2748             sum3  = b[row+2];
2749             sum4  = b[row+3];
2750             sum5  = b[row+4];
2751             for(n = 0; n<sz-1; n+=2) {
2752               i1   = idx[0];
2753               i2   = idx[1];
2754               idx += 2;
2755               tmp0 = x[i1];
2756               tmp1 = x[i2];
2757               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2758               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2759               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2760               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2761               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2762             }
2763 
2764             if (n == sz-1){
2765               tmp0  = x[*idx];
2766               sum1 -= v1[0] * tmp0;
2767               sum2 -= v2[0] * tmp0;
2768               sum3 -= v3[0] * tmp0;
2769               sum4 -= v4[0] * tmp0;
2770               sum5 -= v5[0] * tmp0;
2771             }
2772             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2773             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2774             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2775             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2776             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2777             ibdiag  += 25;
2778             break;
2779 	  default:
2780    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2781         }
2782       }
2783 
2784       xb = x;
2785       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2786     } else xb = b;
2787     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2788         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2789       cnt = 0;
2790       for (i=0, row=0; i<m; i++) {
2791 
2792         switch (sizes[i]){
2793           case 1:
2794             x[row++] *= bdiag[cnt++];
2795             break;
2796           case 2:
2797             x1   = x[row]; x2 = x[row+1];
2798             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2799             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2800             x[row++] = tmp1;
2801             x[row++] = tmp2;
2802             cnt += 4;
2803             break;
2804           case 3:
2805             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2806             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2807             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2808             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2809             x[row++] = tmp1;
2810             x[row++] = tmp2;
2811             x[row++] = tmp3;
2812             cnt += 9;
2813             break;
2814           case 4:
2815             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2816             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2817             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2818             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2819             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2820             x[row++] = tmp1;
2821             x[row++] = tmp2;
2822             x[row++] = tmp3;
2823             x[row++] = tmp4;
2824             cnt += 16;
2825             break;
2826           case 5:
2827             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2828             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2829             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2830             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2831             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2832             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2833             x[row++] = tmp1;
2834             x[row++] = tmp2;
2835             x[row++] = tmp3;
2836             x[row++] = tmp4;
2837             x[row++] = tmp5;
2838             cnt += 25;
2839             break;
2840 	  default:
2841    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2842         }
2843       }
2844       ierr = PetscLogFlops(m);CHKERRQ(ierr);
2845     }
2846     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
2847 
2848       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2849       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2850         ibdiag -= sizes[i]*sizes[i];
2851         sz      = ii[row+1] - diag[row] - 1;
2852         v1      = a->a + diag[row] + 1;
2853         idx     = a->j + diag[row] + 1;
2854 
2855         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2856         switch (sizes[i]){
2857           case 1:
2858 
2859             sum1  = xb[row];
2860             for(n = 0; n<sz-1; n+=2) {
2861               i1   = idx[0];
2862               i2   = idx[1];
2863               idx += 2;
2864               tmp0 = x[i1];
2865               tmp1 = x[i2];
2866               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2867             }
2868 
2869             if (n == sz-1){
2870               tmp0  = x[*idx];
2871               sum1 -= *v1*tmp0;
2872             }
2873             x[row--] = sum1*(*ibdiag);
2874             break;
2875 
2876           case 2:
2877 
2878             sum1  = xb[row];
2879             sum2  = xb[row-1];
2880             /* note that sum1 is associated with the second of the two rows */
2881             v2    = a->a + diag[row-1] + 2;
2882             for(n = 0; n<sz-1; n+=2) {
2883               i1   = idx[0];
2884               i2   = idx[1];
2885               idx += 2;
2886               tmp0 = x[i1];
2887               tmp1 = x[i2];
2888               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2889               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2890             }
2891 
2892             if (n == sz-1){
2893               tmp0  = x[*idx];
2894               sum1 -= *v1*tmp0;
2895               sum2 -= *v2*tmp0;
2896             }
2897             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
2898             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2899             break;
2900           case 3:
2901 
2902             sum1  = xb[row];
2903             sum2  = xb[row-1];
2904             sum3  = xb[row-2];
2905             v2    = a->a + diag[row-1] + 2;
2906             v3    = a->a + diag[row-2] + 3;
2907             for(n = 0; n<sz-1; n+=2) {
2908               i1   = idx[0];
2909               i2   = idx[1];
2910               idx += 2;
2911               tmp0 = x[i1];
2912               tmp1 = x[i2];
2913               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2914               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2915               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2916             }
2917 
2918             if (n == sz-1){
2919               tmp0  = x[*idx];
2920               sum1 -= *v1*tmp0;
2921               sum2 -= *v2*tmp0;
2922               sum3 -= *v3*tmp0;
2923             }
2924             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2925             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2926             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2927             break;
2928           case 4:
2929 
2930             sum1  = xb[row];
2931             sum2  = xb[row-1];
2932             sum3  = xb[row-2];
2933             sum4  = xb[row-3];
2934             v2    = a->a + diag[row-1] + 2;
2935             v3    = a->a + diag[row-2] + 3;
2936             v4    = a->a + diag[row-3] + 4;
2937             for(n = 0; n<sz-1; n+=2) {
2938               i1   = idx[0];
2939               i2   = idx[1];
2940               idx += 2;
2941               tmp0 = x[i1];
2942               tmp1 = x[i2];
2943               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2944               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2945               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2946               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2947             }
2948 
2949             if (n == sz-1){
2950               tmp0  = x[*idx];
2951               sum1 -= *v1*tmp0;
2952               sum2 -= *v2*tmp0;
2953               sum3 -= *v3*tmp0;
2954               sum4 -= *v4*tmp0;
2955             }
2956             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2957             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2958             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2959             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2960             break;
2961           case 5:
2962 
2963             sum1  = xb[row];
2964             sum2  = xb[row-1];
2965             sum3  = xb[row-2];
2966             sum4  = xb[row-3];
2967             sum5  = xb[row-4];
2968             v2    = a->a + diag[row-1] + 2;
2969             v3    = a->a + diag[row-2] + 3;
2970             v4    = a->a + diag[row-3] + 4;
2971             v5    = a->a + diag[row-4] + 5;
2972             for(n = 0; n<sz-1; n+=2) {
2973               i1   = idx[0];
2974               i2   = idx[1];
2975               idx += 2;
2976               tmp0 = x[i1];
2977               tmp1 = x[i2];
2978               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2979               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2980               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2981               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2982               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2983             }
2984 
2985             if (n == sz-1){
2986               tmp0  = x[*idx];
2987               sum1 -= *v1*tmp0;
2988               sum2 -= *v2*tmp0;
2989               sum3 -= *v3*tmp0;
2990               sum4 -= *v4*tmp0;
2991               sum5 -= *v5*tmp0;
2992             }
2993             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2994             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2995             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2996             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2997             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2998             break;
2999 	  default:
3000    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3001         }
3002       }
3003 
3004       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3005     }
3006     its--;
3007     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3008     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
3009   }
3010   if (flag & SOR_EISENSTAT) {
3011     const PetscScalar *b;
3012     MatScalar         *t = a->inode.ssor_work;
3013 
3014     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3015     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3016     /*
3017           Apply  (U + D)^-1  where D is now the block diagonal
3018     */
3019     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3020     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3021       ibdiag -= sizes[i]*sizes[i];
3022       sz      = ii[row+1] - diag[row] - 1;
3023       v1      = a->a + diag[row] + 1;
3024       idx     = a->j + diag[row] + 1;
3025       CHKMEMQ;
3026       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3027       switch (sizes[i]){
3028         case 1:
3029 
3030 	  sum1  = b[row];
3031 	  for(n = 0; n<sz-1; n+=2) {
3032 	    i1   = idx[0];
3033 	    i2   = idx[1];
3034 	    idx += 2;
3035 	    tmp0 = x[i1];
3036 	    tmp1 = x[i2];
3037 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3038 	  }
3039 
3040 	  if (n == sz-1){
3041 	    tmp0  = x[*idx];
3042 	    sum1 -= *v1*tmp0;
3043 	  }
3044 	  x[row] = sum1*(*ibdiag);row--;
3045 	  break;
3046 
3047         case 2:
3048 
3049 	  sum1  = b[row];
3050 	  sum2  = b[row-1];
3051 	  /* note that sum1 is associated with the second of the two rows */
3052 	  v2    = a->a + diag[row-1] + 2;
3053 	  for(n = 0; n<sz-1; n+=2) {
3054 	    i1   = idx[0];
3055 	    i2   = idx[1];
3056 	    idx += 2;
3057 	    tmp0 = x[i1];
3058 	    tmp1 = x[i2];
3059 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3060 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3061 	  }
3062 
3063 	  if (n == sz-1){
3064 	    tmp0  = x[*idx];
3065 	    sum1 -= *v1*tmp0;
3066 	    sum2 -= *v2*tmp0;
3067 	  }
3068 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3069 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3070           row -= 2;
3071 	  break;
3072         case 3:
3073 
3074 	  sum1  = b[row];
3075 	  sum2  = b[row-1];
3076 	  sum3  = b[row-2];
3077 	  v2    = a->a + diag[row-1] + 2;
3078 	  v3    = a->a + diag[row-2] + 3;
3079 	  for(n = 0; n<sz-1; n+=2) {
3080 	    i1   = idx[0];
3081 	    i2   = idx[1];
3082 	    idx += 2;
3083 	    tmp0 = x[i1];
3084 	    tmp1 = x[i2];
3085 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3086 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3087 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3088 	  }
3089 
3090 	  if (n == sz-1){
3091 	    tmp0  = x[*idx];
3092 	    sum1 -= *v1*tmp0;
3093 	    sum2 -= *v2*tmp0;
3094 	    sum3 -= *v3*tmp0;
3095 	  }
3096 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3097 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3098 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3099           row -= 3;
3100 	  break;
3101         case 4:
3102 
3103 	  sum1  = b[row];
3104 	  sum2  = b[row-1];
3105 	  sum3  = b[row-2];
3106 	  sum4  = b[row-3];
3107 	  v2    = a->a + diag[row-1] + 2;
3108 	  v3    = a->a + diag[row-2] + 3;
3109 	  v4    = a->a + diag[row-3] + 4;
3110 	  for(n = 0; n<sz-1; n+=2) {
3111 	    i1   = idx[0];
3112 	    i2   = idx[1];
3113 	    idx += 2;
3114 	    tmp0 = x[i1];
3115 	    tmp1 = x[i2];
3116 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3117 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3118 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3119 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3120 	  }
3121 
3122 	  if (n == sz-1){
3123 	    tmp0  = x[*idx];
3124 	    sum1 -= *v1*tmp0;
3125 	    sum2 -= *v2*tmp0;
3126 	    sum3 -= *v3*tmp0;
3127 	    sum4 -= *v4*tmp0;
3128 	  }
3129 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3130 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3131 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3132 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3133           row -= 4;
3134 	  break;
3135         case 5:
3136 
3137 	  sum1  = b[row];
3138 	  sum2  = b[row-1];
3139 	  sum3  = b[row-2];
3140 	  sum4  = b[row-3];
3141 	  sum5  = b[row-4];
3142 	  v2    = a->a + diag[row-1] + 2;
3143 	  v3    = a->a + diag[row-2] + 3;
3144 	  v4    = a->a + diag[row-3] + 4;
3145 	  v5    = a->a + diag[row-4] + 5;
3146 	  for(n = 0; n<sz-1; n+=2) {
3147 	    i1   = idx[0];
3148 	    i2   = idx[1];
3149 	    idx += 2;
3150 	    tmp0 = x[i1];
3151 	    tmp1 = x[i2];
3152 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3153 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3154 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3155 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3156 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3157 	  }
3158 
3159 	  if (n == sz-1){
3160 	    tmp0  = x[*idx];
3161 	    sum1 -= *v1*tmp0;
3162 	    sum2 -= *v2*tmp0;
3163 	    sum3 -= *v3*tmp0;
3164 	    sum4 -= *v4*tmp0;
3165 	    sum5 -= *v5*tmp0;
3166 	  }
3167 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3168 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3169 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3170 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3171 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3172           row -= 5;
3173 	  break;
3174         default:
3175 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3176       }
3177       CHKMEMQ;
3178     }
3179     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3180 
3181     /*
3182            t = b - D x    where D is the block diagonal
3183     */
3184     cnt = 0;
3185     for (i=0, row=0; i<m; i++) {
3186       CHKMEMQ;
3187       switch (sizes[i]){
3188         case 1:
3189 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3190 	  break;
3191         case 2:
3192 	  x1   = x[row]; x2 = x[row+1];
3193 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3194 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3195 	  t[row]   = b[row] - tmp1;
3196 	  t[row+1] = b[row+1] - tmp2; row += 2;
3197 	  cnt += 4;
3198 	  break;
3199         case 3:
3200 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3201 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3202 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3203 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3204 	  t[row] = b[row] - tmp1;
3205 	  t[row+1] = b[row+1] - tmp2;
3206 	  t[row+2] = b[row+2] - tmp3; row += 3;
3207 	  cnt += 9;
3208 	  break;
3209         case 4:
3210 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3211 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3212 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3213 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3214 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3215 	  t[row] = b[row] - tmp1;
3216 	  t[row+1] = b[row+1] - tmp2;
3217 	  t[row+2] = b[row+2] - tmp3;
3218 	  t[row+3] = b[row+3] - tmp4; row += 4;
3219 	  cnt += 16;
3220 	  break;
3221         case 5:
3222 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3223 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3224 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3225 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3226 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3227 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3228 	  t[row] = b[row] - tmp1;
3229 	  t[row+1] = b[row+1] - tmp2;
3230 	  t[row+2] = b[row+2] - tmp3;
3231 	  t[row+3] = b[row+3] - tmp4;
3232 	  t[row+4] = b[row+4] - tmp5;row += 5;
3233 	  cnt += 25;
3234 	  break;
3235         default:
3236 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3237       }
3238       CHKMEMQ;
3239     }
3240     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3241 
3242 
3243 
3244     /*
3245           Apply (L + D)^-1 where D is the block diagonal
3246     */
3247     for (i=0, row=0; i<m; i++) {
3248       sz  = diag[row] - ii[row];
3249       v1  = a->a + ii[row];
3250       idx = a->j + ii[row];
3251       CHKMEMQ;
3252       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3253       switch (sizes[i]){
3254         case 1:
3255 
3256 	  sum1  = t[row];
3257 	  for(n = 0; n<sz-1; n+=2) {
3258 	    i1   = idx[0];
3259 	    i2   = idx[1];
3260 	    idx += 2;
3261 	    tmp0 = t[i1];
3262 	    tmp1 = t[i2];
3263 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3264 	  }
3265 
3266 	  if (n == sz-1){
3267 	    tmp0  = t[*idx];
3268 	    sum1 -= *v1 * tmp0;
3269 	  }
3270 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3271 	  break;
3272         case 2:
3273 	  v2    = a->a + ii[row+1];
3274 	  sum1  = t[row];
3275 	  sum2  = t[row+1];
3276 	  for(n = 0; n<sz-1; n+=2) {
3277 	    i1   = idx[0];
3278 	    i2   = idx[1];
3279 	    idx += 2;
3280 	    tmp0 = t[i1];
3281 	    tmp1 = t[i2];
3282 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3283 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3284 	  }
3285 
3286 	  if (n == sz-1){
3287 	    tmp0  = t[*idx];
3288 	    sum1 -= v1[0] * tmp0;
3289 	    sum2 -= v2[0] * tmp0;
3290 	  }
3291 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3292 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3293 	  ibdiag  += 4; row += 2;
3294 	  break;
3295         case 3:
3296 	  v2    = a->a + ii[row+1];
3297 	  v3    = a->a + ii[row+2];
3298 	  sum1  = t[row];
3299 	  sum2  = t[row+1];
3300 	  sum3  = t[row+2];
3301 	  for(n = 0; n<sz-1; n+=2) {
3302 	    i1   = idx[0];
3303 	    i2   = idx[1];
3304 	    idx += 2;
3305 	    tmp0 = t[i1];
3306 	    tmp1 = t[i2];
3307 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3308 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3309 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3310 	  }
3311 
3312 	  if (n == sz-1){
3313 	    tmp0  = t[*idx];
3314 	    sum1 -= v1[0] * tmp0;
3315 	    sum2 -= v2[0] * tmp0;
3316 	    sum3 -= v3[0] * tmp0;
3317 	  }
3318 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3319 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3320 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3321 	  ibdiag  += 9; row += 3;
3322 	  break;
3323         case 4:
3324 	  v2    = a->a + ii[row+1];
3325 	  v3    = a->a + ii[row+2];
3326 	  v4    = a->a + ii[row+3];
3327 	  sum1  = t[row];
3328 	  sum2  = t[row+1];
3329 	  sum3  = t[row+2];
3330 	  sum4  = t[row+3];
3331 	  for(n = 0; n<sz-1; n+=2) {
3332 	    i1   = idx[0];
3333 	    i2   = idx[1];
3334 	    idx += 2;
3335 	    tmp0 = t[i1];
3336 	    tmp1 = t[i2];
3337 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3338 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3339 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3340 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3341 	  }
3342 
3343 	  if (n == sz-1){
3344 	    tmp0  = t[*idx];
3345 	    sum1 -= v1[0] * tmp0;
3346 	    sum2 -= v2[0] * tmp0;
3347 	    sum3 -= v3[0] * tmp0;
3348 	    sum4 -= v4[0] * tmp0;
3349 	  }
3350 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3351 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3352 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3353 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3354 	  ibdiag  += 16; row += 4;
3355 	  break;
3356         case 5:
3357 	  v2    = a->a + ii[row+1];
3358 	  v3    = a->a + ii[row+2];
3359 	  v4    = a->a + ii[row+3];
3360 	  v5    = a->a + ii[row+4];
3361 	  sum1  = t[row];
3362 	  sum2  = t[row+1];
3363 	  sum3  = t[row+2];
3364 	  sum4  = t[row+3];
3365 	  sum5  = t[row+4];
3366 	  for(n = 0; n<sz-1; n+=2) {
3367 	    i1   = idx[0];
3368 	    i2   = idx[1];
3369 	    idx += 2;
3370 	    tmp0 = t[i1];
3371 	    tmp1 = t[i2];
3372 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3373 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3374 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3375 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3376 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3377 	  }
3378 
3379 	  if (n == sz-1){
3380 	    tmp0  = t[*idx];
3381 	    sum1 -= v1[0] * tmp0;
3382 	    sum2 -= v2[0] * tmp0;
3383 	    sum3 -= v3[0] * tmp0;
3384 	    sum4 -= v4[0] * tmp0;
3385 	    sum5 -= v5[0] * tmp0;
3386 	  }
3387 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3388 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3389 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3390 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3391 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3392 	  ibdiag  += 25; row += 5;
3393 	  break;
3394         default:
3395 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3396       }
3397       CHKMEMQ;
3398     }
3399     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3400     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3401     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3402   }
3403   PetscFunctionReturn(0);
3404 }
3405 
3406 #undef __FUNCT__
3407 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3408 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3409 {
3410   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3411   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3412   const MatScalar    *bdiag = a->inode.bdiag;
3413   const PetscScalar  *b;
3414   PetscErrorCode      ierr;
3415   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3416   const PetscInt      *sizes = a->inode.size;
3417 
3418   PetscFunctionBegin;
3419   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3420   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3421   cnt = 0;
3422   for (i=0, row=0; i<m; i++) {
3423     switch (sizes[i]){
3424       case 1:
3425 	x[row] = b[row]*bdiag[cnt++];row++;
3426 	break;
3427       case 2:
3428 	x1   = b[row]; x2 = b[row+1];
3429 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3430 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3431 	x[row++] = tmp1;
3432 	x[row++] = tmp2;
3433 	cnt += 4;
3434 	break;
3435       case 3:
3436 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3437 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3438 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3439 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3440 	x[row++] = tmp1;
3441 	x[row++] = tmp2;
3442 	x[row++] = tmp3;
3443 	cnt += 9;
3444 	break;
3445       case 4:
3446 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3447 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3448 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3449 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3450 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3451 	x[row++] = tmp1;
3452 	x[row++] = tmp2;
3453 	x[row++] = tmp3;
3454 	x[row++] = tmp4;
3455 	cnt += 16;
3456 	break;
3457       case 5:
3458 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3459 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3460 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3461 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3462 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3463 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3464 	x[row++] = tmp1;
3465 	x[row++] = tmp2;
3466 	x[row++] = tmp3;
3467 	x[row++] = tmp4;
3468 	x[row++] = tmp5;
3469 	cnt += 25;
3470 	break;
3471       default:
3472 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3473     }
3474   }
3475   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3476   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3477   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3478   PetscFunctionReturn(0);
3479 }
3480 
3481 /*
3482     samestructure indicates that the matrix has not changed its nonzero structure so we
3483     do not need to recompute the inodes
3484 */
3485 #undef __FUNCT__
3486 #define __FUNCT__ "Mat_CheckInode"
3487 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3488 {
3489   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3490   PetscErrorCode ierr;
3491   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3492   PetscTruth     flag;
3493 
3494   PetscFunctionBegin;
3495   if (!a->inode.use)                     PetscFunctionReturn(0);
3496   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3497 
3498 
3499   m = A->rmap->n;
3500   if (a->inode.size) {ns = a->inode.size;}
3501   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3502 
3503   i          = 0;
3504   node_count = 0;
3505   idx        = a->j;
3506   ii         = a->i;
3507   while (i < m){                /* For each row */
3508     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3509     /* Limits the number of elements in a node to 'a->inode.limit' */
3510     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3511       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3512       if (nzy != nzx) break;
3513       idy  += nzx;             /* Same nonzero pattern */
3514       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3515       if (!flag) break;
3516     }
3517     ns[node_count++] = blk_size;
3518     idx += blk_size*nzx;
3519     i    = j;
3520   }
3521   /* If not enough inodes found,, do not use inode version of the routines */
3522   if (!a->inode.size && m && node_count > .9*m) {
3523     ierr = PetscFree(ns);CHKERRQ(ierr);
3524     a->inode.node_count     = 0;
3525     a->inode.size           = PETSC_NULL;
3526     a->inode.use            = PETSC_FALSE;
3527     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3528   } else {
3529     if (!A->factor) {
3530       A->ops->mult              = MatMult_SeqAIJ_Inode;
3531       A->ops->sor               = MatSOR_SeqAIJ_Inode;
3532       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3533       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3534       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3535       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3536       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3537       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3538       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3539     } else {
3540       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
3541     }
3542     a->inode.node_count       = node_count;
3543     a->inode.size             = ns;
3544     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3545   }
3546   PetscFunctionReturn(0);
3547 }
3548 
3549 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3550 PetscInt __k, *__vi; \
3551 __vi = aj + ai[row];				\
3552 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3553 __vi = aj + adiag[row];				\
3554 cols[nzl] = __vi[0];\
3555 __vi = aj + adiag[row+1]+1;\
3556 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3557 
3558 
3559 /*
3560    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3561    Modified from Mat_CheckInode().
3562 
3563    Input Parameters:
3564 +  Mat A - ILU or LU matrix factor
3565 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3566     do not need to recompute the inodes
3567 */
3568 #undef __FUNCT__
3569 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3570 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3571 {
3572   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3573   PetscErrorCode ierr;
3574   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3575   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3576   PetscTruth     flag;
3577 
3578   PetscFunctionBegin;
3579   if (!a->inode.use)                     PetscFunctionReturn(0);
3580   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3581 
3582   m = A->rmap->n;
3583   if (a->inode.size) {ns = a->inode.size;}
3584   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3585 
3586   i          = 0;
3587   node_count = 0;
3588   while (i < m){                /* For each row */
3589     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3590     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3591     nzx  = nzl1 + nzu1 + 1;
3592     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3593     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3594 
3595     /* Limits the number of elements in a node to 'a->inode.limit' */
3596     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3597       nzl2    = ai[j+1] - ai[j];
3598       nzu2    = adiag[j] - adiag[j+1] - 1;
3599       nzy     = nzl2 + nzu2 + 1;
3600       if( nzy != nzx) break;
3601       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3602       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3603       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3604       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3605       ierr = PetscFree(cols2);CHKERRQ(ierr);
3606     }
3607     ns[node_count++] = blk_size;
3608     ierr = PetscFree(cols1);CHKERRQ(ierr);
3609     i    = j;
3610   }
3611   /* If not enough inodes found,, do not use inode version of the routines */
3612   if (!a->inode.size && m && node_count > .9*m) {
3613     ierr = PetscFree(ns);CHKERRQ(ierr);
3614     a->inode.node_count     = 0;
3615     a->inode.size           = PETSC_NULL;
3616     a->inode.use            = PETSC_FALSE;
3617     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3618   } else {
3619     A->ops->mult              = 0;
3620     A->ops->sor               = 0;
3621     A->ops->multadd           = 0;
3622     A->ops->getrowij          = 0;
3623     A->ops->restorerowij      = 0;
3624     A->ops->getcolumnij       = 0;
3625     A->ops->restorecolumnij   = 0;
3626     A->ops->coloringpatch     = 0;
3627     A->ops->multdiagonalblock = 0;
3628     A->ops->solve             = MatSolve_SeqAIJ_Inode;
3629     a->inode.node_count       = node_count;
3630     a->inode.size             = ns;
3631     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3632   }
3633   PetscFunctionReturn(0);
3634 }
3635 
3636 /*
3637      This is really ugly. if inodes are used this replaces the
3638   permutations with ones that correspond to rows/cols of the matrix
3639   rather then inode blocks
3640 */
3641 #undef __FUNCT__
3642 #define __FUNCT__ "MatInodeAdjustForInodes"
3643 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3644 {
3645   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3646 
3647   PetscFunctionBegin;
3648   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3649   if (f) {
3650     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3651   }
3652   PetscFunctionReturn(0);
3653 }
3654 
3655 EXTERN_C_BEGIN
3656 #undef __FUNCT__
3657 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3658 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3659 {
3660   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3661   PetscErrorCode ierr;
3662   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3663   const PetscInt *ridx,*cidx;
3664   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3665   PetscInt       nslim_col,*ns_col;
3666   IS             ris = *rperm,cis = *cperm;
3667 
3668   PetscFunctionBegin;
3669   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3670   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3671 
3672   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3673   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3674   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3675 
3676   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3677   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3678 
3679   /* Form the inode structure for the rows of permuted matric using inv perm*/
3680   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3681 
3682   /* Construct the permutations for rows*/
3683   for (i=0,row = 0; i<nslim_row; ++i){
3684     indx      = ridx[i];
3685     start_val = tns[indx];
3686     end_val   = tns[indx + 1];
3687     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3688   }
3689 
3690   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3691   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3692 
3693  /* Construct permutations for columns */
3694   for (i=0,col=0; i<nslim_col; ++i){
3695     indx      = cidx[i];
3696     start_val = tns[indx];
3697     end_val   = tns[indx + 1];
3698     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3699   }
3700 
3701   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3702   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3703   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3704   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3705 
3706   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3707   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3708 
3709   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3710   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3711   ierr = ISDestroy(cis);CHKERRQ(ierr);
3712   ierr = ISDestroy(ris);CHKERRQ(ierr);
3713   ierr = PetscFree(tns);CHKERRQ(ierr);
3714   PetscFunctionReturn(0);
3715 }
3716 EXTERN_C_END
3717 
3718 #undef __FUNCT__
3719 #define __FUNCT__ "MatInodeGetInodeSizes"
3720 /*@C
3721    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3722 
3723    Collective on Mat
3724 
3725    Input Parameter:
3726 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3727 
3728    Output Parameter:
3729 +  node_count - no of inodes present in the matrix.
3730 .  sizes      - an array of size node_count,with sizes of each inode.
3731 -  limit      - the max size used to generate the inodes.
3732 
3733    Level: advanced
3734 
3735    Notes: This routine returns some internal storage information
3736    of the matrix, it is intended to be used by advanced users.
3737    It should be called after the matrix is assembled.
3738    The contents of the sizes[] array should not be changed.
3739    PETSC_NULL may be passed for information not requested.
3740 
3741 .keywords: matrix, seqaij, get, inode
3742 
3743 .seealso: MatGetInfo()
3744 @*/
3745 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3746 {
3747   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3748 
3749   PetscFunctionBegin;
3750   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3751   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3752   if (f) {
3753     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3754   }
3755   PetscFunctionReturn(0);
3756 }
3757 
3758 EXTERN_C_BEGIN
3759 #undef __FUNCT__
3760 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3761 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3762 {
3763   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3764 
3765   PetscFunctionBegin;
3766   if (node_count) *node_count = a->inode.node_count;
3767   if (sizes)      *sizes      = a->inode.size;
3768   if (limit)      *limit      = a->inode.limit;
3769   PetscFunctionReturn(0);
3770 }
3771 EXTERN_C_END
3772