xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 3b644628e35efb5e641ba08a0b4892a0bd16ccb9) !
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap.n;
19   m      = A->rmap.n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap.n;
72   n         = A->cmap.n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap.n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap.n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_Inode"
397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x,*v1,*v2,*v3,*v4,*v5;
403   PetscErrorCode    ierr;
404   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
405 
406 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
407 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
408 #endif
409 
410   PetscFunctionBegin;
411   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
412   node_max = a->inode.node_count;
413   ns       = a->inode.size;     /* Node Size array */
414   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
415   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
416   idx  = a->j;
417   v1   = a->a;
418   ii   = a->i;
419 
420   for (i = 0,row = 0; i< node_max; ++i){
421     nsz  = ns[i];
422     n    = ii[1] - ii[0];
423     nonzerorow += (n>0)*nsz;
424     ii  += nsz;
425     sz   = n;                   /* No of non zeros in this row */
426                                 /* Switch on the size of Node */
427     switch (nsz){               /* Each loop in 'case' is unrolled */
428     case 1 :
429       sum1  = 0;
430 
431       for(n = 0; n< sz-1; n+=2) {
432         i1   = idx[0];          /* The instructions are ordered to */
433         i2   = idx[1];          /* make the compiler's job easy */
434         idx += 2;
435         tmp0 = x[i1];
436         tmp1 = x[i2];
437         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
438        }
439 
440       if (n == sz-1){          /* Take care of the last nonzero  */
441         tmp0  = x[*idx++];
442         sum1 += *v1++ * tmp0;
443       }
444       y[row++]=sum1;
445       break;
446     case 2:
447       sum1  = 0;
448       sum2  = 0;
449       v2    = v1 + n;
450 
451       for (n = 0; n< sz-1; n+=2) {
452         i1   = idx[0];
453         i2   = idx[1];
454         idx += 2;
455         tmp0 = x[i1];
456         tmp1 = x[i2];
457         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
458         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
459       }
460       if (n == sz-1){
461         tmp0  = x[*idx++];
462         sum1 += *v1++ * tmp0;
463         sum2 += *v2++ * tmp0;
464       }
465       y[row++]=sum1;
466       y[row++]=sum2;
467       v1      =v2;              /* Since the next block to be processed starts there*/
468       idx    +=sz;
469       break;
470     case 3:
471       sum1  = 0;
472       sum2  = 0;
473       sum3  = 0;
474       v2    = v1 + n;
475       v3    = v2 + n;
476 
477       for (n = 0; n< sz-1; n+=2) {
478         i1   = idx[0];
479         i2   = idx[1];
480         idx += 2;
481         tmp0 = x[i1];
482         tmp1 = x[i2];
483         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
484         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
485         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
486       }
487       if (n == sz-1){
488         tmp0  = x[*idx++];
489         sum1 += *v1++ * tmp0;
490         sum2 += *v2++ * tmp0;
491         sum3 += *v3++ * tmp0;
492       }
493       y[row++]=sum1;
494       y[row++]=sum2;
495       y[row++]=sum3;
496       v1       =v3;             /* Since the next block to be processed starts there*/
497       idx     +=2*sz;
498       break;
499     case 4:
500       sum1  = 0;
501       sum2  = 0;
502       sum3  = 0;
503       sum4  = 0;
504       v2    = v1 + n;
505       v3    = v2 + n;
506       v4    = v3 + n;
507 
508       for (n = 0; n< sz-1; n+=2) {
509         i1   = idx[0];
510         i2   = idx[1];
511         idx += 2;
512         tmp0 = x[i1];
513         tmp1 = x[i2];
514         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
515         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
516         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
517         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
518       }
519       if (n == sz-1){
520         tmp0  = x[*idx++];
521         sum1 += *v1++ * tmp0;
522         sum2 += *v2++ * tmp0;
523         sum3 += *v3++ * tmp0;
524         sum4 += *v4++ * tmp0;
525       }
526       y[row++]=sum1;
527       y[row++]=sum2;
528       y[row++]=sum3;
529       y[row++]=sum4;
530       v1      =v4;              /* Since the next block to be processed starts there*/
531       idx    +=3*sz;
532       break;
533     case 5:
534       sum1  = 0;
535       sum2  = 0;
536       sum3  = 0;
537       sum4  = 0;
538       sum5  = 0;
539       v2    = v1 + n;
540       v3    = v2 + n;
541       v4    = v3 + n;
542       v5    = v4 + n;
543 
544       for (n = 0; n<sz-1; n+=2) {
545         i1   = idx[0];
546         i2   = idx[1];
547         idx += 2;
548         tmp0 = x[i1];
549         tmp1 = x[i2];
550         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
551         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
552         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
553         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
554         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
555       }
556       if (n == sz-1){
557         tmp0  = x[*idx++];
558         sum1 += *v1++ * tmp0;
559         sum2 += *v2++ * tmp0;
560         sum3 += *v3++ * tmp0;
561         sum4 += *v4++ * tmp0;
562         sum5 += *v5++ * tmp0;
563       }
564       y[row++]=sum1;
565       y[row++]=sum2;
566       y[row++]=sum3;
567       y[row++]=sum4;
568       y[row++]=sum5;
569       v1      =v5;       /* Since the next block to be processed starts there */
570       idx    +=4*sz;
571       break;
572     default :
573       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
574     }
575   }
576   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
577   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
578   ierr = PetscLogFlops(2*a->nz - nonzerorow);CHKERRQ(ierr);
579   PetscFunctionReturn(0);
580 }
581 /* ----------------------------------------------------------- */
582 /* Almost same code as the MatMult_Inode() */
583 #undef __FUNCT__
584 #define __FUNCT__ "MatMultAdd_Inode"
585 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
586 {
587   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
588   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
589   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
590   PetscErrorCode ierr;
591   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
592 
593   PetscFunctionBegin;
594   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
595   node_max = a->inode.node_count;
596   ns       = a->inode.size;     /* Node Size array */
597   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
598   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
599   if (zz != yy) {
600     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
601   } else {
602     z = y;
603   }
604   zt = z;
605 
606   idx  = a->j;
607   v1   = a->a;
608   ii   = a->i;
609 
610   for (i = 0,row = 0; i< node_max; ++i){
611     nsz  = ns[i];
612     n    = ii[1] - ii[0];
613     ii  += nsz;
614     sz   = n;                   /* No of non zeros in this row */
615                                 /* Switch on the size of Node */
616     switch (nsz){               /* Each loop in 'case' is unrolled */
617     case 1 :
618       sum1  = *zt++;
619 
620       for(n = 0; n< sz-1; n+=2) {
621         i1   = idx[0];          /* The instructions are ordered to */
622         i2   = idx[1];          /* make the compiler's job easy */
623         idx += 2;
624         tmp0 = x[i1];
625         tmp1 = x[i2];
626         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627        }
628 
629       if(n   == sz-1){          /* Take care of the last nonzero  */
630         tmp0  = x[*idx++];
631         sum1 += *v1++ * tmp0;
632       }
633       y[row++]=sum1;
634       break;
635     case 2:
636       sum1  = *zt++;
637       sum2  = *zt++;
638       v2    = v1 + n;
639 
640       for(n = 0; n< sz-1; n+=2) {
641         i1   = idx[0];
642         i2   = idx[1];
643         idx += 2;
644         tmp0 = x[i1];
645         tmp1 = x[i2];
646         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
647         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
648       }
649       if(n   == sz-1){
650         tmp0  = x[*idx++];
651         sum1 += *v1++ * tmp0;
652         sum2 += *v2++ * tmp0;
653       }
654       y[row++]=sum1;
655       y[row++]=sum2;
656       v1      =v2;              /* Since the next block to be processed starts there*/
657       idx    +=sz;
658       break;
659     case 3:
660       sum1  = *zt++;
661       sum2  = *zt++;
662       sum3  = *zt++;
663       v2    = v1 + n;
664       v3    = v2 + n;
665 
666       for (n = 0; n< sz-1; n+=2) {
667         i1   = idx[0];
668         i2   = idx[1];
669         idx += 2;
670         tmp0 = x[i1];
671         tmp1 = x[i2];
672         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
673         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
674         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
675       }
676       if (n == sz-1){
677         tmp0  = x[*idx++];
678         sum1 += *v1++ * tmp0;
679         sum2 += *v2++ * tmp0;
680         sum3 += *v3++ * tmp0;
681       }
682       y[row++]=sum1;
683       y[row++]=sum2;
684       y[row++]=sum3;
685       v1       =v3;             /* Since the next block to be processed starts there*/
686       idx     +=2*sz;
687       break;
688     case 4:
689       sum1  = *zt++;
690       sum2  = *zt++;
691       sum3  = *zt++;
692       sum4  = *zt++;
693       v2    = v1 + n;
694       v3    = v2 + n;
695       v4    = v3 + n;
696 
697       for (n = 0; n< sz-1; n+=2) {
698         i1   = idx[0];
699         i2   = idx[1];
700         idx += 2;
701         tmp0 = x[i1];
702         tmp1 = x[i2];
703         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
704         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
705         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
706         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
707       }
708       if (n == sz-1){
709         tmp0  = x[*idx++];
710         sum1 += *v1++ * tmp0;
711         sum2 += *v2++ * tmp0;
712         sum3 += *v3++ * tmp0;
713         sum4 += *v4++ * tmp0;
714       }
715       y[row++]=sum1;
716       y[row++]=sum2;
717       y[row++]=sum3;
718       y[row++]=sum4;
719       v1      =v4;              /* Since the next block to be processed starts there*/
720       idx    +=3*sz;
721       break;
722     case 5:
723       sum1  = *zt++;
724       sum2  = *zt++;
725       sum3  = *zt++;
726       sum4  = *zt++;
727       sum5  = *zt++;
728       v2    = v1 + n;
729       v3    = v2 + n;
730       v4    = v3 + n;
731       v5    = v4 + n;
732 
733       for (n = 0; n<sz-1; n+=2) {
734         i1   = idx[0];
735         i2   = idx[1];
736         idx += 2;
737         tmp0 = x[i1];
738         tmp1 = x[i2];
739         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
740         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
741         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
742         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
743         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
744       }
745       if(n   == sz-1){
746         tmp0  = x[*idx++];
747         sum1 += *v1++ * tmp0;
748         sum2 += *v2++ * tmp0;
749         sum3 += *v3++ * tmp0;
750         sum4 += *v4++ * tmp0;
751         sum5 += *v5++ * tmp0;
752       }
753       y[row++]=sum1;
754       y[row++]=sum2;
755       y[row++]=sum3;
756       y[row++]=sum4;
757       y[row++]=sum5;
758       v1      =v5;       /* Since the next block to be processed starts there */
759       idx    +=4*sz;
760       break;
761     default :
762       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
763     }
764   }
765   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
766   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
767   if (zz != yy) {
768     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
769   }
770   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
771   PetscFunctionReturn(0);
772 }
773 
774 /* ----------------------------------------------------------- */
775 #undef __FUNCT__
776 #define __FUNCT__ "MatSolve_Inode"
777 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
778 {
779   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
780   IS                iscol = a->col,isrow = a->row;
781   PetscErrorCode    ierr;
782   PetscInt          *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
783   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
784   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
785   PetscScalar       sum1,sum2,sum3,sum4,sum5;
786   const PetscScalar *v1,*v2,*v3,*v4,*v5,*b,*a_a = a->a,*aa;
787 
788   PetscFunctionBegin;
789   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
790   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
791   node_max = a->inode.node_count;
792   ns       = a->inode.size;     /* Node Size array */
793 
794   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
795   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
796   tmp  = a->solve_work;
797 
798   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
799   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
800 
801   /* forward solve the lower triangular */
802   tmps = tmp ;
803   aa   = a_a ;
804   aj   = a_j ;
805   ad   = a->diag;
806 
807   for (i = 0,row = 0; i< node_max; ++i){
808     nsz = ns[i];
809     aii = ai[row];
810     v1  = aa + aii;
811     vi  = aj + aii;
812     nz  = ad[row]- aii;
813 
814     switch (nsz){               /* Each loop in 'case' is unrolled */
815     case 1 :
816       sum1 = b[*r++];
817       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
818       for(j=0; j<nz-1; j+=2){
819         i0   = vi[0];
820         i1   = vi[1];
821         vi  +=2;
822         tmp0 = tmps[i0];
823         tmp1 = tmps[i1];
824         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
825       }
826       if(j == nz-1){
827         tmp0 = tmps[*vi++];
828         sum1 -= *v1++ *tmp0;
829       }
830       tmp[row ++]=sum1;
831       break;
832     case 2:
833       sum1 = b[*r++];
834       sum2 = b[*r++];
835       v2   = aa + ai[row+1];
836 
837       for(j=0; j<nz-1; j+=2){
838         i0   = vi[0];
839         i1   = vi[1];
840         vi  +=2;
841         tmp0 = tmps[i0];
842         tmp1 = tmps[i1];
843         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
844         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
845       }
846       if(j == nz-1){
847         tmp0 = tmps[*vi++];
848         sum1 -= *v1++ *tmp0;
849         sum2 -= *v2++ *tmp0;
850       }
851       sum2 -= *v2++ * sum1;
852       tmp[row ++]=sum1;
853       tmp[row ++]=sum2;
854       break;
855     case 3:
856       sum1 = b[*r++];
857       sum2 = b[*r++];
858       sum3 = b[*r++];
859       v2   = aa + ai[row+1];
860       v3   = aa + ai[row+2];
861 
862       for (j=0; j<nz-1; j+=2){
863         i0   = vi[0];
864         i1   = vi[1];
865         vi  +=2;
866         tmp0 = tmps[i0];
867         tmp1 = tmps[i1];
868         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
869         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
870         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
871       }
872       if (j == nz-1){
873         tmp0 = tmps[*vi++];
874         sum1 -= *v1++ *tmp0;
875         sum2 -= *v2++ *tmp0;
876         sum3 -= *v3++ *tmp0;
877       }
878       sum2 -= *v2++ * sum1;
879       sum3 -= *v3++ * sum1;
880       sum3 -= *v3++ * sum2;
881       tmp[row ++]=sum1;
882       tmp[row ++]=sum2;
883       tmp[row ++]=sum3;
884       break;
885 
886     case 4:
887       sum1 = b[*r++];
888       sum2 = b[*r++];
889       sum3 = b[*r++];
890       sum4 = b[*r++];
891       v2   = aa + ai[row+1];
892       v3   = aa + ai[row+2];
893       v4   = aa + ai[row+3];
894 
895       for (j=0; j<nz-1; j+=2){
896         i0   = vi[0];
897         i1   = vi[1];
898         vi  +=2;
899         tmp0 = tmps[i0];
900         tmp1 = tmps[i1];
901         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
902         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
903         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
904         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
905       }
906       if (j == nz-1){
907         tmp0 = tmps[*vi++];
908         sum1 -= *v1++ *tmp0;
909         sum2 -= *v2++ *tmp0;
910         sum3 -= *v3++ *tmp0;
911         sum4 -= *v4++ *tmp0;
912       }
913       sum2 -= *v2++ * sum1;
914       sum3 -= *v3++ * sum1;
915       sum4 -= *v4++ * sum1;
916       sum3 -= *v3++ * sum2;
917       sum4 -= *v4++ * sum2;
918       sum4 -= *v4++ * sum3;
919 
920       tmp[row ++]=sum1;
921       tmp[row ++]=sum2;
922       tmp[row ++]=sum3;
923       tmp[row ++]=sum4;
924       break;
925     case 5:
926       sum1 = b[*r++];
927       sum2 = b[*r++];
928       sum3 = b[*r++];
929       sum4 = b[*r++];
930       sum5 = b[*r++];
931       v2   = aa + ai[row+1];
932       v3   = aa + ai[row+2];
933       v4   = aa + ai[row+3];
934       v5   = aa + ai[row+4];
935 
936       for (j=0; j<nz-1; j+=2){
937         i0   = vi[0];
938         i1   = vi[1];
939         vi  +=2;
940         tmp0 = tmps[i0];
941         tmp1 = tmps[i1];
942         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
943         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
944         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
945         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
946         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
947       }
948       if (j == nz-1){
949         tmp0 = tmps[*vi++];
950         sum1 -= *v1++ *tmp0;
951         sum2 -= *v2++ *tmp0;
952         sum3 -= *v3++ *tmp0;
953         sum4 -= *v4++ *tmp0;
954         sum5 -= *v5++ *tmp0;
955       }
956 
957       sum2 -= *v2++ * sum1;
958       sum3 -= *v3++ * sum1;
959       sum4 -= *v4++ * sum1;
960       sum5 -= *v5++ * sum1;
961       sum3 -= *v3++ * sum2;
962       sum4 -= *v4++ * sum2;
963       sum5 -= *v5++ * sum2;
964       sum4 -= *v4++ * sum3;
965       sum5 -= *v5++ * sum3;
966       sum5 -= *v5++ * sum4;
967 
968       tmp[row ++]=sum1;
969       tmp[row ++]=sum2;
970       tmp[row ++]=sum3;
971       tmp[row ++]=sum4;
972       tmp[row ++]=sum5;
973       break;
974     default:
975       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
976     }
977   }
978   /* backward solve the upper triangular */
979   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
980     nsz = ns[i];
981     aii = ai[row+1] -1;
982     v1  = aa + aii;
983     vi  = aj + aii;
984     nz  = aii- ad[row];
985     switch (nsz){               /* Each loop in 'case' is unrolled */
986     case 1 :
987       sum1 = tmp[row];
988 
989       for(j=nz ; j>1; j-=2){
990         vi  -=2;
991         i0   = vi[2];
992         i1   = vi[1];
993         tmp0 = tmps[i0];
994         tmp1 = tmps[i1];
995         v1   -= 2;
996         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
997       }
998       if (j==1){
999         tmp0  = tmps[*vi--];
1000         sum1 -= *v1-- * tmp0;
1001       }
1002       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1003       break;
1004     case 2 :
1005       sum1 = tmp[row];
1006       sum2 = tmp[row -1];
1007       v2   = aa + ai[row]-1;
1008       for (j=nz ; j>1; j-=2){
1009         vi  -=2;
1010         i0   = vi[2];
1011         i1   = vi[1];
1012         tmp0 = tmps[i0];
1013         tmp1 = tmps[i1];
1014         v1   -= 2;
1015         v2   -= 2;
1016         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1017         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1018       }
1019       if (j==1){
1020         tmp0  = tmps[*vi--];
1021         sum1 -= *v1-- * tmp0;
1022         sum2 -= *v2-- * tmp0;
1023       }
1024 
1025       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1026       sum2   -= *v2-- * tmp0;
1027       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1028       break;
1029     case 3 :
1030       sum1 = tmp[row];
1031       sum2 = tmp[row -1];
1032       sum3 = tmp[row -2];
1033       v2   = aa + ai[row]-1;
1034       v3   = aa + ai[row -1]-1;
1035       for (j=nz ; j>1; j-=2){
1036         vi  -=2;
1037         i0   = vi[2];
1038         i1   = vi[1];
1039         tmp0 = tmps[i0];
1040         tmp1 = tmps[i1];
1041         v1   -= 2;
1042         v2   -= 2;
1043         v3   -= 2;
1044         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1045         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1046         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1047       }
1048       if (j==1){
1049         tmp0  = tmps[*vi--];
1050         sum1 -= *v1-- * tmp0;
1051         sum2 -= *v2-- * tmp0;
1052         sum3 -= *v3-- * tmp0;
1053       }
1054       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1055       sum2   -= *v2-- * tmp0;
1056       sum3   -= *v3-- * tmp0;
1057       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1058       sum3   -= *v3-- * tmp0;
1059       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1060 
1061       break;
1062     case 4 :
1063       sum1 = tmp[row];
1064       sum2 = tmp[row -1];
1065       sum3 = tmp[row -2];
1066       sum4 = tmp[row -3];
1067       v2   = aa + ai[row]-1;
1068       v3   = aa + ai[row -1]-1;
1069       v4   = aa + ai[row -2]-1;
1070 
1071       for (j=nz ; j>1; j-=2){
1072         vi  -=2;
1073         i0   = vi[2];
1074         i1   = vi[1];
1075         tmp0 = tmps[i0];
1076         tmp1 = tmps[i1];
1077         v1  -= 2;
1078         v2  -= 2;
1079         v3  -= 2;
1080         v4  -= 2;
1081         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1082         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1083         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1084         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1085       }
1086       if (j==1){
1087         tmp0  = tmps[*vi--];
1088         sum1 -= *v1-- * tmp0;
1089         sum2 -= *v2-- * tmp0;
1090         sum3 -= *v3-- * tmp0;
1091         sum4 -= *v4-- * tmp0;
1092       }
1093 
1094       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1095       sum2   -= *v2-- * tmp0;
1096       sum3   -= *v3-- * tmp0;
1097       sum4   -= *v4-- * tmp0;
1098       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1099       sum3   -= *v3-- * tmp0;
1100       sum4   -= *v4-- * tmp0;
1101       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1102       sum4   -= *v4-- * tmp0;
1103       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1104       break;
1105     case 5 :
1106       sum1 = tmp[row];
1107       sum2 = tmp[row -1];
1108       sum3 = tmp[row -2];
1109       sum4 = tmp[row -3];
1110       sum5 = tmp[row -4];
1111       v2   = aa + ai[row]-1;
1112       v3   = aa + ai[row -1]-1;
1113       v4   = aa + ai[row -2]-1;
1114       v5   = aa + ai[row -3]-1;
1115       for (j=nz ; j>1; j-=2){
1116         vi  -= 2;
1117         i0   = vi[2];
1118         i1   = vi[1];
1119         tmp0 = tmps[i0];
1120         tmp1 = tmps[i1];
1121         v1   -= 2;
1122         v2   -= 2;
1123         v3   -= 2;
1124         v4   -= 2;
1125         v5   -= 2;
1126         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1127         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1128         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1129         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1130         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1131       }
1132       if (j==1){
1133         tmp0  = tmps[*vi--];
1134         sum1 -= *v1-- * tmp0;
1135         sum2 -= *v2-- * tmp0;
1136         sum3 -= *v3-- * tmp0;
1137         sum4 -= *v4-- * tmp0;
1138         sum5 -= *v5-- * tmp0;
1139       }
1140 
1141       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1142       sum2   -= *v2-- * tmp0;
1143       sum3   -= *v3-- * tmp0;
1144       sum4   -= *v4-- * tmp0;
1145       sum5   -= *v5-- * tmp0;
1146       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1147       sum3   -= *v3-- * tmp0;
1148       sum4   -= *v4-- * tmp0;
1149       sum5   -= *v5-- * tmp0;
1150       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1154       sum5   -= *v5-- * tmp0;
1155       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1156       break;
1157     default:
1158       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1159     }
1160   }
1161   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1162   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1163   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1164   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1165   ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr);
1166   PetscFunctionReturn(0);
1167 }
1168 
1169 #undef __FUNCT__
1170 #define __FUNCT__ "MatLUFactorNumeric_Inode"
1171 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1172 {
1173   Mat               C = *B;
1174   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1175   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1176   PetscErrorCode    ierr;
1177   PetscInt          *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1178   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1179   PetscInt          *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1180   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1181   PetscScalar       *pc1,*pc2,*pc3,mul1,mul2,mul3;
1182   PetscScalar       tmp,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1183   const PetscScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1184   PetscReal         rs=0.0;
1185   LUShift_Ctx       sctx;
1186   PetscInt          newshift;
1187 
1188   PetscFunctionBegin;
1189   sctx.shift_top  = 0;
1190   sctx.nshift_max = 0;
1191   sctx.shift_lo   = 0;
1192   sctx.shift_hi   = 0;
1193 
1194   /* if both shift schemes are chosen by user, only use info->shiftpd */
1195   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1196   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1197     sctx.shift_top = 0;
1198     for (i=0; i<n; i++) {
1199       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1200       rs    = 0.0;
1201       ajtmp = aj + ai[i];
1202       rtmp1 = aa + ai[i];
1203       nz = ai[i+1] - ai[i];
1204       for (j=0; j<nz; j++){
1205         if (*ajtmp != i){
1206           rs += PetscAbsScalar(*rtmp1++);
1207         } else {
1208           rs -= PetscRealPart(*rtmp1++);
1209         }
1210         ajtmp++;
1211       }
1212       if (rs>sctx.shift_top) sctx.shift_top = rs;
1213     }
1214     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1215     sctx.shift_top *= 1.1;
1216     sctx.nshift_max = 5;
1217     sctx.shift_lo   = 0.;
1218     sctx.shift_hi   = 1.;
1219   }
1220   sctx.shift_amount = 0;
1221   sctx.nshift       = 0;
1222 
1223   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1224   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1225   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1226   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1227   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1228   ics   = ic ;
1229   rtmp22 = rtmp11 + n;
1230   rtmp33 = rtmp22 + n;
1231 
1232   node_max = a->inode.node_count;
1233   ns       = a->inode.size ;
1234   if (!ns){
1235     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1236   }
1237 
1238   /* If max inode size > 3, split it into two inodes.*/
1239   /* also map the inode sizes according to the ordering */
1240   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1241   for (i=0,j=0; i<node_max; ++i,++j){
1242     if (ns[i]>3) {
1243       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1244       ++j;
1245       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1246     } else {
1247       tmp_vec1[j] = ns[i];
1248     }
1249   }
1250   /* Use the correct node_max */
1251   node_max = j;
1252 
1253   /* Now reorder the inode info based on mat re-ordering info */
1254   /* First create a row -> inode_size_array_index map */
1255   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1256   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1257   for (i=0,row=0; i<node_max; i++) {
1258     nodesz = tmp_vec1[i];
1259     for (j=0; j<nodesz; j++,row++) {
1260       nsmap[row] = i;
1261     }
1262   }
1263   /* Using nsmap, create a reordered ns structure */
1264   for (i=0,j=0; i< node_max; i++) {
1265     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1266     tmp_vec2[i]  = nodesz;
1267     j           += nodesz;
1268   }
1269   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1270   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1271   /* Now use the correct ns */
1272   ns = tmp_vec2;
1273 
1274   do {
1275     sctx.lushift = PETSC_FALSE;
1276     /* Now loop over each block-row, and do the factorization */
1277     for (i=0,row=0; i<node_max; i++) {
1278       nodesz = ns[i];
1279       nz     = bi[row+1] - bi[row];
1280       bjtmp  = bj + bi[row];
1281 
1282       switch (nodesz){
1283       case 1:
1284         for  (j=0; j<nz; j++){
1285           idx        = bjtmp[j];
1286           rtmp11[idx] = 0.0;
1287         }
1288 
1289         /* load in initial (unfactored row) */
1290         idx    = r[row];
1291         nz_tmp = ai[idx+1] - ai[idx];
1292         ajtmp  = aj + ai[idx];
1293         v1     = aa + ai[idx];
1294 
1295         for (j=0; j<nz_tmp; j++) {
1296           idx        = ics[ajtmp[j]];
1297           rtmp11[idx] = v1[j];
1298         }
1299         rtmp11[ics[r[row]]] += sctx.shift_amount;
1300 
1301         prow = *bjtmp++ ;
1302         while (prow < row) {
1303           pc1 = rtmp11 + prow;
1304           if (*pc1 != 0.0){
1305             pv   = ba + bd[prow];
1306             pj   = nbj + bd[prow];
1307             mul1 = *pc1 * *pv++;
1308             *pc1 = mul1;
1309             nz_tmp = bi[prow+1] - bd[prow] - 1;
1310             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1311             for (j=0; j<nz_tmp; j++) {
1312               tmp = pv[j];
1313               idx = pj[j];
1314               rtmp11[idx] -= mul1 * tmp;
1315             }
1316           }
1317           prow = *bjtmp++ ;
1318         }
1319         pj  = bj + bi[row];
1320         pc1 = ba + bi[row];
1321 
1322         sctx.pv    = rtmp11[row];
1323         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1324         rs         = 0.0;
1325         for (j=0; j<nz; j++) {
1326           idx    = pj[j];
1327           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1328           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1329         }
1330         sctx.rs  = rs;
1331         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1332         if (newshift == 1) goto endofwhile;
1333         break;
1334 
1335       case 2:
1336         for (j=0; j<nz; j++) {
1337           idx        = bjtmp[j];
1338           rtmp11[idx] = 0.0;
1339           rtmp22[idx] = 0.0;
1340         }
1341 
1342         /* load in initial (unfactored row) */
1343         idx    = r[row];
1344         nz_tmp = ai[idx+1] - ai[idx];
1345         ajtmp  = aj + ai[idx];
1346         v1     = aa + ai[idx];
1347         v2     = aa + ai[idx+1];
1348         for (j=0; j<nz_tmp; j++) {
1349           idx        = ics[ajtmp[j]];
1350           rtmp11[idx] = v1[j];
1351           rtmp22[idx] = v2[j];
1352         }
1353         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1354         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1355 
1356         prow = *bjtmp++ ;
1357         while (prow < row) {
1358           pc1 = rtmp11 + prow;
1359           pc2 = rtmp22 + prow;
1360           if (*pc1 != 0.0 || *pc2 != 0.0){
1361             pv   = ba + bd[prow];
1362             pj   = nbj + bd[prow];
1363             mul1 = *pc1 * *pv;
1364             mul2 = *pc2 * *pv;
1365             ++pv;
1366             *pc1 = mul1;
1367             *pc2 = mul2;
1368 
1369             nz_tmp = bi[prow+1] - bd[prow] - 1;
1370             for (j=0; j<nz_tmp; j++) {
1371               tmp = pv[j];
1372               idx = pj[j];
1373               rtmp11[idx] -= mul1 * tmp;
1374               rtmp22[idx] -= mul2 * tmp;
1375             }
1376             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1377           }
1378           prow = *bjtmp++ ;
1379         }
1380 
1381         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1382         pc1 = rtmp11 + prow;
1383         pc2 = rtmp22 + prow;
1384 
1385         sctx.pv = *pc1;
1386         pj      = bj + bi[prow];
1387         rs      = 0.0;
1388         for (j=0; j<nz; j++){
1389           idx = pj[j];
1390           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1391         }
1392         sctx.rs = rs;
1393         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1394         if (newshift == 1) goto endofwhile;
1395 
1396         if (*pc2 != 0.0){
1397           pj     = nbj + bd[prow];
1398           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1399           *pc2   = mul2;
1400           nz_tmp = bi[prow+1] - bd[prow] - 1;
1401           for (j=0; j<nz_tmp; j++) {
1402             idx = pj[j] ;
1403             tmp = rtmp11[idx];
1404             rtmp22[idx] -= mul2 * tmp;
1405           }
1406           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1407         }
1408 
1409         pj  = bj + bi[row];
1410         pc1 = ba + bi[row];
1411         pc2 = ba + bi[row+1];
1412 
1413         sctx.pv = rtmp22[row+1];
1414         rs = 0.0;
1415         rtmp11[row]   = 1.0/rtmp11[row];
1416         rtmp22[row+1] = 1.0/rtmp22[row+1];
1417         /* copy row entries from dense representation to sparse */
1418         for (j=0; j<nz; j++) {
1419           idx    = pj[j];
1420           pc1[j] = rtmp11[idx];
1421           pc2[j] = rtmp22[idx];
1422           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1423         }
1424         sctx.rs = rs;
1425         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1426         if (newshift == 1) goto endofwhile;
1427         break;
1428 
1429       case 3:
1430         for  (j=0; j<nz; j++) {
1431           idx        = bjtmp[j];
1432           rtmp11[idx] = 0.0;
1433           rtmp22[idx] = 0.0;
1434           rtmp33[idx] = 0.0;
1435         }
1436         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1437         idx    = r[row];
1438         nz_tmp = ai[idx+1] - ai[idx];
1439         ajtmp = aj + ai[idx];
1440         v1    = aa + ai[idx];
1441         v2    = aa + ai[idx+1];
1442         v3    = aa + ai[idx+2];
1443         for (j=0; j<nz_tmp; j++) {
1444           idx        = ics[ajtmp[j]];
1445           rtmp11[idx] = v1[j];
1446           rtmp22[idx] = v2[j];
1447           rtmp33[idx] = v3[j];
1448         }
1449         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1450         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1451         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1452 
1453         /* loop over all pivot row blocks above this row block */
1454         prow = *bjtmp++ ;
1455         while (prow < row) {
1456           pc1 = rtmp11 + prow;
1457           pc2 = rtmp22 + prow;
1458           pc3 = rtmp33 + prow;
1459           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1460             pv   = ba  + bd[prow];
1461             pj   = nbj + bd[prow];
1462             mul1 = *pc1 * *pv;
1463             mul2 = *pc2 * *pv;
1464             mul3 = *pc3 * *pv;
1465             ++pv;
1466             *pc1 = mul1;
1467             *pc2 = mul2;
1468             *pc3 = mul3;
1469 
1470             nz_tmp = bi[prow+1] - bd[prow] - 1;
1471             /* update this row based on pivot row */
1472             for (j=0; j<nz_tmp; j++) {
1473               tmp = pv[j];
1474               idx = pj[j];
1475               rtmp11[idx] -= mul1 * tmp;
1476               rtmp22[idx] -= mul2 * tmp;
1477               rtmp33[idx] -= mul3 * tmp;
1478             }
1479             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1480           }
1481           prow = *bjtmp++ ;
1482         }
1483 
1484         /* Now take care of diagonal 3x3 block in this set of rows */
1485         /* note: prow = row here */
1486         pc1 = rtmp11 + prow;
1487         pc2 = rtmp22 + prow;
1488         pc3 = rtmp33 + prow;
1489 
1490         sctx.pv = *pc1;
1491         pj      = bj + bi[prow];
1492         rs      = 0.0;
1493         for (j=0; j<nz; j++){
1494           idx = pj[j];
1495           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1496         }
1497         sctx.rs = rs;
1498         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1499         if (newshift == 1) goto endofwhile;
1500 
1501         if (*pc2 != 0.0 || *pc3 != 0.0){
1502           mul2 = (*pc2)/(*pc1);
1503           mul3 = (*pc3)/(*pc1);
1504           *pc2 = mul2;
1505           *pc3 = mul3;
1506           nz_tmp = bi[prow+1] - bd[prow] - 1;
1507           pj     = nbj + bd[prow];
1508           for (j=0; j<nz_tmp; j++) {
1509             idx = pj[j] ;
1510             tmp = rtmp11[idx];
1511             rtmp22[idx] -= mul2 * tmp;
1512             rtmp33[idx] -= mul3 * tmp;
1513           }
1514           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1515         }
1516         ++prow;
1517 
1518         pc2 = rtmp22 + prow;
1519         pc3 = rtmp33 + prow;
1520         sctx.pv = *pc2;
1521         pj      = bj + bi[prow];
1522         rs      = 0.0;
1523         for (j=0; j<nz; j++){
1524           idx = pj[j];
1525           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1526         }
1527         sctx.rs = rs;
1528         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1529         if (newshift == 1) goto endofwhile;
1530 
1531         if (*pc3 != 0.0){
1532           mul3   = (*pc3)/(*pc2);
1533           *pc3   = mul3;
1534           pj     = nbj + bd[prow];
1535           nz_tmp = bi[prow+1] - bd[prow] - 1;
1536           for (j=0; j<nz_tmp; j++) {
1537             idx = pj[j] ;
1538             tmp = rtmp22[idx];
1539             rtmp33[idx] -= mul3 * tmp;
1540           }
1541           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1542         }
1543 
1544         pj  = bj + bi[row];
1545         pc1 = ba + bi[row];
1546         pc2 = ba + bi[row+1];
1547         pc3 = ba + bi[row+2];
1548 
1549         sctx.pv = rtmp33[row+2];
1550         rs = 0.0;
1551         rtmp11[row]   = 1.0/rtmp11[row];
1552         rtmp22[row+1] = 1.0/rtmp22[row+1];
1553         rtmp33[row+2] = 1.0/rtmp33[row+2];
1554         /* copy row entries from dense representation to sparse */
1555         for (j=0; j<nz; j++) {
1556           idx    = pj[j];
1557           pc1[j] = rtmp11[idx];
1558           pc2[j] = rtmp22[idx];
1559           pc3[j] = rtmp33[idx];
1560           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1561         }
1562 
1563         sctx.rs = rs;
1564         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1565         if (newshift == 1) goto endofwhile;
1566         break;
1567 
1568       default:
1569         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1570       }
1571       row += nodesz;                 /* Update the row */
1572     }
1573     endofwhile:;
1574   } while (sctx.lushift);
1575   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
1576   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1577   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1578   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1579   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1580   C->factor      = FACTOR_LU;
1581   C->assembled   = PETSC_TRUE;
1582   if (sctx.nshift) {
1583     if (info->shiftnz) {
1584       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1585     } else if (info->shiftpd) {
1586       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1587     }
1588   }
1589   ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr);
1590   PetscFunctionReturn(0);
1591 }
1592 
1593 /*
1594      Makes a longer coloring[] array and calls the usual code with that
1595 */
1596 #undef __FUNCT__
1597 #define __FUNCT__ "MatColoringPatch_Inode"
1598 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1599 {
1600   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1601   PetscErrorCode  ierr;
1602   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1603   PetscInt        *colorused,i;
1604   ISColoringValue *newcolor;
1605 
1606   PetscFunctionBegin;
1607   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1608   /* loop over inodes, marking a color for each column*/
1609   row = 0;
1610   for (i=0; i<m; i++){
1611     for (j=0; j<ns[i]; j++) {
1612       newcolor[row++] = coloring[i] + j*ncolors;
1613     }
1614   }
1615 
1616   /* eliminate unneeded colors */
1617   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1618   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1619   for (i=0; i<n; i++) {
1620     colorused[newcolor[i]] = 1;
1621   }
1622 
1623   for (i=1; i<5*ncolors; i++) {
1624     colorused[i] += colorused[i-1];
1625   }
1626   ncolors = colorused[5*ncolors-1];
1627   for (i=0; i<n; i++) {
1628     newcolor[i] = colorused[newcolor[i]]-1;
1629   }
1630   ierr = PetscFree(colorused);CHKERRQ(ierr);
1631   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1632   ierr = PetscFree(coloring);CHKERRQ(ierr);
1633   PetscFunctionReturn(0);
1634 }
1635 
1636 #include "src/inline/ilu.h"
1637 
1638 #undef __FUNCT__
1639 #define __FUNCT__ "MatRelax_Inode"
1640 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1641 {
1642   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1643   PetscScalar        *x,*xs,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1644   MatScalar          *ibdiag,*bdiag;
1645   PetscScalar        *b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1646   const PetscScalar  *v = a->a,*v1,*v2,*v3,*v4,*v5;
1647   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1648   PetscErrorCode     ierr;
1649   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1650   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;
1651 
1652   PetscFunctionBegin;
1653   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1654   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1655   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode");
1656 
1657   if (!a->inode.ibdiagvalid) {
1658     if (!a->inode.ibdiag) {
1659       /* calculate space needed for diagonal blocks */
1660       for (i=0; i<m; i++) {
1661 	cnt += sizes[i]*sizes[i];
1662       }
1663       a->inode.bdiagsize = cnt;
1664       ierr   = PetscMalloc2(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag);CHKERRQ(ierr);
1665     }
1666 
1667     /* copy over the diagonal blocks and invert them */
1668     ibdiag = a->inode.ibdiag;
1669     bdiag  = a->inode.bdiag;
1670     cnt = 0;
1671     for (i=0, row = 0; i<m; i++) {
1672       for (j=0; j<sizes[i]; j++) {
1673         for (k=0; k<sizes[i]; k++) {
1674           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1675         }
1676       }
1677       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
1678 
1679       switch(sizes[i]) {
1680         case 1:
1681           /* Create matrix data structure */
1682           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1683           ibdiag[cnt] = 1.0/ibdiag[cnt];
1684           break;
1685         case 2:
1686           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
1687           break;
1688         case 3:
1689           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
1690           break;
1691         case 4:
1692           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
1693           break;
1694         case 5:
1695           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr);
1696           break;
1697        default:
1698 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1699       }
1700       cnt += sizes[i]*sizes[i];
1701       row += sizes[i];
1702     }
1703     a->inode.ibdiagvalid = PETSC_TRUE;
1704   }
1705   ibdiag = a->inode.ibdiag;
1706   bdiag  = a->inode.bdiag;
1707 
1708   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1709   if (xx != bb) {
1710     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1711   } else {
1712     b = x;
1713   }
1714 
1715   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1716   xs   = x;
1717   if (flag & SOR_ZERO_INITIAL_GUESS) {
1718     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1719 
1720       for (i=0, row=0; i<m; i++) {
1721         sz  = diag[row] - ii[row];
1722         v1  = a->a + ii[row];
1723         idx = a->j + ii[row];
1724 
1725         /* see comments for MatMult_Inode() for how this is coded */
1726         switch (sizes[i]){
1727           case 1:
1728 
1729             sum1  = b[row];
1730             for(n = 0; n<sz-1; n+=2) {
1731               i1   = idx[0];
1732               i2   = idx[1];
1733               idx += 2;
1734               tmp0 = x[i1];
1735               tmp1 = x[i2];
1736               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1737             }
1738 
1739             if (n == sz-1){
1740               tmp0  = x[*idx];
1741               sum1 -= *v1 * tmp0;
1742             }
1743             x[row++] = sum1*(*ibdiag++);
1744             break;
1745           case 2:
1746             v2    = a->a + ii[row+1];
1747             sum1  = b[row];
1748             sum2  = b[row+1];
1749             for(n = 0; n<sz-1; n+=2) {
1750               i1   = idx[0];
1751               i2   = idx[1];
1752               idx += 2;
1753               tmp0 = x[i1];
1754               tmp1 = x[i2];
1755               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1756               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1757             }
1758 
1759             if (n == sz-1){
1760               tmp0  = x[*idx];
1761               sum1 -= v1[0] * tmp0;
1762               sum2 -= v2[0] * tmp0;
1763             }
1764             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1765             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1766             ibdiag  += 4;
1767             break;
1768           case 3:
1769             v2    = a->a + ii[row+1];
1770             v3    = a->a + ii[row+2];
1771             sum1  = b[row];
1772             sum2  = b[row+1];
1773             sum3  = b[row+2];
1774             for(n = 0; n<sz-1; n+=2) {
1775               i1   = idx[0];
1776               i2   = idx[1];
1777               idx += 2;
1778               tmp0 = x[i1];
1779               tmp1 = x[i2];
1780               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1781               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1782               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1783             }
1784 
1785             if (n == sz-1){
1786               tmp0  = x[*idx];
1787               sum1 -= v1[0] * tmp0;
1788               sum2 -= v2[0] * tmp0;
1789               sum3 -= v3[0] * tmp0;
1790             }
1791             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1792             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1793             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1794             ibdiag  += 9;
1795             break;
1796           case 4:
1797             v2    = a->a + ii[row+1];
1798             v3    = a->a + ii[row+2];
1799             v4    = a->a + ii[row+3];
1800             sum1  = b[row];
1801             sum2  = b[row+1];
1802             sum3  = b[row+2];
1803             sum4  = b[row+3];
1804             for(n = 0; n<sz-1; n+=2) {
1805               i1   = idx[0];
1806               i2   = idx[1];
1807               idx += 2;
1808               tmp0 = x[i1];
1809               tmp1 = x[i2];
1810               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1811               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1812               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1813               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1814             }
1815 
1816             if (n == sz-1){
1817               tmp0  = x[*idx];
1818               sum1 -= v1[0] * tmp0;
1819               sum2 -= v2[0] * tmp0;
1820               sum3 -= v3[0] * tmp0;
1821               sum4 -= v4[0] * tmp0;
1822             }
1823             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1824             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1825             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1826             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1827             ibdiag  += 16;
1828             break;
1829           case 5:
1830             v2    = a->a + ii[row+1];
1831             v3    = a->a + ii[row+2];
1832             v4    = a->a + ii[row+3];
1833             v5    = a->a + ii[row+4];
1834             sum1  = b[row];
1835             sum2  = b[row+1];
1836             sum3  = b[row+2];
1837             sum4  = b[row+3];
1838             sum5  = b[row+4];
1839             for(n = 0; n<sz-1; n+=2) {
1840               i1   = idx[0];
1841               i2   = idx[1];
1842               idx += 2;
1843               tmp0 = x[i1];
1844               tmp1 = x[i2];
1845               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1846               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1847               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1848               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1849               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1850             }
1851 
1852             if (n == sz-1){
1853               tmp0  = x[*idx];
1854               sum1 -= v1[0] * tmp0;
1855               sum2 -= v2[0] * tmp0;
1856               sum3 -= v3[0] * tmp0;
1857               sum4 -= v4[0] * tmp0;
1858               sum5 -= v5[0] * tmp0;
1859             }
1860             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1861             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1862             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1863             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1864             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1865             ibdiag  += 25;
1866             break;
1867 	  default:
1868    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1869         }
1870       }
1871 
1872       xb = x;
1873       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1874     } else xb = b;
1875     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1876         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1877       cnt = 0;
1878       for (i=0, row=0; i<m; i++) {
1879 
1880         switch (sizes[i]){
1881           case 1:
1882             x[row++] *= bdiag[cnt++];
1883             break;
1884           case 2:
1885             x1   = x[row]; x2 = x[row+1];
1886             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1887             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1888             x[row++] = tmp1;
1889             x[row++] = tmp2;
1890             cnt += 4;
1891             break;
1892           case 3:
1893             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1894             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1895             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1896             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1897             x[row++] = tmp1;
1898             x[row++] = tmp2;
1899             x[row++] = tmp3;
1900             cnt += 9;
1901             break;
1902           case 4:
1903             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1904             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1905             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1906             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1907             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1908             x[row++] = tmp1;
1909             x[row++] = tmp2;
1910             x[row++] = tmp3;
1911             x[row++] = tmp4;
1912             cnt += 16;
1913             break;
1914           case 5:
1915             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1916             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1917             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1918             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1919             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1920             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1921             x[row++] = tmp1;
1922             x[row++] = tmp2;
1923             x[row++] = tmp3;
1924             x[row++] = tmp4;
1925             x[row++] = tmp5;
1926             cnt += 25;
1927             break;
1928 	  default:
1929    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1930         }
1931       }
1932       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1933     }
1934     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1935 
1936       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1937       for (i=m-1, row=A->rmap.n-1; i>=0; i--) {
1938         ibdiag -= sizes[i]*sizes[i];
1939         sz      = ii[row+1] - diag[row] - 1;
1940         v1      = a->a + diag[row] + 1;
1941         idx     = a->j + diag[row] + 1;
1942 
1943         /* see comments for MatMult_Inode() for how this is coded */
1944         switch (sizes[i]){
1945           case 1:
1946 
1947             sum1  = xb[row];
1948             for(n = 0; n<sz-1; n+=2) {
1949               i1   = idx[0];
1950               i2   = idx[1];
1951               idx += 2;
1952               tmp0 = x[i1];
1953               tmp1 = x[i2];
1954               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1955             }
1956 
1957             if (n == sz-1){
1958               tmp0  = x[*idx];
1959               sum1 -= *v1*tmp0;
1960             }
1961             x[row--] = sum1*(*ibdiag);
1962             break;
1963 
1964           case 2:
1965 
1966             sum1  = xb[row];
1967             sum2  = xb[row-1];
1968             /* note that sum1 is associated with the second of the two rows */
1969             v2    = a->a + diag[row-1] + 2;
1970             for(n = 0; n<sz-1; n+=2) {
1971               i1   = idx[0];
1972               i2   = idx[1];
1973               idx += 2;
1974               tmp0 = x[i1];
1975               tmp1 = x[i2];
1976               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1977               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1978             }
1979 
1980             if (n == sz-1){
1981               tmp0  = x[*idx];
1982               sum1 -= *v1*tmp0;
1983               sum2 -= *v2*tmp0;
1984             }
1985             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1986             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1987             break;
1988           case 3:
1989 
1990             sum1  = xb[row];
1991             sum2  = xb[row-1];
1992             sum3  = xb[row-2];
1993             v2    = a->a + diag[row-1] + 2;
1994             v3    = a->a + diag[row-2] + 3;
1995             for(n = 0; n<sz-1; n+=2) {
1996               i1   = idx[0];
1997               i2   = idx[1];
1998               idx += 2;
1999               tmp0 = x[i1];
2000               tmp1 = x[i2];
2001               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2002               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2003               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2004             }
2005 
2006             if (n == sz-1){
2007               tmp0  = x[*idx];
2008               sum1 -= *v1*tmp0;
2009               sum2 -= *v2*tmp0;
2010               sum3 -= *v3*tmp0;
2011             }
2012             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2013             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2014             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2015             break;
2016           case 4:
2017 
2018             sum1  = xb[row];
2019             sum2  = xb[row-1];
2020             sum3  = xb[row-2];
2021             sum4  = xb[row-3];
2022             v2    = a->a + diag[row-1] + 2;
2023             v3    = a->a + diag[row-2] + 3;
2024             v4    = a->a + diag[row-3] + 4;
2025             for(n = 0; n<sz-1; n+=2) {
2026               i1   = idx[0];
2027               i2   = idx[1];
2028               idx += 2;
2029               tmp0 = x[i1];
2030               tmp1 = x[i2];
2031               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2032               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2033               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2034               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2035             }
2036 
2037             if (n == sz-1){
2038               tmp0  = x[*idx];
2039               sum1 -= *v1*tmp0;
2040               sum2 -= *v2*tmp0;
2041               sum3 -= *v3*tmp0;
2042               sum4 -= *v4*tmp0;
2043             }
2044             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2045             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2046             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2047             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2048             break;
2049           case 5:
2050 
2051             sum1  = xb[row];
2052             sum2  = xb[row-1];
2053             sum3  = xb[row-2];
2054             sum4  = xb[row-3];
2055             sum5  = xb[row-4];
2056             v2    = a->a + diag[row-1] + 2;
2057             v3    = a->a + diag[row-2] + 3;
2058             v4    = a->a + diag[row-3] + 4;
2059             v5    = a->a + diag[row-4] + 5;
2060             for(n = 0; n<sz-1; n+=2) {
2061               i1   = idx[0];
2062               i2   = idx[1];
2063               idx += 2;
2064               tmp0 = x[i1];
2065               tmp1 = x[i2];
2066               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2067               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2068               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2069               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2070               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2071             }
2072 
2073             if (n == sz-1){
2074               tmp0  = x[*idx];
2075               sum1 -= *v1*tmp0;
2076               sum2 -= *v2*tmp0;
2077               sum3 -= *v3*tmp0;
2078               sum4 -= *v4*tmp0;
2079               sum5 -= *v5*tmp0;
2080             }
2081             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2082             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2083             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2084             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2085             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2086             break;
2087 	  default:
2088    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2089         }
2090       }
2091 
2092       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2093     }
2094     its--;
2095   }
2096   if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode");
2097   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2098   if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2099   PetscFunctionReturn(0);
2100 }
2101 
2102 
2103 /*
2104     samestructure indicates that the matrix has not changed its nonzero structure so we
2105     do not need to recompute the inodes
2106 */
2107 #undef __FUNCT__
2108 #define __FUNCT__ "Mat_CheckInode"
2109 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2110 {
2111   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2112   PetscErrorCode ierr;
2113   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2114   PetscTruth     flag;
2115 
2116   PetscFunctionBegin;
2117   if (!a->inode.use)                     PetscFunctionReturn(0);
2118   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2119 
2120 
2121   m = A->rmap.n;
2122   if (a->inode.size) {ns = a->inode.size;}
2123   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2124 
2125   i          = 0;
2126   node_count = 0;
2127   idx        = a->j;
2128   ii         = a->i;
2129   while (i < m){                /* For each row */
2130     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2131     /* Limits the number of elements in a node to 'a->inode.limit' */
2132     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2133       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2134       if (nzy != nzx) break;
2135       idy  += nzx;             /* Same nonzero pattern */
2136       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2137       if (!flag) break;
2138     }
2139     ns[node_count++] = blk_size;
2140     idx += blk_size*nzx;
2141     i    = j;
2142   }
2143   /* If not enough inodes found,, do not use inode version of the routines */
2144   if (!a->inode.size && m && node_count > .9*m) {
2145     ierr = PetscFree(ns);CHKERRQ(ierr);
2146     a->inode.node_count     = 0;
2147     a->inode.size           = PETSC_NULL;
2148     a->inode.use            = PETSC_FALSE;
2149     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2150   } else {
2151     A->ops->mult            = MatMult_Inode;
2152     A->ops->relax           = MatRelax_Inode;
2153     A->ops->multadd         = MatMultAdd_Inode;
2154     A->ops->solve           = MatSolve_Inode;
2155     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
2156     A->ops->getrowij        = MatGetRowIJ_Inode;
2157     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
2158     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
2159     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2160     A->ops->coloringpatch   = MatColoringPatch_Inode;
2161     a->inode.node_count     = node_count;
2162     a->inode.size           = ns;
2163     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2164   }
2165   PetscFunctionReturn(0);
2166 }
2167 
2168 /*
2169      This is really ugly. if inodes are used this replaces the
2170   permutations with ones that correspond to rows/cols of the matrix
2171   rather then inode blocks
2172 */
2173 #undef __FUNCT__
2174 #define __FUNCT__ "MatInodeAdjustForInodes"
2175 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2176 {
2177   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2178 
2179   PetscFunctionBegin;
2180   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2181   if (f) {
2182     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2183   }
2184   PetscFunctionReturn(0);
2185 }
2186 
2187 EXTERN_C_BEGIN
2188 #undef __FUNCT__
2189 #define __FUNCT__ "MatAdjustForInodes_Inode"
2190 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2191 {
2192   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2193   PetscErrorCode ierr;
2194   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
2195   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2196   PetscInt       nslim_col,*ns_col;
2197   IS             ris = *rperm,cis = *cperm;
2198 
2199   PetscFunctionBegin;
2200   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2201   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2202 
2203   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2204   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2205   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2206   permc = permr + m;
2207 
2208   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2209   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2210 
2211   /* Form the inode structure for the rows of permuted matric using inv perm*/
2212   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2213 
2214   /* Construct the permutations for rows*/
2215   for (i=0,row = 0; i<nslim_row; ++i){
2216     indx      = ridx[i];
2217     start_val = tns[indx];
2218     end_val   = tns[indx + 1];
2219     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2220   }
2221 
2222   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2223   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2224 
2225  /* Construct permutations for columns */
2226   for (i=0,col=0; i<nslim_col; ++i){
2227     indx      = cidx[i];
2228     start_val = tns[indx];
2229     end_val   = tns[indx + 1];
2230     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2231   }
2232 
2233   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2234   ISSetPermutation(*rperm);
2235   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2236   ISSetPermutation(*cperm);
2237 
2238   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2239   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2240 
2241   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2242   ierr = PetscFree(permr);CHKERRQ(ierr);
2243   ierr = ISDestroy(cis);CHKERRQ(ierr);
2244   ierr = ISDestroy(ris);CHKERRQ(ierr);
2245   ierr = PetscFree(tns);CHKERRQ(ierr);
2246   PetscFunctionReturn(0);
2247 }
2248 EXTERN_C_END
2249 
2250 #undef __FUNCT__
2251 #define __FUNCT__ "MatInodeGetInodeSizes"
2252 /*@C
2253    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2254 
2255    Collective on Mat
2256 
2257    Input Parameter:
2258 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2259 
2260    Output Parameter:
2261 +  node_count - no of inodes present in the matrix.
2262 .  sizes      - an array of size node_count,with sizes of each inode.
2263 -  limit      - the max size used to generate the inodes.
2264 
2265    Level: advanced
2266 
2267    Notes: This routine returns some internal storage information
2268    of the matrix, it is intended to be used by advanced users.
2269    It should be called after the matrix is assembled.
2270    The contents of the sizes[] array should not be changed.
2271    PETSC_NULL may be passed for information not requested.
2272 
2273 .keywords: matrix, seqaij, get, inode
2274 
2275 .seealso: MatGetInfo()
2276 @*/
2277 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2278 {
2279   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2280 
2281   PetscFunctionBegin;
2282   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2283   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2284   if (f) {
2285     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2286   }
2287   PetscFunctionReturn(0);
2288 }
2289 
2290 EXTERN_C_BEGIN
2291 #undef __FUNCT__
2292 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
2293 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2294 {
2295   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2296 
2297   PetscFunctionBegin;
2298   if (node_count) *node_count = a->inode.node_count;
2299   if (sizes)      *sizes      = a->inode.size;
2300   if (limit)      *limit      = a->inode.limit;
2301   PetscFunctionReturn(0);
2302 }
2303 EXTERN_C_END
2304