xref: /petsc/src/mat/impls/aij/seq/inode.c (revision c8cd8b03a8e69285ecfea39db41c0ff684948fa5)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap.n;
19   m      = A->rmap.n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap.n;
72   n         = A->cmap.n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap.n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap.n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_Inode"
397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y;
402   PetscErrorCode ierr;
403   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
404 
405 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
406 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
407 #endif
408 
409   PetscFunctionBegin;
410   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
411   node_max = a->inode.node_count;
412   ns       = a->inode.size;     /* Node Size array */
413   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
414   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
415   idx  = a->j;
416   v1   = a->a;
417   ii   = a->i;
418 
419   for (i = 0,row = 0; i< node_max; ++i){
420     nsz  = ns[i];
421     n    = ii[1] - ii[0];
422     nonzerorow += (n>0)*nsz;
423     ii  += nsz;
424     sz   = n;                   /* No of non zeros in this row */
425                                 /* Switch on the size of Node */
426     switch (nsz){               /* Each loop in 'case' is unrolled */
427     case 1 :
428       sum1  = 0;
429 
430       for(n = 0; n< sz-1; n+=2) {
431         i1   = idx[0];          /* The instructions are ordered to */
432         i2   = idx[1];          /* make the compiler's job easy */
433         idx += 2;
434         tmp0 = x[i1];
435         tmp1 = x[i2];
436         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
437        }
438 
439       if (n == sz-1){          /* Take care of the last nonzero  */
440         tmp0  = x[*idx++];
441         sum1 += *v1++ * tmp0;
442       }
443       y[row++]=sum1;
444       break;
445     case 2:
446       sum1  = 0;
447       sum2  = 0;
448       v2    = v1 + n;
449 
450       for (n = 0; n< sz-1; n+=2) {
451         i1   = idx[0];
452         i2   = idx[1];
453         idx += 2;
454         tmp0 = x[i1];
455         tmp1 = x[i2];
456         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
457         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
458       }
459       if (n == sz-1){
460         tmp0  = x[*idx++];
461         sum1 += *v1++ * tmp0;
462         sum2 += *v2++ * tmp0;
463       }
464       y[row++]=sum1;
465       y[row++]=sum2;
466       v1      =v2;              /* Since the next block to be processed starts there*/
467       idx    +=sz;
468       break;
469     case 3:
470       sum1  = 0;
471       sum2  = 0;
472       sum3  = 0;
473       v2    = v1 + n;
474       v3    = v2 + n;
475 
476       for (n = 0; n< sz-1; n+=2) {
477         i1   = idx[0];
478         i2   = idx[1];
479         idx += 2;
480         tmp0 = x[i1];
481         tmp1 = x[i2];
482         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
483         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
484         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
485       }
486       if (n == sz-1){
487         tmp0  = x[*idx++];
488         sum1 += *v1++ * tmp0;
489         sum2 += *v2++ * tmp0;
490         sum3 += *v3++ * tmp0;
491       }
492       y[row++]=sum1;
493       y[row++]=sum2;
494       y[row++]=sum3;
495       v1       =v3;             /* Since the next block to be processed starts there*/
496       idx     +=2*sz;
497       break;
498     case 4:
499       sum1  = 0;
500       sum2  = 0;
501       sum3  = 0;
502       sum4  = 0;
503       v2    = v1 + n;
504       v3    = v2 + n;
505       v4    = v3 + n;
506 
507       for (n = 0; n< sz-1; n+=2) {
508         i1   = idx[0];
509         i2   = idx[1];
510         idx += 2;
511         tmp0 = x[i1];
512         tmp1 = x[i2];
513         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
514         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
515         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
516         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
517       }
518       if (n == sz-1){
519         tmp0  = x[*idx++];
520         sum1 += *v1++ * tmp0;
521         sum2 += *v2++ * tmp0;
522         sum3 += *v3++ * tmp0;
523         sum4 += *v4++ * tmp0;
524       }
525       y[row++]=sum1;
526       y[row++]=sum2;
527       y[row++]=sum3;
528       y[row++]=sum4;
529       v1      =v4;              /* Since the next block to be processed starts there*/
530       idx    +=3*sz;
531       break;
532     case 5:
533       sum1  = 0;
534       sum2  = 0;
535       sum3  = 0;
536       sum4  = 0;
537       sum5  = 0;
538       v2    = v1 + n;
539       v3    = v2 + n;
540       v4    = v3 + n;
541       v5    = v4 + n;
542 
543       for (n = 0; n<sz-1; n+=2) {
544         i1   = idx[0];
545         i2   = idx[1];
546         idx += 2;
547         tmp0 = x[i1];
548         tmp1 = x[i2];
549         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
550         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
551         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
552         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
553         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
554       }
555       if (n == sz-1){
556         tmp0  = x[*idx++];
557         sum1 += *v1++ * tmp0;
558         sum2 += *v2++ * tmp0;
559         sum3 += *v3++ * tmp0;
560         sum4 += *v4++ * tmp0;
561         sum5 += *v5++ * tmp0;
562       }
563       y[row++]=sum1;
564       y[row++]=sum2;
565       y[row++]=sum3;
566       y[row++]=sum4;
567       y[row++]=sum5;
568       v1      =v5;       /* Since the next block to be processed starts there */
569       idx    +=4*sz;
570       break;
571     default :
572       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
573     }
574   }
575   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
576   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
577   ierr = PetscLogFlops(2*a->nz - nonzerorow);CHKERRQ(ierr);
578   PetscFunctionReturn(0);
579 }
580 /* ----------------------------------------------------------- */
581 /* Almost same code as the MatMult_Inode() */
582 #undef __FUNCT__
583 #define __FUNCT__ "MatMultAdd_Inode"
584 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
585 {
586   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
587   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
588   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
589   PetscErrorCode ierr;
590   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
591 
592   PetscFunctionBegin;
593   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
594   node_max = a->inode.node_count;
595   ns       = a->inode.size;     /* Node Size array */
596   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
597   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
598   if (zz != yy) {
599     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
600   } else {
601     z = y;
602   }
603   zt = z;
604 
605   idx  = a->j;
606   v1   = a->a;
607   ii   = a->i;
608 
609   for (i = 0,row = 0; i< node_max; ++i){
610     nsz  = ns[i];
611     n    = ii[1] - ii[0];
612     ii  += nsz;
613     sz   = n;                   /* No of non zeros in this row */
614                                 /* Switch on the size of Node */
615     switch (nsz){               /* Each loop in 'case' is unrolled */
616     case 1 :
617       sum1  = *zt++;
618 
619       for(n = 0; n< sz-1; n+=2) {
620         i1   = idx[0];          /* The instructions are ordered to */
621         i2   = idx[1];          /* make the compiler's job easy */
622         idx += 2;
623         tmp0 = x[i1];
624         tmp1 = x[i2];
625         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
626        }
627 
628       if(n   == sz-1){          /* Take care of the last nonzero  */
629         tmp0  = x[*idx++];
630         sum1 += *v1++ * tmp0;
631       }
632       y[row++]=sum1;
633       break;
634     case 2:
635       sum1  = *zt++;
636       sum2  = *zt++;
637       v2    = v1 + n;
638 
639       for(n = 0; n< sz-1; n+=2) {
640         i1   = idx[0];
641         i2   = idx[1];
642         idx += 2;
643         tmp0 = x[i1];
644         tmp1 = x[i2];
645         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
646         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
647       }
648       if(n   == sz-1){
649         tmp0  = x[*idx++];
650         sum1 += *v1++ * tmp0;
651         sum2 += *v2++ * tmp0;
652       }
653       y[row++]=sum1;
654       y[row++]=sum2;
655       v1      =v2;              /* Since the next block to be processed starts there*/
656       idx    +=sz;
657       break;
658     case 3:
659       sum1  = *zt++;
660       sum2  = *zt++;
661       sum3  = *zt++;
662       v2    = v1 + n;
663       v3    = v2 + n;
664 
665       for (n = 0; n< sz-1; n+=2) {
666         i1   = idx[0];
667         i2   = idx[1];
668         idx += 2;
669         tmp0 = x[i1];
670         tmp1 = x[i2];
671         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
672         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
673         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
674       }
675       if (n == sz-1){
676         tmp0  = x[*idx++];
677         sum1 += *v1++ * tmp0;
678         sum2 += *v2++ * tmp0;
679         sum3 += *v3++ * tmp0;
680       }
681       y[row++]=sum1;
682       y[row++]=sum2;
683       y[row++]=sum3;
684       v1       =v3;             /* Since the next block to be processed starts there*/
685       idx     +=2*sz;
686       break;
687     case 4:
688       sum1  = *zt++;
689       sum2  = *zt++;
690       sum3  = *zt++;
691       sum4  = *zt++;
692       v2    = v1 + n;
693       v3    = v2 + n;
694       v4    = v3 + n;
695 
696       for (n = 0; n< sz-1; n+=2) {
697         i1   = idx[0];
698         i2   = idx[1];
699         idx += 2;
700         tmp0 = x[i1];
701         tmp1 = x[i2];
702         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
703         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
704         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
705         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
706       }
707       if (n == sz-1){
708         tmp0  = x[*idx++];
709         sum1 += *v1++ * tmp0;
710         sum2 += *v2++ * tmp0;
711         sum3 += *v3++ * tmp0;
712         sum4 += *v4++ * tmp0;
713       }
714       y[row++]=sum1;
715       y[row++]=sum2;
716       y[row++]=sum3;
717       y[row++]=sum4;
718       v1      =v4;              /* Since the next block to be processed starts there*/
719       idx    +=3*sz;
720       break;
721     case 5:
722       sum1  = *zt++;
723       sum2  = *zt++;
724       sum3  = *zt++;
725       sum4  = *zt++;
726       sum5  = *zt++;
727       v2    = v1 + n;
728       v3    = v2 + n;
729       v4    = v3 + n;
730       v5    = v4 + n;
731 
732       for (n = 0; n<sz-1; n+=2) {
733         i1   = idx[0];
734         i2   = idx[1];
735         idx += 2;
736         tmp0 = x[i1];
737         tmp1 = x[i2];
738         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
739         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
740         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
741         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
742         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
743       }
744       if(n   == sz-1){
745         tmp0  = x[*idx++];
746         sum1 += *v1++ * tmp0;
747         sum2 += *v2++ * tmp0;
748         sum3 += *v3++ * tmp0;
749         sum4 += *v4++ * tmp0;
750         sum5 += *v5++ * tmp0;
751       }
752       y[row++]=sum1;
753       y[row++]=sum2;
754       y[row++]=sum3;
755       y[row++]=sum4;
756       y[row++]=sum5;
757       v1      =v5;       /* Since the next block to be processed starts there */
758       idx    +=4*sz;
759       break;
760     default :
761       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
762     }
763   }
764   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
765   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
766   if (zz != yy) {
767     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
768   }
769   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
770   PetscFunctionReturn(0);
771 }
772 
773 /* ----------------------------------------------------------- */
774 #undef __FUNCT__
775 #define __FUNCT__ "MatSolve_Inode"
776 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
777 {
778   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
779   IS             iscol = a->col,isrow = a->row;
780   PetscErrorCode ierr;
781   PetscInt       *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
782   PetscInt       node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
783   PetscScalar    *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
784   PetscScalar    sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
785 
786   PetscFunctionBegin;
787   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
788   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
789   node_max = a->inode.node_count;
790   ns       = a->inode.size;     /* Node Size array */
791 
792   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
793   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
794   tmp  = a->solve_work;
795 
796   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
797   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
798 
799   /* forward solve the lower triangular */
800   tmps = tmp ;
801   aa   = a_a ;
802   aj   = a_j ;
803   ad   = a->diag;
804 
805   for (i = 0,row = 0; i< node_max; ++i){
806     nsz = ns[i];
807     aii = ai[row];
808     v1  = aa + aii;
809     vi  = aj + aii;
810     nz  = ad[row]- aii;
811 
812     switch (nsz){               /* Each loop in 'case' is unrolled */
813     case 1 :
814       sum1 = b[*r++];
815       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
816       for(j=0; j<nz-1; j+=2){
817         i0   = vi[0];
818         i1   = vi[1];
819         vi  +=2;
820         tmp0 = tmps[i0];
821         tmp1 = tmps[i1];
822         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
823       }
824       if(j == nz-1){
825         tmp0 = tmps[*vi++];
826         sum1 -= *v1++ *tmp0;
827       }
828       tmp[row ++]=sum1;
829       break;
830     case 2:
831       sum1 = b[*r++];
832       sum2 = b[*r++];
833       v2   = aa + ai[row+1];
834 
835       for(j=0; j<nz-1; j+=2){
836         i0   = vi[0];
837         i1   = vi[1];
838         vi  +=2;
839         tmp0 = tmps[i0];
840         tmp1 = tmps[i1];
841         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
842         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
843       }
844       if(j == nz-1){
845         tmp0 = tmps[*vi++];
846         sum1 -= *v1++ *tmp0;
847         sum2 -= *v2++ *tmp0;
848       }
849       sum2 -= *v2++ * sum1;
850       tmp[row ++]=sum1;
851       tmp[row ++]=sum2;
852       break;
853     case 3:
854       sum1 = b[*r++];
855       sum2 = b[*r++];
856       sum3 = b[*r++];
857       v2   = aa + ai[row+1];
858       v3   = aa + ai[row+2];
859 
860       for (j=0; j<nz-1; j+=2){
861         i0   = vi[0];
862         i1   = vi[1];
863         vi  +=2;
864         tmp0 = tmps[i0];
865         tmp1 = tmps[i1];
866         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
867         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
868         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
869       }
870       if (j == nz-1){
871         tmp0 = tmps[*vi++];
872         sum1 -= *v1++ *tmp0;
873         sum2 -= *v2++ *tmp0;
874         sum3 -= *v3++ *tmp0;
875       }
876       sum2 -= *v2++ * sum1;
877       sum3 -= *v3++ * sum1;
878       sum3 -= *v3++ * sum2;
879       tmp[row ++]=sum1;
880       tmp[row ++]=sum2;
881       tmp[row ++]=sum3;
882       break;
883 
884     case 4:
885       sum1 = b[*r++];
886       sum2 = b[*r++];
887       sum3 = b[*r++];
888       sum4 = b[*r++];
889       v2   = aa + ai[row+1];
890       v3   = aa + ai[row+2];
891       v4   = aa + ai[row+3];
892 
893       for (j=0; j<nz-1; j+=2){
894         i0   = vi[0];
895         i1   = vi[1];
896         vi  +=2;
897         tmp0 = tmps[i0];
898         tmp1 = tmps[i1];
899         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
900         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
901         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
902         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
903       }
904       if (j == nz-1){
905         tmp0 = tmps[*vi++];
906         sum1 -= *v1++ *tmp0;
907         sum2 -= *v2++ *tmp0;
908         sum3 -= *v3++ *tmp0;
909         sum4 -= *v4++ *tmp0;
910       }
911       sum2 -= *v2++ * sum1;
912       sum3 -= *v3++ * sum1;
913       sum4 -= *v4++ * sum1;
914       sum3 -= *v3++ * sum2;
915       sum4 -= *v4++ * sum2;
916       sum4 -= *v4++ * sum3;
917 
918       tmp[row ++]=sum1;
919       tmp[row ++]=sum2;
920       tmp[row ++]=sum3;
921       tmp[row ++]=sum4;
922       break;
923     case 5:
924       sum1 = b[*r++];
925       sum2 = b[*r++];
926       sum3 = b[*r++];
927       sum4 = b[*r++];
928       sum5 = b[*r++];
929       v2   = aa + ai[row+1];
930       v3   = aa + ai[row+2];
931       v4   = aa + ai[row+3];
932       v5   = aa + ai[row+4];
933 
934       for (j=0; j<nz-1; j+=2){
935         i0   = vi[0];
936         i1   = vi[1];
937         vi  +=2;
938         tmp0 = tmps[i0];
939         tmp1 = tmps[i1];
940         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
941         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
942         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
943         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
944         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
945       }
946       if (j == nz-1){
947         tmp0 = tmps[*vi++];
948         sum1 -= *v1++ *tmp0;
949         sum2 -= *v2++ *tmp0;
950         sum3 -= *v3++ *tmp0;
951         sum4 -= *v4++ *tmp0;
952         sum5 -= *v5++ *tmp0;
953       }
954 
955       sum2 -= *v2++ * sum1;
956       sum3 -= *v3++ * sum1;
957       sum4 -= *v4++ * sum1;
958       sum5 -= *v5++ * sum1;
959       sum3 -= *v3++ * sum2;
960       sum4 -= *v4++ * sum2;
961       sum5 -= *v5++ * sum2;
962       sum4 -= *v4++ * sum3;
963       sum5 -= *v5++ * sum3;
964       sum5 -= *v5++ * sum4;
965 
966       tmp[row ++]=sum1;
967       tmp[row ++]=sum2;
968       tmp[row ++]=sum3;
969       tmp[row ++]=sum4;
970       tmp[row ++]=sum5;
971       break;
972     default:
973       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
974     }
975   }
976   /* backward solve the upper triangular */
977   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
978     nsz = ns[i];
979     aii = ai[row+1] -1;
980     v1  = aa + aii;
981     vi  = aj + aii;
982     nz  = aii- ad[row];
983     switch (nsz){               /* Each loop in 'case' is unrolled */
984     case 1 :
985       sum1 = tmp[row];
986 
987       for(j=nz ; j>1; j-=2){
988         vi  -=2;
989         i0   = vi[2];
990         i1   = vi[1];
991         tmp0 = tmps[i0];
992         tmp1 = tmps[i1];
993         v1   -= 2;
994         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
995       }
996       if (j==1){
997         tmp0  = tmps[*vi--];
998         sum1 -= *v1-- * tmp0;
999       }
1000       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1001       break;
1002     case 2 :
1003       sum1 = tmp[row];
1004       sum2 = tmp[row -1];
1005       v2   = aa + ai[row]-1;
1006       for (j=nz ; j>1; j-=2){
1007         vi  -=2;
1008         i0   = vi[2];
1009         i1   = vi[1];
1010         tmp0 = tmps[i0];
1011         tmp1 = tmps[i1];
1012         v1   -= 2;
1013         v2   -= 2;
1014         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1015         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1016       }
1017       if (j==1){
1018         tmp0  = tmps[*vi--];
1019         sum1 -= *v1-- * tmp0;
1020         sum2 -= *v2-- * tmp0;
1021       }
1022 
1023       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1024       sum2   -= *v2-- * tmp0;
1025       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1026       break;
1027     case 3 :
1028       sum1 = tmp[row];
1029       sum2 = tmp[row -1];
1030       sum3 = tmp[row -2];
1031       v2   = aa + ai[row]-1;
1032       v3   = aa + ai[row -1]-1;
1033       for (j=nz ; j>1; j-=2){
1034         vi  -=2;
1035         i0   = vi[2];
1036         i1   = vi[1];
1037         tmp0 = tmps[i0];
1038         tmp1 = tmps[i1];
1039         v1   -= 2;
1040         v2   -= 2;
1041         v3   -= 2;
1042         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1043         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1044         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1045       }
1046       if (j==1){
1047         tmp0  = tmps[*vi--];
1048         sum1 -= *v1-- * tmp0;
1049         sum2 -= *v2-- * tmp0;
1050         sum3 -= *v3-- * tmp0;
1051       }
1052       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1053       sum2   -= *v2-- * tmp0;
1054       sum3   -= *v3-- * tmp0;
1055       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1056       sum3   -= *v3-- * tmp0;
1057       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1058 
1059       break;
1060     case 4 :
1061       sum1 = tmp[row];
1062       sum2 = tmp[row -1];
1063       sum3 = tmp[row -2];
1064       sum4 = tmp[row -3];
1065       v2   = aa + ai[row]-1;
1066       v3   = aa + ai[row -1]-1;
1067       v4   = aa + ai[row -2]-1;
1068 
1069       for (j=nz ; j>1; j-=2){
1070         vi  -=2;
1071         i0   = vi[2];
1072         i1   = vi[1];
1073         tmp0 = tmps[i0];
1074         tmp1 = tmps[i1];
1075         v1  -= 2;
1076         v2  -= 2;
1077         v3  -= 2;
1078         v4  -= 2;
1079         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1080         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1081         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1082         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1083       }
1084       if (j==1){
1085         tmp0  = tmps[*vi--];
1086         sum1 -= *v1-- * tmp0;
1087         sum2 -= *v2-- * tmp0;
1088         sum3 -= *v3-- * tmp0;
1089         sum4 -= *v4-- * tmp0;
1090       }
1091 
1092       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1093       sum2   -= *v2-- * tmp0;
1094       sum3   -= *v3-- * tmp0;
1095       sum4   -= *v4-- * tmp0;
1096       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1097       sum3   -= *v3-- * tmp0;
1098       sum4   -= *v4-- * tmp0;
1099       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1100       sum4   -= *v4-- * tmp0;
1101       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1102       break;
1103     case 5 :
1104       sum1 = tmp[row];
1105       sum2 = tmp[row -1];
1106       sum3 = tmp[row -2];
1107       sum4 = tmp[row -3];
1108       sum5 = tmp[row -4];
1109       v2   = aa + ai[row]-1;
1110       v3   = aa + ai[row -1]-1;
1111       v4   = aa + ai[row -2]-1;
1112       v5   = aa + ai[row -3]-1;
1113       for (j=nz ; j>1; j-=2){
1114         vi  -= 2;
1115         i0   = vi[2];
1116         i1   = vi[1];
1117         tmp0 = tmps[i0];
1118         tmp1 = tmps[i1];
1119         v1   -= 2;
1120         v2   -= 2;
1121         v3   -= 2;
1122         v4   -= 2;
1123         v5   -= 2;
1124         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1125         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1126         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1127         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1128         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1129       }
1130       if (j==1){
1131         tmp0  = tmps[*vi--];
1132         sum1 -= *v1-- * tmp0;
1133         sum2 -= *v2-- * tmp0;
1134         sum3 -= *v3-- * tmp0;
1135         sum4 -= *v4-- * tmp0;
1136         sum5 -= *v5-- * tmp0;
1137       }
1138 
1139       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1140       sum2   -= *v2-- * tmp0;
1141       sum3   -= *v3-- * tmp0;
1142       sum4   -= *v4-- * tmp0;
1143       sum5   -= *v5-- * tmp0;
1144       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1145       sum3   -= *v3-- * tmp0;
1146       sum4   -= *v4-- * tmp0;
1147       sum5   -= *v5-- * tmp0;
1148       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1149       sum4   -= *v4-- * tmp0;
1150       sum5   -= *v5-- * tmp0;
1151       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1152       sum5   -= *v5-- * tmp0;
1153       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1154       break;
1155     default:
1156       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1157     }
1158   }
1159   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1160   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1161   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1162   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1163   ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr);
1164   PetscFunctionReturn(0);
1165 }
1166 
1167 #undef __FUNCT__
1168 #define __FUNCT__ "MatLUFactorNumeric_Inode"
1169 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1170 {
1171   Mat            C = *B;
1172   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1173   IS             iscol = b->col,isrow = b->row,isicol = b->icol;
1174   PetscErrorCode ierr;
1175   PetscInt       *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1176   PetscInt       *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1177   PetscInt       *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1178   PetscInt       *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1179   PetscScalar    *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1180   PetscScalar    tmp,*ba = b->a,*aa = a->a,*pv;
1181   PetscReal      rs=0.0;
1182   LUShift_Ctx    sctx;
1183   PetscInt       newshift;
1184 
1185   PetscFunctionBegin;
1186   sctx.shift_top  = 0;
1187   sctx.nshift_max = 0;
1188   sctx.shift_lo   = 0;
1189   sctx.shift_hi   = 0;
1190 
1191   /* if both shift schemes are chosen by user, only use info->shiftpd */
1192   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1193   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1194     sctx.shift_top = 0;
1195     for (i=0; i<n; i++) {
1196       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1197       rs    = 0.0;
1198       ajtmp = aj + ai[i];
1199       rtmp1 = aa + ai[i];
1200       nz = ai[i+1] - ai[i];
1201       for (j=0; j<nz; j++){
1202         if (*ajtmp != i){
1203           rs += PetscAbsScalar(*rtmp1++);
1204         } else {
1205           rs -= PetscRealPart(*rtmp1++);
1206         }
1207         ajtmp++;
1208       }
1209       if (rs>sctx.shift_top) sctx.shift_top = rs;
1210     }
1211     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1212     sctx.shift_top *= 1.1;
1213     sctx.nshift_max = 5;
1214     sctx.shift_lo   = 0.;
1215     sctx.shift_hi   = 1.;
1216   }
1217   sctx.shift_amount = 0;
1218   sctx.nshift       = 0;
1219 
1220   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1221   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1222   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1223   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1224   ierr  = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1225   ics   = ic ;
1226   rtmp2 = rtmp1 + n;
1227   rtmp3 = rtmp2 + n;
1228 
1229   node_max = a->inode.node_count;
1230   ns       = a->inode.size ;
1231   if (!ns){
1232     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1233   }
1234 
1235   /* If max inode size > 3, split it into two inodes.*/
1236   /* also map the inode sizes according to the ordering */
1237   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1238   for (i=0,j=0; i<node_max; ++i,++j){
1239     if (ns[i]>3) {
1240       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1241       ++j;
1242       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1243     } else {
1244       tmp_vec1[j] = ns[i];
1245     }
1246   }
1247   /* Use the correct node_max */
1248   node_max = j;
1249 
1250   /* Now reorder the inode info based on mat re-ordering info */
1251   /* First create a row -> inode_size_array_index map */
1252   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1253   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1254   for (i=0,row=0; i<node_max; i++) {
1255     nodesz = tmp_vec1[i];
1256     for (j=0; j<nodesz; j++,row++) {
1257       nsmap[row] = i;
1258     }
1259   }
1260   /* Using nsmap, create a reordered ns structure */
1261   for (i=0,j=0; i< node_max; i++) {
1262     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1263     tmp_vec2[i]  = nodesz;
1264     j           += nodesz;
1265   }
1266   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1267   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1268   /* Now use the correct ns */
1269   ns = tmp_vec2;
1270 
1271   do {
1272     sctx.lushift = PETSC_FALSE;
1273     /* Now loop over each block-row, and do the factorization */
1274     for (i=0,row=0; i<node_max; i++) {
1275       nodesz = ns[i];
1276       nz     = bi[row+1] - bi[row];
1277       bjtmp  = bj + bi[row];
1278 
1279       switch (nodesz){
1280       case 1:
1281         for  (j=0; j<nz; j++){
1282           idx        = bjtmp[j];
1283           rtmp1[idx] = 0.0;
1284         }
1285 
1286         /* load in initial (unfactored row) */
1287         idx    = r[row];
1288         nz_tmp = ai[idx+1] - ai[idx];
1289         ajtmp  = aj + ai[idx];
1290         v1     = aa + ai[idx];
1291 
1292         for (j=0; j<nz_tmp; j++) {
1293           idx        = ics[ajtmp[j]];
1294           rtmp1[idx] = v1[j];
1295         }
1296         rtmp1[ics[r[row]]] += sctx.shift_amount;
1297 
1298         prow = *bjtmp++ ;
1299         while (prow < row) {
1300           pc1 = rtmp1 + prow;
1301           if (*pc1 != 0.0){
1302             pv   = ba + bd[prow];
1303             pj   = nbj + bd[prow];
1304             mul1 = *pc1 * *pv++;
1305             *pc1 = mul1;
1306             nz_tmp = bi[prow+1] - bd[prow] - 1;
1307             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1308             for (j=0; j<nz_tmp; j++) {
1309               tmp = pv[j];
1310               idx = pj[j];
1311               rtmp1[idx] -= mul1 * tmp;
1312             }
1313           }
1314           prow = *bjtmp++ ;
1315         }
1316         pj  = bj + bi[row];
1317         pc1 = ba + bi[row];
1318 
1319         sctx.pv    = rtmp1[row];
1320         rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */
1321         rs         = 0.0;
1322         for (j=0; j<nz; j++) {
1323           idx    = pj[j];
1324           pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */
1325           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1326         }
1327         sctx.rs  = rs;
1328         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1329         if (newshift == 1) goto endofwhile;
1330         break;
1331 
1332       case 2:
1333         for (j=0; j<nz; j++) {
1334           idx        = bjtmp[j];
1335           rtmp1[idx] = 0.0;
1336           rtmp2[idx] = 0.0;
1337         }
1338 
1339         /* load in initial (unfactored row) */
1340         idx    = r[row];
1341         nz_tmp = ai[idx+1] - ai[idx];
1342         ajtmp  = aj + ai[idx];
1343         v1     = aa + ai[idx];
1344         v2     = aa + ai[idx+1];
1345         for (j=0; j<nz_tmp; j++) {
1346           idx        = ics[ajtmp[j]];
1347           rtmp1[idx] = v1[j];
1348           rtmp2[idx] = v2[j];
1349         }
1350         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1351         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1352 
1353         prow = *bjtmp++ ;
1354         while (prow < row) {
1355           pc1 = rtmp1 + prow;
1356           pc2 = rtmp2 + prow;
1357           if (*pc1 != 0.0 || *pc2 != 0.0){
1358             pv   = ba + bd[prow];
1359             pj   = nbj + bd[prow];
1360             mul1 = *pc1 * *pv;
1361             mul2 = *pc2 * *pv;
1362             ++pv;
1363             *pc1 = mul1;
1364             *pc2 = mul2;
1365 
1366             nz_tmp = bi[prow+1] - bd[prow] - 1;
1367             for (j=0; j<nz_tmp; j++) {
1368               tmp = pv[j];
1369               idx = pj[j];
1370               rtmp1[idx] -= mul1 * tmp;
1371               rtmp2[idx] -= mul2 * tmp;
1372             }
1373             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1374           }
1375           prow = *bjtmp++ ;
1376         }
1377 
1378         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1379         pc1 = rtmp1 + prow;
1380         pc2 = rtmp2 + prow;
1381 
1382         sctx.pv = *pc1;
1383         pj      = bj + bi[prow];
1384         rs      = 0.0;
1385         for (j=0; j<nz; j++){
1386           idx = pj[j];
1387           if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]);
1388         }
1389         sctx.rs = rs;
1390         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1391         if (newshift == 1) goto endofwhile;
1392 
1393         if (*pc2 != 0.0){
1394           pj     = nbj + bd[prow];
1395           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1396           *pc2   = mul2;
1397           nz_tmp = bi[prow+1] - bd[prow] - 1;
1398           for (j=0; j<nz_tmp; j++) {
1399             idx = pj[j] ;
1400             tmp = rtmp1[idx];
1401             rtmp2[idx] -= mul2 * tmp;
1402           }
1403           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1404         }
1405 
1406         pj  = bj + bi[row];
1407         pc1 = ba + bi[row];
1408         pc2 = ba + bi[row+1];
1409 
1410         sctx.pv = rtmp2[row+1];
1411         rs = 0.0;
1412         rtmp1[row]   = 1.0/rtmp1[row];
1413         rtmp2[row+1] = 1.0/rtmp2[row+1];
1414         /* copy row entries from dense representation to sparse */
1415         for (j=0; j<nz; j++) {
1416           idx    = pj[j];
1417           pc1[j] = rtmp1[idx];
1418           pc2[j] = rtmp2[idx];
1419           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1420         }
1421         sctx.rs = rs;
1422         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1423         if (newshift == 1) goto endofwhile;
1424         break;
1425 
1426       case 3:
1427         for  (j=0; j<nz; j++) {
1428           idx        = bjtmp[j];
1429           rtmp1[idx] = 0.0;
1430           rtmp2[idx] = 0.0;
1431           rtmp3[idx] = 0.0;
1432         }
1433         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1434         idx    = r[row];
1435         nz_tmp = ai[idx+1] - ai[idx];
1436         ajtmp = aj + ai[idx];
1437         v1    = aa + ai[idx];
1438         v2    = aa + ai[idx+1];
1439         v3    = aa + ai[idx+2];
1440         for (j=0; j<nz_tmp; j++) {
1441           idx        = ics[ajtmp[j]];
1442           rtmp1[idx] = v1[j];
1443           rtmp2[idx] = v2[j];
1444           rtmp3[idx] = v3[j];
1445         }
1446         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1447         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1448         rtmp3[ics[r[row+2]]] += sctx.shift_amount;
1449 
1450         /* loop over all pivot row blocks above this row block */
1451         prow = *bjtmp++ ;
1452         while (prow < row) {
1453           pc1 = rtmp1 + prow;
1454           pc2 = rtmp2 + prow;
1455           pc3 = rtmp3 + prow;
1456           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1457             pv   = ba  + bd[prow];
1458             pj   = nbj + bd[prow];
1459             mul1 = *pc1 * *pv;
1460             mul2 = *pc2 * *pv;
1461             mul3 = *pc3 * *pv;
1462             ++pv;
1463             *pc1 = mul1;
1464             *pc2 = mul2;
1465             *pc3 = mul3;
1466 
1467             nz_tmp = bi[prow+1] - bd[prow] - 1;
1468             /* update this row based on pivot row */
1469             for (j=0; j<nz_tmp; j++) {
1470               tmp = pv[j];
1471               idx = pj[j];
1472               rtmp1[idx] -= mul1 * tmp;
1473               rtmp2[idx] -= mul2 * tmp;
1474               rtmp3[idx] -= mul3 * tmp;
1475             }
1476             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1477           }
1478           prow = *bjtmp++ ;
1479         }
1480 
1481         /* Now take care of diagonal 3x3 block in this set of rows */
1482         /* note: prow = row here */
1483         pc1 = rtmp1 + prow;
1484         pc2 = rtmp2 + prow;
1485         pc3 = rtmp3 + prow;
1486 
1487         sctx.pv = *pc1;
1488         pj      = bj + bi[prow];
1489         rs      = 0.0;
1490         for (j=0; j<nz; j++){
1491           idx = pj[j];
1492           if (idx != row) rs += PetscAbsScalar(rtmp1[idx]);
1493         }
1494         sctx.rs = rs;
1495         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1496         if (newshift == 1) goto endofwhile;
1497 
1498         if (*pc2 != 0.0 || *pc3 != 0.0){
1499           mul2 = (*pc2)/(*pc1);
1500           mul3 = (*pc3)/(*pc1);
1501           *pc2 = mul2;
1502           *pc3 = mul3;
1503           nz_tmp = bi[prow+1] - bd[prow] - 1;
1504           pj     = nbj + bd[prow];
1505           for (j=0; j<nz_tmp; j++) {
1506             idx = pj[j] ;
1507             tmp = rtmp1[idx];
1508             rtmp2[idx] -= mul2 * tmp;
1509             rtmp3[idx] -= mul3 * tmp;
1510           }
1511           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1512         }
1513         ++prow;
1514 
1515         pc2 = rtmp2 + prow;
1516         pc3 = rtmp3 + prow;
1517         sctx.pv = *pc2;
1518         pj      = bj + bi[prow];
1519         rs      = 0.0;
1520         for (j=0; j<nz; j++){
1521           idx = pj[j];
1522           if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1523         }
1524         sctx.rs = rs;
1525         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1526         if (newshift == 1) goto endofwhile;
1527 
1528         if (*pc3 != 0.0){
1529           mul3   = (*pc3)/(*pc2);
1530           *pc3   = mul3;
1531           pj     = nbj + bd[prow];
1532           nz_tmp = bi[prow+1] - bd[prow] - 1;
1533           for (j=0; j<nz_tmp; j++) {
1534             idx = pj[j] ;
1535             tmp = rtmp2[idx];
1536             rtmp3[idx] -= mul3 * tmp;
1537           }
1538           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1539         }
1540 
1541         pj  = bj + bi[row];
1542         pc1 = ba + bi[row];
1543         pc2 = ba + bi[row+1];
1544         pc3 = ba + bi[row+2];
1545 
1546         sctx.pv = rtmp3[row+2];
1547         rs = 0.0;
1548         rtmp1[row]   = 1.0/rtmp1[row];
1549         rtmp2[row+1] = 1.0/rtmp2[row+1];
1550         rtmp3[row+2] = 1.0/rtmp3[row+2];
1551         /* copy row entries from dense representation to sparse */
1552         for (j=0; j<nz; j++) {
1553           idx    = pj[j];
1554           pc1[j] = rtmp1[idx];
1555           pc2[j] = rtmp2[idx];
1556           pc3[j] = rtmp3[idx];
1557           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1558         }
1559 
1560         sctx.rs = rs;
1561         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1562         if (newshift == 1) goto endofwhile;
1563         break;
1564 
1565       default:
1566         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1567       }
1568       row += nodesz;                 /* Update the row */
1569     }
1570     endofwhile:;
1571   } while (sctx.lushift);
1572   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1573   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1574   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1575   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1576   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1577   C->factor      = FACTOR_LU;
1578   C->assembled   = PETSC_TRUE;
1579   if (sctx.nshift) {
1580     if (info->shiftnz) {
1581       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1582     } else if (info->shiftpd) {
1583       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1584     }
1585   }
1586   ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr);
1587   PetscFunctionReturn(0);
1588 }
1589 
1590 /*
1591      Makes a longer coloring[] array and calls the usual code with that
1592 */
1593 #undef __FUNCT__
1594 #define __FUNCT__ "MatColoringPatch_Inode"
1595 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1596 {
1597   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1598   PetscErrorCode  ierr;
1599   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1600   PetscInt        *colorused,i;
1601   ISColoringValue *newcolor;
1602 
1603   PetscFunctionBegin;
1604   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1605   /* loop over inodes, marking a color for each column*/
1606   row = 0;
1607   for (i=0; i<m; i++){
1608     for (j=0; j<ns[i]; j++) {
1609       newcolor[row++] = coloring[i] + j*ncolors;
1610     }
1611   }
1612 
1613   /* eliminate unneeded colors */
1614   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1615   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1616   for (i=0; i<n; i++) {
1617     colorused[newcolor[i]] = 1;
1618   }
1619 
1620   for (i=1; i<5*ncolors; i++) {
1621     colorused[i] += colorused[i-1];
1622   }
1623   ncolors = colorused[5*ncolors-1];
1624   for (i=0; i<n; i++) {
1625     newcolor[i] = colorused[newcolor[i]]-1;
1626   }
1627   ierr = PetscFree(colorused);CHKERRQ(ierr);
1628   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1629   ierr = PetscFree(coloring);CHKERRQ(ierr);
1630   PetscFunctionReturn(0);
1631 }
1632 
1633 #include "src/inline/ilu.h"
1634 
1635 #undef __FUNCT__
1636 #define __FUNCT__ "MatRelax_Inode"
1637 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1638 {
1639   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
1640   PetscScalar     *x,*xs,*ibdiag,*bdiag,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3,*v1,*v2,*v3,*v4,*v5;
1641   PetscScalar     *v = a->a,*b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1642   PetscReal       zeropivot = 1.0e-15;
1643   PetscErrorCode  ierr;
1644   PetscInt        n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1645   PetscInt        *idx,*diag = a->diag,*ii = a->i,sz,k;
1646 
1647   PetscFunctionBegin;
1648   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1649   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1650   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode");
1651 
1652   if (!a->inode.ibdiagvalid) {
1653     if (!a->inode.ibdiag) {
1654       /* calculate space needed for diagonal blocks */
1655       for (i=0; i<m; i++) {
1656 	cnt += sizes[i]*sizes[i];
1657       }
1658       a->inode.bdiagsize = cnt;
1659       ierr   = PetscMalloc2(cnt,PetscScalar,&a->inode.ibdiag,cnt,PetscScalar,&a->inode.bdiag);CHKERRQ(ierr);
1660     }
1661 
1662     /* copy over the diagonal blocks and invert them */
1663     ibdiag = a->inode.ibdiag;
1664     bdiag  = a->inode.bdiag;
1665     cnt = 0;
1666     for (i=0, row = 0; i<m; i++) {
1667       for (j=0; j<sizes[i]; j++) {
1668         for (k=0; k<sizes[i]; k++) {
1669           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1670         }
1671       }
1672       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(PetscScalar));CHKERRQ(ierr);
1673 
1674       switch(sizes[i]) {
1675         case 1:
1676           /* Create matrix data structure */
1677           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1678           ibdiag[cnt] = 1.0/ibdiag[cnt];
1679           break;
1680         case 2:
1681           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt);CHKERRQ(ierr);
1682           break;
1683         case 3:
1684           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt);CHKERRQ(ierr);
1685           break;
1686         case 4:
1687           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt);CHKERRQ(ierr);
1688           break;
1689         case 5:
1690           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt);CHKERRQ(ierr);
1691           break;
1692        default:
1693 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1694       }
1695       cnt += sizes[i]*sizes[i];
1696       row += sizes[i];
1697     }
1698     a->inode.ibdiagvalid = PETSC_TRUE;
1699   }
1700   ibdiag = a->inode.ibdiag;
1701   bdiag  = a->inode.bdiag;
1702 
1703   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1704   if (xx != bb) {
1705     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1706   } else {
1707     b = x;
1708   }
1709 
1710   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1711   xs   = x;
1712   if (flag & SOR_ZERO_INITIAL_GUESS) {
1713     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1714 
1715       for (i=0, row=0; i<m; i++) {
1716         sz  = diag[row] - ii[row];
1717         v1  = a->a + ii[row];
1718         idx = a->j + ii[row];
1719 
1720         /* see comments for MatMult_Inode() for how this is coded */
1721         switch (sizes[i]){
1722           case 1:
1723 
1724             sum1  = b[row];
1725             for(n = 0; n<sz-1; n+=2) {
1726               i1   = idx[0];
1727               i2   = idx[1];
1728               idx += 2;
1729               tmp0 = x[i1];
1730               tmp1 = x[i2];
1731               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1732             }
1733 
1734             if (n == sz-1){
1735               tmp0  = x[*idx];
1736               sum1 -= *v1 * tmp0;
1737             }
1738             x[row++] = sum1*(*ibdiag++);
1739             break;
1740           case 2:
1741             v2    = a->a + ii[row+1];
1742             sum1  = b[row];
1743             sum2  = b[row+1];
1744             for(n = 0; n<sz-1; n+=2) {
1745               i1   = idx[0];
1746               i2   = idx[1];
1747               idx += 2;
1748               tmp0 = x[i1];
1749               tmp1 = x[i2];
1750               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1751               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1752             }
1753 
1754             if (n == sz-1){
1755               tmp0  = x[*idx];
1756               sum1 -= v1[0] * tmp0;
1757               sum2 -= v2[0] * tmp0;
1758             }
1759             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1760             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1761             ibdiag  += 4;
1762             break;
1763           case 3:
1764             v2    = a->a + ii[row+1];
1765             v3    = a->a + ii[row+2];
1766             sum1  = b[row];
1767             sum2  = b[row+1];
1768             sum3  = b[row+2];
1769             for(n = 0; n<sz-1; n+=2) {
1770               i1   = idx[0];
1771               i2   = idx[1];
1772               idx += 2;
1773               tmp0 = x[i1];
1774               tmp1 = x[i2];
1775               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1776               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1777               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1778             }
1779 
1780             if (n == sz-1){
1781               tmp0  = x[*idx];
1782               sum1 -= v1[0] * tmp0;
1783               sum2 -= v2[0] * tmp0;
1784               sum3 -= v3[0] * tmp0;
1785             }
1786             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1787             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1788             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1789             ibdiag  += 9;
1790             break;
1791           case 4:
1792             v2    = a->a + ii[row+1];
1793             v3    = a->a + ii[row+2];
1794             v4    = a->a + ii[row+3];
1795             sum1  = b[row];
1796             sum2  = b[row+1];
1797             sum3  = b[row+2];
1798             sum4  = b[row+3];
1799             for(n = 0; n<sz-1; n+=2) {
1800               i1   = idx[0];
1801               i2   = idx[1];
1802               idx += 2;
1803               tmp0 = x[i1];
1804               tmp1 = x[i2];
1805               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1806               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1807               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1808               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1809             }
1810 
1811             if (n == sz-1){
1812               tmp0  = x[*idx];
1813               sum1 -= v1[0] * tmp0;
1814               sum2 -= v2[0] * tmp0;
1815               sum3 -= v3[0] * tmp0;
1816               sum4 -= v4[0] * tmp0;
1817             }
1818             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1819             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1820             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1821             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1822             ibdiag  += 16;
1823             break;
1824           case 5:
1825             v2    = a->a + ii[row+1];
1826             v3    = a->a + ii[row+2];
1827             v4    = a->a + ii[row+3];
1828             v5    = a->a + ii[row+4];
1829             sum1  = b[row];
1830             sum2  = b[row+1];
1831             sum3  = b[row+2];
1832             sum4  = b[row+3];
1833             sum5  = b[row+4];
1834             for(n = 0; n<sz-1; n+=2) {
1835               i1   = idx[0];
1836               i2   = idx[1];
1837               idx += 2;
1838               tmp0 = x[i1];
1839               tmp1 = x[i2];
1840               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1841               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1842               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1843               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1844               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1845             }
1846 
1847             if (n == sz-1){
1848               tmp0  = x[*idx];
1849               sum1 -= v1[0] * tmp0;
1850               sum2 -= v2[0] * tmp0;
1851               sum3 -= v3[0] * tmp0;
1852               sum4 -= v4[0] * tmp0;
1853               sum5 -= v5[0] * tmp0;
1854             }
1855             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1856             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1857             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1858             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1859             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1860             ibdiag  += 25;
1861             break;
1862 	  default:
1863    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1864         }
1865       }
1866 
1867       xb = x;
1868       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1869     } else xb = b;
1870     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1871         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1872       cnt = 0;
1873       for (i=0, row=0; i<m; i++) {
1874 
1875         switch (sizes[i]){
1876           case 1:
1877             x[row++] *= bdiag[cnt++];
1878             break;
1879           case 2:
1880             x1   = x[row]; x2 = x[row+1];
1881             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1882             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1883             x[row++] = tmp1;
1884             x[row++] = tmp2;
1885             cnt += 4;
1886             break;
1887           case 3:
1888             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1889             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1890             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1891             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1892             x[row++] = tmp1;
1893             x[row++] = tmp2;
1894             x[row++] = tmp3;
1895             cnt += 9;
1896             break;
1897           case 4:
1898             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1899             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1900             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1901             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1902             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1903             x[row++] = tmp1;
1904             x[row++] = tmp2;
1905             x[row++] = tmp3;
1906             x[row++] = tmp4;
1907             cnt += 16;
1908             break;
1909           case 5:
1910             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1911             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1912             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1913             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1914             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1915             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1916             x[row++] = tmp1;
1917             x[row++] = tmp2;
1918             x[row++] = tmp3;
1919             x[row++] = tmp4;
1920             x[row++] = tmp5;
1921             cnt += 25;
1922             break;
1923 	  default:
1924    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1925         }
1926       }
1927       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1928     }
1929     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1930 
1931       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1932       for (i=m-1, row=A->rmap.n-1; i>=0; i--) {
1933         ibdiag -= sizes[i]*sizes[i];
1934         sz      = ii[row+1] - diag[row] - 1;
1935         v1      = a->a + diag[row] + 1;
1936         idx     = a->j + diag[row] + 1;
1937 
1938         /* see comments for MatMult_Inode() for how this is coded */
1939         switch (sizes[i]){
1940           case 1:
1941 
1942             sum1  = xb[row];
1943             for(n = 0; n<sz-1; n+=2) {
1944               i1   = idx[0];
1945               i2   = idx[1];
1946               idx += 2;
1947               tmp0 = x[i1];
1948               tmp1 = x[i2];
1949               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1950             }
1951 
1952             if (n == sz-1){
1953               tmp0  = x[*idx];
1954               sum1 -= *v1*tmp0;
1955             }
1956             x[row--] = sum1*(*ibdiag);
1957             break;
1958 
1959           case 2:
1960 
1961             sum1  = xb[row];
1962             sum2  = xb[row-1];
1963             /* note that sum1 is associated with the second of the two rows */
1964             v2    = a->a + diag[row-1] + 2;
1965             for(n = 0; n<sz-1; n+=2) {
1966               i1   = idx[0];
1967               i2   = idx[1];
1968               idx += 2;
1969               tmp0 = x[i1];
1970               tmp1 = x[i2];
1971               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1972               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1973             }
1974 
1975             if (n == sz-1){
1976               tmp0  = x[*idx];
1977               sum1 -= *v1*tmp0;
1978               sum2 -= *v2*tmp0;
1979             }
1980             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1981             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1982             break;
1983           case 3:
1984 
1985             sum1  = xb[row];
1986             sum2  = xb[row-1];
1987             sum3  = xb[row-2];
1988             v2    = a->a + diag[row-1] + 2;
1989             v3    = a->a + diag[row-2] + 3;
1990             for(n = 0; n<sz-1; n+=2) {
1991               i1   = idx[0];
1992               i2   = idx[1];
1993               idx += 2;
1994               tmp0 = x[i1];
1995               tmp1 = x[i2];
1996               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1997               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1998               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1999             }
2000 
2001             if (n == sz-1){
2002               tmp0  = x[*idx];
2003               sum1 -= *v1*tmp0;
2004               sum2 -= *v2*tmp0;
2005               sum3 -= *v3*tmp0;
2006             }
2007             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2008             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2009             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2010             break;
2011           case 4:
2012 
2013             sum1  = xb[row];
2014             sum2  = xb[row-1];
2015             sum3  = xb[row-2];
2016             sum4  = xb[row-3];
2017             v2    = a->a + diag[row-1] + 2;
2018             v3    = a->a + diag[row-2] + 3;
2019             v4    = a->a + diag[row-3] + 4;
2020             for(n = 0; n<sz-1; n+=2) {
2021               i1   = idx[0];
2022               i2   = idx[1];
2023               idx += 2;
2024               tmp0 = x[i1];
2025               tmp1 = x[i2];
2026               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2027               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2028               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2029               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2030             }
2031 
2032             if (n == sz-1){
2033               tmp0  = x[*idx];
2034               sum1 -= *v1*tmp0;
2035               sum2 -= *v2*tmp0;
2036               sum3 -= *v3*tmp0;
2037               sum4 -= *v4*tmp0;
2038             }
2039             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2040             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2041             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2042             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2043             break;
2044           case 5:
2045 
2046             sum1  = xb[row];
2047             sum2  = xb[row-1];
2048             sum3  = xb[row-2];
2049             sum4  = xb[row-3];
2050             sum5  = xb[row-4];
2051             v2    = a->a + diag[row-1] + 2;
2052             v3    = a->a + diag[row-2] + 3;
2053             v4    = a->a + diag[row-3] + 4;
2054             v5    = a->a + diag[row-4] + 5;
2055             for(n = 0; n<sz-1; n+=2) {
2056               i1   = idx[0];
2057               i2   = idx[1];
2058               idx += 2;
2059               tmp0 = x[i1];
2060               tmp1 = x[i2];
2061               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2062               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2063               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2064               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2065               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2066             }
2067 
2068             if (n == sz-1){
2069               tmp0  = x[*idx];
2070               sum1 -= *v1*tmp0;
2071               sum2 -= *v2*tmp0;
2072               sum3 -= *v3*tmp0;
2073               sum4 -= *v4*tmp0;
2074               sum5 -= *v5*tmp0;
2075             }
2076             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2077             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2078             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2079             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2080             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2081             break;
2082 	  default:
2083    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2084         }
2085       }
2086 
2087       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2088     }
2089     its--;
2090   }
2091   if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode");
2092   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2093   if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2094   PetscFunctionReturn(0);
2095 }
2096 
2097 
2098 /*
2099     samestructure indicates that the matrix has not changed its nonzero structure so we
2100     do not need to recompute the inodes
2101 */
2102 #undef __FUNCT__
2103 #define __FUNCT__ "Mat_CheckInode"
2104 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2105 {
2106   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2107   PetscErrorCode ierr;
2108   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2109   PetscTruth     flag;
2110 
2111   PetscFunctionBegin;
2112   if (!a->inode.use)                     PetscFunctionReturn(0);
2113   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2114 
2115 
2116   m = A->rmap.n;
2117   if (a->inode.size) {ns = a->inode.size;}
2118   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2119 
2120   i          = 0;
2121   node_count = 0;
2122   idx        = a->j;
2123   ii         = a->i;
2124   while (i < m){                /* For each row */
2125     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2126     /* Limits the number of elements in a node to 'a->inode.limit' */
2127     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2128       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2129       if (nzy != nzx) break;
2130       idy  += nzx;             /* Same nonzero pattern */
2131       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2132       if (!flag) break;
2133     }
2134     ns[node_count++] = blk_size;
2135     idx += blk_size*nzx;
2136     i    = j;
2137   }
2138   /* If not enough inodes found,, do not use inode version of the routines */
2139   if (!a->inode.size && m && node_count > .9*m) {
2140     ierr = PetscFree(ns);CHKERRQ(ierr);
2141     a->inode.node_count     = 0;
2142     a->inode.size           = PETSC_NULL;
2143     a->inode.use            = PETSC_FALSE;
2144     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2145   } else {
2146     A->ops->mult            = MatMult_Inode;
2147     A->ops->relax           = MatRelax_Inode;
2148     A->ops->multadd         = MatMultAdd_Inode;
2149     A->ops->solve           = MatSolve_Inode;
2150     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
2151     A->ops->getrowij        = MatGetRowIJ_Inode;
2152     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
2153     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
2154     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2155     A->ops->coloringpatch   = MatColoringPatch_Inode;
2156     a->inode.node_count     = node_count;
2157     a->inode.size           = ns;
2158     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2159   }
2160   PetscFunctionReturn(0);
2161 }
2162 
2163 /*
2164      This is really ugly. if inodes are used this replaces the
2165   permutations with ones that correspond to rows/cols of the matrix
2166   rather then inode blocks
2167 */
2168 #undef __FUNCT__
2169 #define __FUNCT__ "MatInodeAdjustForInodes"
2170 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2171 {
2172   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2173 
2174   PetscFunctionBegin;
2175   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2176   if (f) {
2177     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2178   }
2179   PetscFunctionReturn(0);
2180 }
2181 
2182 EXTERN_C_BEGIN
2183 #undef __FUNCT__
2184 #define __FUNCT__ "MatAdjustForInodes_Inode"
2185 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2186 {
2187   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2188   PetscErrorCode ierr;
2189   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
2190   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2191   PetscInt       nslim_col,*ns_col;
2192   IS             ris = *rperm,cis = *cperm;
2193 
2194   PetscFunctionBegin;
2195   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2196   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2197 
2198   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2199   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2200   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2201   permc = permr + m;
2202 
2203   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2204   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2205 
2206   /* Form the inode structure for the rows of permuted matric using inv perm*/
2207   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2208 
2209   /* Construct the permutations for rows*/
2210   for (i=0,row = 0; i<nslim_row; ++i){
2211     indx      = ridx[i];
2212     start_val = tns[indx];
2213     end_val   = tns[indx + 1];
2214     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2215   }
2216 
2217   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2218   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2219 
2220  /* Construct permutations for columns */
2221   for (i=0,col=0; i<nslim_col; ++i){
2222     indx      = cidx[i];
2223     start_val = tns[indx];
2224     end_val   = tns[indx + 1];
2225     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2226   }
2227 
2228   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2229   ISSetPermutation(*rperm);
2230   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2231   ISSetPermutation(*cperm);
2232 
2233   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2234   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2235 
2236   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2237   ierr = PetscFree(permr);CHKERRQ(ierr);
2238   ierr = ISDestroy(cis);CHKERRQ(ierr);
2239   ierr = ISDestroy(ris);CHKERRQ(ierr);
2240   ierr = PetscFree(tns);CHKERRQ(ierr);
2241   PetscFunctionReturn(0);
2242 }
2243 EXTERN_C_END
2244 
2245 #undef __FUNCT__
2246 #define __FUNCT__ "MatInodeGetInodeSizes"
2247 /*@C
2248    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2249 
2250    Collective on Mat
2251 
2252    Input Parameter:
2253 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2254 
2255    Output Parameter:
2256 +  node_count - no of inodes present in the matrix.
2257 .  sizes      - an array of size node_count,with sizes of each inode.
2258 -  limit      - the max size used to generate the inodes.
2259 
2260    Level: advanced
2261 
2262    Notes: This routine returns some internal storage information
2263    of the matrix, it is intended to be used by advanced users.
2264    It should be called after the matrix is assembled.
2265    The contents of the sizes[] array should not be changed.
2266    PETSC_NULL may be passed for information not requested.
2267 
2268 .keywords: matrix, seqaij, get, inode
2269 
2270 .seealso: MatGetInfo()
2271 @*/
2272 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2273 {
2274   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2275 
2276   PetscFunctionBegin;
2277   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2278   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2279   if (f) {
2280     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2281   }
2282   PetscFunctionReturn(0);
2283 }
2284 
2285 EXTERN_C_BEGIN
2286 #undef __FUNCT__
2287 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
2288 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2289 {
2290   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2291 
2292   PetscFunctionBegin;
2293   if (node_count) *node_count = a->inode.node_count;
2294   if (sizes)      *sizes      = a->inode.size;
2295   if (limit)      *limit      = a->inode.limit;
2296   PetscFunctionReturn(0);
2297 }
2298 EXTERN_C_END
2299