xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 719d5645761d844e4357b7ee00a3296dffe0b787)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_Inode"
397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     sz   = n;                   /* No of non zeros in this row */
427                                 /* Switch on the size of Node */
428     switch (nsz){               /* Each loop in 'case' is unrolled */
429     case 1 :
430       sum1  = 0;
431 
432       for(n = 0; n< sz-1; n+=2) {
433         i1   = idx[0];          /* The instructions are ordered to */
434         i2   = idx[1];          /* make the compiler's job easy */
435         idx += 2;
436         tmp0 = x[i1];
437         tmp1 = x[i2];
438         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439        }
440 
441       if (n == sz-1){          /* Take care of the last nonzero  */
442         tmp0  = x[*idx++];
443         sum1 += *v1++ * tmp0;
444       }
445       y[row++]=sum1;
446       break;
447     case 2:
448       sum1  = 0;
449       sum2  = 0;
450       v2    = v1 + n;
451 
452       for (n = 0; n< sz-1; n+=2) {
453         i1   = idx[0];
454         i2   = idx[1];
455         idx += 2;
456         tmp0 = x[i1];
457         tmp1 = x[i2];
458         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460       }
461       if (n == sz-1){
462         tmp0  = x[*idx++];
463         sum1 += *v1++ * tmp0;
464         sum2 += *v2++ * tmp0;
465       }
466       y[row++]=sum1;
467       y[row++]=sum2;
468       v1      =v2;              /* Since the next block to be processed starts there*/
469       idx    +=sz;
470       break;
471     case 3:
472       sum1  = 0;
473       sum2  = 0;
474       sum3  = 0;
475       v2    = v1 + n;
476       v3    = v2 + n;
477 
478       for (n = 0; n< sz-1; n+=2) {
479         i1   = idx[0];
480         i2   = idx[1];
481         idx += 2;
482         tmp0 = x[i1];
483         tmp1 = x[i2];
484         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487       }
488       if (n == sz-1){
489         tmp0  = x[*idx++];
490         sum1 += *v1++ * tmp0;
491         sum2 += *v2++ * tmp0;
492         sum3 += *v3++ * tmp0;
493       }
494       y[row++]=sum1;
495       y[row++]=sum2;
496       y[row++]=sum3;
497       v1       =v3;             /* Since the next block to be processed starts there*/
498       idx     +=2*sz;
499       break;
500     case 4:
501       sum1  = 0;
502       sum2  = 0;
503       sum3  = 0;
504       sum4  = 0;
505       v2    = v1 + n;
506       v3    = v2 + n;
507       v4    = v3 + n;
508 
509       for (n = 0; n< sz-1; n+=2) {
510         i1   = idx[0];
511         i2   = idx[1];
512         idx += 2;
513         tmp0 = x[i1];
514         tmp1 = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519       }
520       if (n == sz-1){
521         tmp0  = x[*idx++];
522         sum1 += *v1++ * tmp0;
523         sum2 += *v2++ * tmp0;
524         sum3 += *v3++ * tmp0;
525         sum4 += *v4++ * tmp0;
526       }
527       y[row++]=sum1;
528       y[row++]=sum2;
529       y[row++]=sum3;
530       y[row++]=sum4;
531       v1      =v4;              /* Since the next block to be processed starts there*/
532       idx    +=3*sz;
533       break;
534     case 5:
535       sum1  = 0;
536       sum2  = 0;
537       sum3  = 0;
538       sum4  = 0;
539       sum5  = 0;
540       v2    = v1 + n;
541       v3    = v2 + n;
542       v4    = v3 + n;
543       v5    = v4 + n;
544 
545       for (n = 0; n<sz-1; n+=2) {
546         i1   = idx[0];
547         i2   = idx[1];
548         idx += 2;
549         tmp0 = x[i1];
550         tmp1 = x[i2];
551         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556       }
557       if (n == sz-1){
558         tmp0  = x[*idx++];
559         sum1 += *v1++ * tmp0;
560         sum2 += *v2++ * tmp0;
561         sum3 += *v3++ * tmp0;
562         sum4 += *v4++ * tmp0;
563         sum5 += *v5++ * tmp0;
564       }
565       y[row++]=sum1;
566       y[row++]=sum2;
567       y[row++]=sum3;
568       y[row++]=sum4;
569       y[row++]=sum5;
570       v1      =v5;       /* Since the next block to be processed starts there */
571       idx    +=4*sz;
572       break;
573     default :
574       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
575     }
576   }
577   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
578   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
579   ierr = PetscLogFlops(2*a->nz - nonzerorow);CHKERRQ(ierr);
580   PetscFunctionReturn(0);
581 }
582 /* ----------------------------------------------------------- */
583 /* Almost same code as the MatMult_Inode() */
584 #undef __FUNCT__
585 #define __FUNCT__ "MatMultAdd_Inode"
586 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587 {
588   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
589   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590   MatScalar      *v1,*v2,*v3,*v4,*v5;
591   PetscScalar    *x,*y,*z,*zt;
592   PetscErrorCode ierr;
593   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
594 
595   PetscFunctionBegin;
596   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
597   node_max = a->inode.node_count;
598   ns       = a->inode.size;     /* Node Size array */
599   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
600   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
601   if (zz != yy) {
602     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
603   } else {
604     z = y;
605   }
606   zt = z;
607 
608   idx  = a->j;
609   v1   = a->a;
610   ii   = a->i;
611 
612   for (i = 0,row = 0; i< node_max; ++i){
613     nsz  = ns[i];
614     n    = ii[1] - ii[0];
615     ii  += nsz;
616     sz   = n;                   /* No of non zeros in this row */
617                                 /* Switch on the size of Node */
618     switch (nsz){               /* Each loop in 'case' is unrolled */
619     case 1 :
620       sum1  = *zt++;
621 
622       for(n = 0; n< sz-1; n+=2) {
623         i1   = idx[0];          /* The instructions are ordered to */
624         i2   = idx[1];          /* make the compiler's job easy */
625         idx += 2;
626         tmp0 = x[i1];
627         tmp1 = x[i2];
628         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
629        }
630 
631       if(n   == sz-1){          /* Take care of the last nonzero  */
632         tmp0  = x[*idx++];
633         sum1 += *v1++ * tmp0;
634       }
635       y[row++]=sum1;
636       break;
637     case 2:
638       sum1  = *zt++;
639       sum2  = *zt++;
640       v2    = v1 + n;
641 
642       for(n = 0; n< sz-1; n+=2) {
643         i1   = idx[0];
644         i2   = idx[1];
645         idx += 2;
646         tmp0 = x[i1];
647         tmp1 = x[i2];
648         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
649         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
650       }
651       if(n   == sz-1){
652         tmp0  = x[*idx++];
653         sum1 += *v1++ * tmp0;
654         sum2 += *v2++ * tmp0;
655       }
656       y[row++]=sum1;
657       y[row++]=sum2;
658       v1      =v2;              /* Since the next block to be processed starts there*/
659       idx    +=sz;
660       break;
661     case 3:
662       sum1  = *zt++;
663       sum2  = *zt++;
664       sum3  = *zt++;
665       v2    = v1 + n;
666       v3    = v2 + n;
667 
668       for (n = 0; n< sz-1; n+=2) {
669         i1   = idx[0];
670         i2   = idx[1];
671         idx += 2;
672         tmp0 = x[i1];
673         tmp1 = x[i2];
674         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
675         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
676         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
677       }
678       if (n == sz-1){
679         tmp0  = x[*idx++];
680         sum1 += *v1++ * tmp0;
681         sum2 += *v2++ * tmp0;
682         sum3 += *v3++ * tmp0;
683       }
684       y[row++]=sum1;
685       y[row++]=sum2;
686       y[row++]=sum3;
687       v1       =v3;             /* Since the next block to be processed starts there*/
688       idx     +=2*sz;
689       break;
690     case 4:
691       sum1  = *zt++;
692       sum2  = *zt++;
693       sum3  = *zt++;
694       sum4  = *zt++;
695       v2    = v1 + n;
696       v3    = v2 + n;
697       v4    = v3 + n;
698 
699       for (n = 0; n< sz-1; n+=2) {
700         i1   = idx[0];
701         i2   = idx[1];
702         idx += 2;
703         tmp0 = x[i1];
704         tmp1 = x[i2];
705         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
706         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
707         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
708         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
709       }
710       if (n == sz-1){
711         tmp0  = x[*idx++];
712         sum1 += *v1++ * tmp0;
713         sum2 += *v2++ * tmp0;
714         sum3 += *v3++ * tmp0;
715         sum4 += *v4++ * tmp0;
716       }
717       y[row++]=sum1;
718       y[row++]=sum2;
719       y[row++]=sum3;
720       y[row++]=sum4;
721       v1      =v4;              /* Since the next block to be processed starts there*/
722       idx    +=3*sz;
723       break;
724     case 5:
725       sum1  = *zt++;
726       sum2  = *zt++;
727       sum3  = *zt++;
728       sum4  = *zt++;
729       sum5  = *zt++;
730       v2    = v1 + n;
731       v3    = v2 + n;
732       v4    = v3 + n;
733       v5    = v4 + n;
734 
735       for (n = 0; n<sz-1; n+=2) {
736         i1   = idx[0];
737         i2   = idx[1];
738         idx += 2;
739         tmp0 = x[i1];
740         tmp1 = x[i2];
741         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
742         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
743         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
744         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
745         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
746       }
747       if(n   == sz-1){
748         tmp0  = x[*idx++];
749         sum1 += *v1++ * tmp0;
750         sum2 += *v2++ * tmp0;
751         sum3 += *v3++ * tmp0;
752         sum4 += *v4++ * tmp0;
753         sum5 += *v5++ * tmp0;
754       }
755       y[row++]=sum1;
756       y[row++]=sum2;
757       y[row++]=sum3;
758       y[row++]=sum4;
759       y[row++]=sum5;
760       v1      =v5;       /* Since the next block to be processed starts there */
761       idx    +=4*sz;
762       break;
763     default :
764       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
765     }
766   }
767   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
768   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
769   if (zz != yy) {
770     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
771   }
772   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
773   PetscFunctionReturn(0);
774 }
775 
776 /* ----------------------------------------------------------- */
777 #undef __FUNCT__
778 #define __FUNCT__ "MatSolve_Inode"
779 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
780 {
781   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
782   IS                iscol = a->col,isrow = a->row;
783   PetscErrorCode    ierr;
784   PetscInt          *r,*c,i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
785   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
786   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
787   PetscScalar       sum1,sum2,sum3,sum4,sum5;
788   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
789   const PetscScalar *b;
790 
791   PetscFunctionBegin;
792   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
793   node_max = a->inode.node_count;
794   ns       = a->inode.size;     /* Node Size array */
795 
796   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
797   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
798   tmp  = a->solve_work;
799 
800   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
801   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
802 
803   /* forward solve the lower triangular */
804   tmps = tmp ;
805   aa   = a_a ;
806   aj   = a_j ;
807   ad   = a->diag;
808 
809   for (i = 0,row = 0; i< node_max; ++i){
810     nsz = ns[i];
811     aii = ai[row];
812     v1  = aa + aii;
813     vi  = aj + aii;
814     nz  = ad[row]- aii;
815 
816     switch (nsz){               /* Each loop in 'case' is unrolled */
817     case 1 :
818       sum1 = b[*r++];
819       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
820       for(j=0; j<nz-1; j+=2){
821         i0   = vi[0];
822         i1   = vi[1];
823         vi  +=2;
824         tmp0 = tmps[i0];
825         tmp1 = tmps[i1];
826         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
827       }
828       if(j == nz-1){
829         tmp0 = tmps[*vi++];
830         sum1 -= *v1++ *tmp0;
831       }
832       tmp[row ++]=sum1;
833       break;
834     case 2:
835       sum1 = b[*r++];
836       sum2 = b[*r++];
837       v2   = aa + ai[row+1];
838 
839       for(j=0; j<nz-1; j+=2){
840         i0   = vi[0];
841         i1   = vi[1];
842         vi  +=2;
843         tmp0 = tmps[i0];
844         tmp1 = tmps[i1];
845         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
846         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
847       }
848       if(j == nz-1){
849         tmp0 = tmps[*vi++];
850         sum1 -= *v1++ *tmp0;
851         sum2 -= *v2++ *tmp0;
852       }
853       sum2 -= *v2++ * sum1;
854       tmp[row ++]=sum1;
855       tmp[row ++]=sum2;
856       break;
857     case 3:
858       sum1 = b[*r++];
859       sum2 = b[*r++];
860       sum3 = b[*r++];
861       v2   = aa + ai[row+1];
862       v3   = aa + ai[row+2];
863 
864       for (j=0; j<nz-1; j+=2){
865         i0   = vi[0];
866         i1   = vi[1];
867         vi  +=2;
868         tmp0 = tmps[i0];
869         tmp1 = tmps[i1];
870         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
871         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
872         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
873       }
874       if (j == nz-1){
875         tmp0 = tmps[*vi++];
876         sum1 -= *v1++ *tmp0;
877         sum2 -= *v2++ *tmp0;
878         sum3 -= *v3++ *tmp0;
879       }
880       sum2 -= *v2++ * sum1;
881       sum3 -= *v3++ * sum1;
882       sum3 -= *v3++ * sum2;
883       tmp[row ++]=sum1;
884       tmp[row ++]=sum2;
885       tmp[row ++]=sum3;
886       break;
887 
888     case 4:
889       sum1 = b[*r++];
890       sum2 = b[*r++];
891       sum3 = b[*r++];
892       sum4 = b[*r++];
893       v2   = aa + ai[row+1];
894       v3   = aa + ai[row+2];
895       v4   = aa + ai[row+3];
896 
897       for (j=0; j<nz-1; j+=2){
898         i0   = vi[0];
899         i1   = vi[1];
900         vi  +=2;
901         tmp0 = tmps[i0];
902         tmp1 = tmps[i1];
903         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
904         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
905         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
906         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
907       }
908       if (j == nz-1){
909         tmp0 = tmps[*vi++];
910         sum1 -= *v1++ *tmp0;
911         sum2 -= *v2++ *tmp0;
912         sum3 -= *v3++ *tmp0;
913         sum4 -= *v4++ *tmp0;
914       }
915       sum2 -= *v2++ * sum1;
916       sum3 -= *v3++ * sum1;
917       sum4 -= *v4++ * sum1;
918       sum3 -= *v3++ * sum2;
919       sum4 -= *v4++ * sum2;
920       sum4 -= *v4++ * sum3;
921 
922       tmp[row ++]=sum1;
923       tmp[row ++]=sum2;
924       tmp[row ++]=sum3;
925       tmp[row ++]=sum4;
926       break;
927     case 5:
928       sum1 = b[*r++];
929       sum2 = b[*r++];
930       sum3 = b[*r++];
931       sum4 = b[*r++];
932       sum5 = b[*r++];
933       v2   = aa + ai[row+1];
934       v3   = aa + ai[row+2];
935       v4   = aa + ai[row+3];
936       v5   = aa + ai[row+4];
937 
938       for (j=0; j<nz-1; j+=2){
939         i0   = vi[0];
940         i1   = vi[1];
941         vi  +=2;
942         tmp0 = tmps[i0];
943         tmp1 = tmps[i1];
944         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
945         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
946         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
947         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
948         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
949       }
950       if (j == nz-1){
951         tmp0 = tmps[*vi++];
952         sum1 -= *v1++ *tmp0;
953         sum2 -= *v2++ *tmp0;
954         sum3 -= *v3++ *tmp0;
955         sum4 -= *v4++ *tmp0;
956         sum5 -= *v5++ *tmp0;
957       }
958 
959       sum2 -= *v2++ * sum1;
960       sum3 -= *v3++ * sum1;
961       sum4 -= *v4++ * sum1;
962       sum5 -= *v5++ * sum1;
963       sum3 -= *v3++ * sum2;
964       sum4 -= *v4++ * sum2;
965       sum5 -= *v5++ * sum2;
966       sum4 -= *v4++ * sum3;
967       sum5 -= *v5++ * sum3;
968       sum5 -= *v5++ * sum4;
969 
970       tmp[row ++]=sum1;
971       tmp[row ++]=sum2;
972       tmp[row ++]=sum3;
973       tmp[row ++]=sum4;
974       tmp[row ++]=sum5;
975       break;
976     default:
977       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
978     }
979   }
980   /* backward solve the upper triangular */
981   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
982     nsz = ns[i];
983     aii = ai[row+1] -1;
984     v1  = aa + aii;
985     vi  = aj + aii;
986     nz  = aii- ad[row];
987     switch (nsz){               /* Each loop in 'case' is unrolled */
988     case 1 :
989       sum1 = tmp[row];
990 
991       for(j=nz ; j>1; j-=2){
992         vi  -=2;
993         i0   = vi[2];
994         i1   = vi[1];
995         tmp0 = tmps[i0];
996         tmp1 = tmps[i1];
997         v1   -= 2;
998         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
999       }
1000       if (j==1){
1001         tmp0  = tmps[*vi--];
1002         sum1 -= *v1-- * tmp0;
1003       }
1004       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1005       break;
1006     case 2 :
1007       sum1 = tmp[row];
1008       sum2 = tmp[row -1];
1009       v2   = aa + ai[row]-1;
1010       for (j=nz ; j>1; j-=2){
1011         vi  -=2;
1012         i0   = vi[2];
1013         i1   = vi[1];
1014         tmp0 = tmps[i0];
1015         tmp1 = tmps[i1];
1016         v1   -= 2;
1017         v2   -= 2;
1018         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1019         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1020       }
1021       if (j==1){
1022         tmp0  = tmps[*vi--];
1023         sum1 -= *v1-- * tmp0;
1024         sum2 -= *v2-- * tmp0;
1025       }
1026 
1027       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1028       sum2   -= *v2-- * tmp0;
1029       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1030       break;
1031     case 3 :
1032       sum1 = tmp[row];
1033       sum2 = tmp[row -1];
1034       sum3 = tmp[row -2];
1035       v2   = aa + ai[row]-1;
1036       v3   = aa + ai[row -1]-1;
1037       for (j=nz ; j>1; j-=2){
1038         vi  -=2;
1039         i0   = vi[2];
1040         i1   = vi[1];
1041         tmp0 = tmps[i0];
1042         tmp1 = tmps[i1];
1043         v1   -= 2;
1044         v2   -= 2;
1045         v3   -= 2;
1046         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1047         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1048         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1049       }
1050       if (j==1){
1051         tmp0  = tmps[*vi--];
1052         sum1 -= *v1-- * tmp0;
1053         sum2 -= *v2-- * tmp0;
1054         sum3 -= *v3-- * tmp0;
1055       }
1056       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1057       sum2   -= *v2-- * tmp0;
1058       sum3   -= *v3-- * tmp0;
1059       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1060       sum3   -= *v3-- * tmp0;
1061       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1062 
1063       break;
1064     case 4 :
1065       sum1 = tmp[row];
1066       sum2 = tmp[row -1];
1067       sum3 = tmp[row -2];
1068       sum4 = tmp[row -3];
1069       v2   = aa + ai[row]-1;
1070       v3   = aa + ai[row -1]-1;
1071       v4   = aa + ai[row -2]-1;
1072 
1073       for (j=nz ; j>1; j-=2){
1074         vi  -=2;
1075         i0   = vi[2];
1076         i1   = vi[1];
1077         tmp0 = tmps[i0];
1078         tmp1 = tmps[i1];
1079         v1  -= 2;
1080         v2  -= 2;
1081         v3  -= 2;
1082         v4  -= 2;
1083         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1084         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1085         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1086         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1087       }
1088       if (j==1){
1089         tmp0  = tmps[*vi--];
1090         sum1 -= *v1-- * tmp0;
1091         sum2 -= *v2-- * tmp0;
1092         sum3 -= *v3-- * tmp0;
1093         sum4 -= *v4-- * tmp0;
1094       }
1095 
1096       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1097       sum2   -= *v2-- * tmp0;
1098       sum3   -= *v3-- * tmp0;
1099       sum4   -= *v4-- * tmp0;
1100       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1101       sum3   -= *v3-- * tmp0;
1102       sum4   -= *v4-- * tmp0;
1103       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1104       sum4   -= *v4-- * tmp0;
1105       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1106       break;
1107     case 5 :
1108       sum1 = tmp[row];
1109       sum2 = tmp[row -1];
1110       sum3 = tmp[row -2];
1111       sum4 = tmp[row -3];
1112       sum5 = tmp[row -4];
1113       v2   = aa + ai[row]-1;
1114       v3   = aa + ai[row -1]-1;
1115       v4   = aa + ai[row -2]-1;
1116       v5   = aa + ai[row -3]-1;
1117       for (j=nz ; j>1; j-=2){
1118         vi  -= 2;
1119         i0   = vi[2];
1120         i1   = vi[1];
1121         tmp0 = tmps[i0];
1122         tmp1 = tmps[i1];
1123         v1   -= 2;
1124         v2   -= 2;
1125         v3   -= 2;
1126         v4   -= 2;
1127         v5   -= 2;
1128         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1129         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1130         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1131         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1132         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1133       }
1134       if (j==1){
1135         tmp0  = tmps[*vi--];
1136         sum1 -= *v1-- * tmp0;
1137         sum2 -= *v2-- * tmp0;
1138         sum3 -= *v3-- * tmp0;
1139         sum4 -= *v4-- * tmp0;
1140         sum5 -= *v5-- * tmp0;
1141       }
1142 
1143       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1144       sum2   -= *v2-- * tmp0;
1145       sum3   -= *v3-- * tmp0;
1146       sum4   -= *v4-- * tmp0;
1147       sum5   -= *v5-- * tmp0;
1148       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1149       sum3   -= *v3-- * tmp0;
1150       sum4   -= *v4-- * tmp0;
1151       sum5   -= *v5-- * tmp0;
1152       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1153       sum4   -= *v4-- * tmp0;
1154       sum5   -= *v5-- * tmp0;
1155       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1156       sum5   -= *v5-- * tmp0;
1157       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1158       break;
1159     default:
1160       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1161     }
1162   }
1163   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1164   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1165   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1166   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1167   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
1168   PetscFunctionReturn(0);
1169 }
1170 
1171 #undef __FUNCT__
1172 #define __FUNCT__ "MatLUFactorNumeric_Inode"
1173 PetscErrorCode MatLUFactorNumeric_Inode(Mat B,Mat A,MatFactorInfo *info)
1174 {
1175   Mat               C = B;
1176   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1177   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1178   PetscErrorCode    ierr;
1179   PetscInt          *r,*ic,*c,n = A->rmap->n,*bi = b->i;
1180   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1181   PetscInt          *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1182   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1183   PetscScalar       mul1,mul2,mul3,tmp;
1184   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1185   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1186   PetscReal         rs=0.0;
1187   LUShift_Ctx       sctx;
1188   PetscInt          newshift;
1189 
1190   PetscFunctionBegin;
1191   sctx.shift_top  = 0;
1192   sctx.nshift_max = 0;
1193   sctx.shift_lo   = 0;
1194   sctx.shift_hi   = 0;
1195 
1196   /* if both shift schemes are chosen by user, only use info->shiftpd */
1197   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1198   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1199     sctx.shift_top = 0;
1200     for (i=0; i<n; i++) {
1201       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1202       rs    = 0.0;
1203       ajtmp = aj + ai[i];
1204       rtmp1 = aa + ai[i];
1205       nz = ai[i+1] - ai[i];
1206       for (j=0; j<nz; j++){
1207         if (*ajtmp != i){
1208           rs += PetscAbsScalar(*rtmp1++);
1209         } else {
1210           rs -= PetscRealPart(*rtmp1++);
1211         }
1212         ajtmp++;
1213       }
1214       if (rs>sctx.shift_top) sctx.shift_top = rs;
1215     }
1216     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1217     sctx.shift_top *= 1.1;
1218     sctx.nshift_max = 5;
1219     sctx.shift_lo   = 0.;
1220     sctx.shift_hi   = 1.;
1221   }
1222   sctx.shift_amount = 0;
1223   sctx.nshift       = 0;
1224 
1225   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1226   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1227   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1228   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1229   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1230   ics   = ic ;
1231   rtmp22 = rtmp11 + n;
1232   rtmp33 = rtmp22 + n;
1233 
1234   node_max = a->inode.node_count;
1235   ns       = a->inode.size;
1236   if (!ns){
1237     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1238   }
1239 
1240   /* If max inode size > 3, split it into two inodes.*/
1241   /* also map the inode sizes according to the ordering */
1242   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1243   for (i=0,j=0; i<node_max; ++i,++j){
1244     if (ns[i]>3) {
1245       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1246       ++j;
1247       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1248     } else {
1249       tmp_vec1[j] = ns[i];
1250     }
1251   }
1252   /* Use the correct node_max */
1253   node_max = j;
1254 
1255   /* Now reorder the inode info based on mat re-ordering info */
1256   /* First create a row -> inode_size_array_index map */
1257   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1258   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1259   for (i=0,row=0; i<node_max; i++) {
1260     nodesz = tmp_vec1[i];
1261     for (j=0; j<nodesz; j++,row++) {
1262       nsmap[row] = i;
1263     }
1264   }
1265   /* Using nsmap, create a reordered ns structure */
1266   for (i=0,j=0; i< node_max; i++) {
1267     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1268     tmp_vec2[i]  = nodesz;
1269     j           += nodesz;
1270   }
1271   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1272   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1273   /* Now use the correct ns */
1274   ns = tmp_vec2;
1275 
1276   do {
1277     sctx.lushift = PETSC_FALSE;
1278     /* Now loop over each block-row, and do the factorization */
1279     for (i=0,row=0; i<node_max; i++) {
1280       nodesz = ns[i];
1281       nz     = bi[row+1] - bi[row];
1282       bjtmp  = bj + bi[row];
1283 
1284       switch (nodesz){
1285       case 1:
1286         for  (j=0; j<nz; j++){
1287           idx        = bjtmp[j];
1288           rtmp11[idx] = 0.0;
1289         }
1290 
1291         /* load in initial (unfactored row) */
1292         idx    = r[row];
1293         nz_tmp = ai[idx+1] - ai[idx];
1294         ajtmp  = aj + ai[idx];
1295         v1     = aa + ai[idx];
1296 
1297         for (j=0; j<nz_tmp; j++) {
1298           idx        = ics[ajtmp[j]];
1299           rtmp11[idx] = v1[j];
1300         }
1301         rtmp11[ics[r[row]]] += sctx.shift_amount;
1302 
1303         prow = *bjtmp++ ;
1304         while (prow < row) {
1305           pc1 = rtmp11 + prow;
1306           if (*pc1 != 0.0){
1307             pv   = ba + bd[prow];
1308             pj   = nbj + bd[prow];
1309             mul1 = *pc1 * *pv++;
1310             *pc1 = mul1;
1311             nz_tmp = bi[prow+1] - bd[prow] - 1;
1312             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1313             for (j=0; j<nz_tmp; j++) {
1314               tmp = pv[j];
1315               idx = pj[j];
1316               rtmp11[idx] -= mul1 * tmp;
1317             }
1318           }
1319           prow = *bjtmp++ ;
1320         }
1321         pj  = bj + bi[row];
1322         pc1 = ba + bi[row];
1323 
1324         sctx.pv    = rtmp11[row];
1325         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1326         rs         = 0.0;
1327         for (j=0; j<nz; j++) {
1328           idx    = pj[j];
1329           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1330           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1331         }
1332         sctx.rs  = rs;
1333         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1334         if (newshift == 1) goto endofwhile;
1335         break;
1336 
1337       case 2:
1338         for (j=0; j<nz; j++) {
1339           idx        = bjtmp[j];
1340           rtmp11[idx] = 0.0;
1341           rtmp22[idx] = 0.0;
1342         }
1343 
1344         /* load in initial (unfactored row) */
1345         idx    = r[row];
1346         nz_tmp = ai[idx+1] - ai[idx];
1347         ajtmp  = aj + ai[idx];
1348         v1     = aa + ai[idx];
1349         v2     = aa + ai[idx+1];
1350         for (j=0; j<nz_tmp; j++) {
1351           idx        = ics[ajtmp[j]];
1352           rtmp11[idx] = v1[j];
1353           rtmp22[idx] = v2[j];
1354         }
1355         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1356         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1357 
1358         prow = *bjtmp++ ;
1359         while (prow < row) {
1360           pc1 = rtmp11 + prow;
1361           pc2 = rtmp22 + prow;
1362           if (*pc1 != 0.0 || *pc2 != 0.0){
1363             pv   = ba + bd[prow];
1364             pj   = nbj + bd[prow];
1365             mul1 = *pc1 * *pv;
1366             mul2 = *pc2 * *pv;
1367             ++pv;
1368             *pc1 = mul1;
1369             *pc2 = mul2;
1370 
1371             nz_tmp = bi[prow+1] - bd[prow] - 1;
1372             for (j=0; j<nz_tmp; j++) {
1373               tmp = pv[j];
1374               idx = pj[j];
1375               rtmp11[idx] -= mul1 * tmp;
1376               rtmp22[idx] -= mul2 * tmp;
1377             }
1378             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1379           }
1380           prow = *bjtmp++ ;
1381         }
1382 
1383         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1384         pc1 = rtmp11 + prow;
1385         pc2 = rtmp22 + prow;
1386 
1387         sctx.pv = *pc1;
1388         pj      = bj + bi[prow];
1389         rs      = 0.0;
1390         for (j=0; j<nz; j++){
1391           idx = pj[j];
1392           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1393         }
1394         sctx.rs = rs;
1395         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1396         if (newshift == 1) goto endofwhile;
1397 
1398         if (*pc2 != 0.0){
1399           pj     = nbj + bd[prow];
1400           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1401           *pc2   = mul2;
1402           nz_tmp = bi[prow+1] - bd[prow] - 1;
1403           for (j=0; j<nz_tmp; j++) {
1404             idx = pj[j] ;
1405             tmp = rtmp11[idx];
1406             rtmp22[idx] -= mul2 * tmp;
1407           }
1408           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1409         }
1410 
1411         pj  = bj + bi[row];
1412         pc1 = ba + bi[row];
1413         pc2 = ba + bi[row+1];
1414 
1415         sctx.pv = rtmp22[row+1];
1416         rs = 0.0;
1417         rtmp11[row]   = 1.0/rtmp11[row];
1418         rtmp22[row+1] = 1.0/rtmp22[row+1];
1419         /* copy row entries from dense representation to sparse */
1420         for (j=0; j<nz; j++) {
1421           idx    = pj[j];
1422           pc1[j] = rtmp11[idx];
1423           pc2[j] = rtmp22[idx];
1424           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1425         }
1426         sctx.rs = rs;
1427         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1428         if (newshift == 1) goto endofwhile;
1429         break;
1430 
1431       case 3:
1432         for  (j=0; j<nz; j++) {
1433           idx        = bjtmp[j];
1434           rtmp11[idx] = 0.0;
1435           rtmp22[idx] = 0.0;
1436           rtmp33[idx] = 0.0;
1437         }
1438         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1439         idx    = r[row];
1440         nz_tmp = ai[idx+1] - ai[idx];
1441         ajtmp = aj + ai[idx];
1442         v1    = aa + ai[idx];
1443         v2    = aa + ai[idx+1];
1444         v3    = aa + ai[idx+2];
1445         for (j=0; j<nz_tmp; j++) {
1446           idx        = ics[ajtmp[j]];
1447           rtmp11[idx] = v1[j];
1448           rtmp22[idx] = v2[j];
1449           rtmp33[idx] = v3[j];
1450         }
1451         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1452         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1453         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1454 
1455         /* loop over all pivot row blocks above this row block */
1456         prow = *bjtmp++ ;
1457         while (prow < row) {
1458           pc1 = rtmp11 + prow;
1459           pc2 = rtmp22 + prow;
1460           pc3 = rtmp33 + prow;
1461           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1462             pv   = ba  + bd[prow];
1463             pj   = nbj + bd[prow];
1464             mul1 = *pc1 * *pv;
1465             mul2 = *pc2 * *pv;
1466             mul3 = *pc3 * *pv;
1467             ++pv;
1468             *pc1 = mul1;
1469             *pc2 = mul2;
1470             *pc3 = mul3;
1471 
1472             nz_tmp = bi[prow+1] - bd[prow] - 1;
1473             /* update this row based on pivot row */
1474             for (j=0; j<nz_tmp; j++) {
1475               tmp = pv[j];
1476               idx = pj[j];
1477               rtmp11[idx] -= mul1 * tmp;
1478               rtmp22[idx] -= mul2 * tmp;
1479               rtmp33[idx] -= mul3 * tmp;
1480             }
1481             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1482           }
1483           prow = *bjtmp++ ;
1484         }
1485 
1486         /* Now take care of diagonal 3x3 block in this set of rows */
1487         /* note: prow = row here */
1488         pc1 = rtmp11 + prow;
1489         pc2 = rtmp22 + prow;
1490         pc3 = rtmp33 + prow;
1491 
1492         sctx.pv = *pc1;
1493         pj      = bj + bi[prow];
1494         rs      = 0.0;
1495         for (j=0; j<nz; j++){
1496           idx = pj[j];
1497           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1498         }
1499         sctx.rs = rs;
1500         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1501         if (newshift == 1) goto endofwhile;
1502 
1503         if (*pc2 != 0.0 || *pc3 != 0.0){
1504           mul2 = (*pc2)/(*pc1);
1505           mul3 = (*pc3)/(*pc1);
1506           *pc2 = mul2;
1507           *pc3 = mul3;
1508           nz_tmp = bi[prow+1] - bd[prow] - 1;
1509           pj     = nbj + bd[prow];
1510           for (j=0; j<nz_tmp; j++) {
1511             idx = pj[j] ;
1512             tmp = rtmp11[idx];
1513             rtmp22[idx] -= mul2 * tmp;
1514             rtmp33[idx] -= mul3 * tmp;
1515           }
1516           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1517         }
1518         ++prow;
1519 
1520         pc2 = rtmp22 + prow;
1521         pc3 = rtmp33 + prow;
1522         sctx.pv = *pc2;
1523         pj      = bj + bi[prow];
1524         rs      = 0.0;
1525         for (j=0; j<nz; j++){
1526           idx = pj[j];
1527           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1528         }
1529         sctx.rs = rs;
1530         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1531         if (newshift == 1) goto endofwhile;
1532 
1533         if (*pc3 != 0.0){
1534           mul3   = (*pc3)/(*pc2);
1535           *pc3   = mul3;
1536           pj     = nbj + bd[prow];
1537           nz_tmp = bi[prow+1] - bd[prow] - 1;
1538           for (j=0; j<nz_tmp; j++) {
1539             idx = pj[j] ;
1540             tmp = rtmp22[idx];
1541             rtmp33[idx] -= mul3 * tmp;
1542           }
1543           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1544         }
1545 
1546         pj  = bj + bi[row];
1547         pc1 = ba + bi[row];
1548         pc2 = ba + bi[row+1];
1549         pc3 = ba + bi[row+2];
1550 
1551         sctx.pv = rtmp33[row+2];
1552         rs = 0.0;
1553         rtmp11[row]   = 1.0/rtmp11[row];
1554         rtmp22[row+1] = 1.0/rtmp22[row+1];
1555         rtmp33[row+2] = 1.0/rtmp33[row+2];
1556         /* copy row entries from dense representation to sparse */
1557         for (j=0; j<nz; j++) {
1558           idx    = pj[j];
1559           pc1[j] = rtmp11[idx];
1560           pc2[j] = rtmp22[idx];
1561           pc3[j] = rtmp33[idx];
1562           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1563         }
1564 
1565         sctx.rs = rs;
1566         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1567         if (newshift == 1) goto endofwhile;
1568         break;
1569 
1570       default:
1571         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1572       }
1573       row += nodesz;                 /* Update the row */
1574     }
1575     endofwhile:;
1576   } while (sctx.lushift);
1577   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
1578   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1579   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1580   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1581   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1582   (B)->ops->solve           = MatSolve_Inode;
1583   /* do not set solve add, since MatSolve_Inode + Add is faster */
1584   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1585   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1586   C->assembled   = PETSC_TRUE;
1587   C->preallocated = PETSC_TRUE;
1588   if (sctx.nshift) {
1589     if (info->shiftnz) {
1590       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1591     } else if (info->shiftpd) {
1592       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1593     }
1594   }
1595   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 /*
1600      Makes a longer coloring[] array and calls the usual code with that
1601 */
1602 #undef __FUNCT__
1603 #define __FUNCT__ "MatColoringPatch_Inode"
1604 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1605 {
1606   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1607   PetscErrorCode  ierr;
1608   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1609   PetscInt        *colorused,i;
1610   ISColoringValue *newcolor;
1611 
1612   PetscFunctionBegin;
1613   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1614   /* loop over inodes, marking a color for each column*/
1615   row = 0;
1616   for (i=0; i<m; i++){
1617     for (j=0; j<ns[i]; j++) {
1618       newcolor[row++] = coloring[i] + j*ncolors;
1619     }
1620   }
1621 
1622   /* eliminate unneeded colors */
1623   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1624   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1625   for (i=0; i<n; i++) {
1626     colorused[newcolor[i]] = 1;
1627   }
1628 
1629   for (i=1; i<5*ncolors; i++) {
1630     colorused[i] += colorused[i-1];
1631   }
1632   ncolors = colorused[5*ncolors-1];
1633   for (i=0; i<n; i++) {
1634     newcolor[i] = colorused[newcolor[i]]-1;
1635   }
1636   ierr = PetscFree(colorused);CHKERRQ(ierr);
1637   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1638   ierr = PetscFree(coloring);CHKERRQ(ierr);
1639   PetscFunctionReturn(0);
1640 }
1641 
1642 #include "src/inline/ilu.h"
1643 
1644 #undef __FUNCT__
1645 #define __FUNCT__ "MatRelax_Inode"
1646 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1647 {
1648   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1649   PetscScalar        *x,*xs,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1650   MatScalar          *ibdiag,*bdiag;
1651   PetscScalar        *b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1652   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
1653   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1654   PetscErrorCode     ierr;
1655   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1656   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;
1657 
1658   PetscFunctionBegin;
1659   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1660   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1661   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode");
1662 
1663   if (!a->inode.ibdiagvalid) {
1664     if (!a->inode.ibdiag) {
1665       /* calculate space needed for diagonal blocks */
1666       for (i=0; i<m; i++) {
1667 	cnt += sizes[i]*sizes[i];
1668       }
1669       a->inode.bdiagsize = cnt;
1670       ierr   = PetscMalloc2(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag);CHKERRQ(ierr);
1671     }
1672 
1673     /* copy over the diagonal blocks and invert them */
1674     ibdiag = a->inode.ibdiag;
1675     bdiag  = a->inode.bdiag;
1676     cnt = 0;
1677     for (i=0, row = 0; i<m; i++) {
1678       for (j=0; j<sizes[i]; j++) {
1679         for (k=0; k<sizes[i]; k++) {
1680           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1681         }
1682       }
1683       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
1684 
1685       switch(sizes[i]) {
1686         case 1:
1687           /* Create matrix data structure */
1688           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1689           ibdiag[cnt] = 1.0/ibdiag[cnt];
1690           break;
1691         case 2:
1692           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
1693           break;
1694         case 3:
1695           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
1696           break;
1697         case 4:
1698           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
1699           break;
1700         case 5:
1701           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr);
1702           break;
1703        default:
1704 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1705       }
1706       cnt += sizes[i]*sizes[i];
1707       row += sizes[i];
1708     }
1709     a->inode.ibdiagvalid = PETSC_TRUE;
1710   }
1711   ibdiag = a->inode.ibdiag;
1712   bdiag  = a->inode.bdiag;
1713 
1714   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1715   if (xx != bb) {
1716     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1717   } else {
1718     b = x;
1719   }
1720 
1721   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1722   xs   = x;
1723   if (flag & SOR_ZERO_INITIAL_GUESS) {
1724     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1725 
1726       for (i=0, row=0; i<m; i++) {
1727         sz  = diag[row] - ii[row];
1728         v1  = a->a + ii[row];
1729         idx = a->j + ii[row];
1730 
1731         /* see comments for MatMult_Inode() for how this is coded */
1732         switch (sizes[i]){
1733           case 1:
1734 
1735             sum1  = b[row];
1736             for(n = 0; n<sz-1; n+=2) {
1737               i1   = idx[0];
1738               i2   = idx[1];
1739               idx += 2;
1740               tmp0 = x[i1];
1741               tmp1 = x[i2];
1742               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1743             }
1744 
1745             if (n == sz-1){
1746               tmp0  = x[*idx];
1747               sum1 -= *v1 * tmp0;
1748             }
1749             x[row++] = sum1*(*ibdiag++);
1750             break;
1751           case 2:
1752             v2    = a->a + ii[row+1];
1753             sum1  = b[row];
1754             sum2  = b[row+1];
1755             for(n = 0; n<sz-1; n+=2) {
1756               i1   = idx[0];
1757               i2   = idx[1];
1758               idx += 2;
1759               tmp0 = x[i1];
1760               tmp1 = x[i2];
1761               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1762               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1763             }
1764 
1765             if (n == sz-1){
1766               tmp0  = x[*idx];
1767               sum1 -= v1[0] * tmp0;
1768               sum2 -= v2[0] * tmp0;
1769             }
1770             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1771             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1772             ibdiag  += 4;
1773             break;
1774           case 3:
1775             v2    = a->a + ii[row+1];
1776             v3    = a->a + ii[row+2];
1777             sum1  = b[row];
1778             sum2  = b[row+1];
1779             sum3  = b[row+2];
1780             for(n = 0; n<sz-1; n+=2) {
1781               i1   = idx[0];
1782               i2   = idx[1];
1783               idx += 2;
1784               tmp0 = x[i1];
1785               tmp1 = x[i2];
1786               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1787               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1788               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1789             }
1790 
1791             if (n == sz-1){
1792               tmp0  = x[*idx];
1793               sum1 -= v1[0] * tmp0;
1794               sum2 -= v2[0] * tmp0;
1795               sum3 -= v3[0] * tmp0;
1796             }
1797             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1798             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1799             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1800             ibdiag  += 9;
1801             break;
1802           case 4:
1803             v2    = a->a + ii[row+1];
1804             v3    = a->a + ii[row+2];
1805             v4    = a->a + ii[row+3];
1806             sum1  = b[row];
1807             sum2  = b[row+1];
1808             sum3  = b[row+2];
1809             sum4  = b[row+3];
1810             for(n = 0; n<sz-1; n+=2) {
1811               i1   = idx[0];
1812               i2   = idx[1];
1813               idx += 2;
1814               tmp0 = x[i1];
1815               tmp1 = x[i2];
1816               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1817               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1818               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1819               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1820             }
1821 
1822             if (n == sz-1){
1823               tmp0  = x[*idx];
1824               sum1 -= v1[0] * tmp0;
1825               sum2 -= v2[0] * tmp0;
1826               sum3 -= v3[0] * tmp0;
1827               sum4 -= v4[0] * tmp0;
1828             }
1829             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1830             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1831             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1832             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1833             ibdiag  += 16;
1834             break;
1835           case 5:
1836             v2    = a->a + ii[row+1];
1837             v3    = a->a + ii[row+2];
1838             v4    = a->a + ii[row+3];
1839             v5    = a->a + ii[row+4];
1840             sum1  = b[row];
1841             sum2  = b[row+1];
1842             sum3  = b[row+2];
1843             sum4  = b[row+3];
1844             sum5  = b[row+4];
1845             for(n = 0; n<sz-1; n+=2) {
1846               i1   = idx[0];
1847               i2   = idx[1];
1848               idx += 2;
1849               tmp0 = x[i1];
1850               tmp1 = x[i2];
1851               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1852               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1853               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1854               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1855               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1856             }
1857 
1858             if (n == sz-1){
1859               tmp0  = x[*idx];
1860               sum1 -= v1[0] * tmp0;
1861               sum2 -= v2[0] * tmp0;
1862               sum3 -= v3[0] * tmp0;
1863               sum4 -= v4[0] * tmp0;
1864               sum5 -= v5[0] * tmp0;
1865             }
1866             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1867             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1868             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1869             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1870             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1871             ibdiag  += 25;
1872             break;
1873 	  default:
1874    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1875         }
1876       }
1877 
1878       xb = x;
1879       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1880     } else xb = b;
1881     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1882         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1883       cnt = 0;
1884       for (i=0, row=0; i<m; i++) {
1885 
1886         switch (sizes[i]){
1887           case 1:
1888             x[row++] *= bdiag[cnt++];
1889             break;
1890           case 2:
1891             x1   = x[row]; x2 = x[row+1];
1892             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1893             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1894             x[row++] = tmp1;
1895             x[row++] = tmp2;
1896             cnt += 4;
1897             break;
1898           case 3:
1899             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1900             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1901             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1902             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1903             x[row++] = tmp1;
1904             x[row++] = tmp2;
1905             x[row++] = tmp3;
1906             cnt += 9;
1907             break;
1908           case 4:
1909             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1910             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1911             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1912             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1913             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1914             x[row++] = tmp1;
1915             x[row++] = tmp2;
1916             x[row++] = tmp3;
1917             x[row++] = tmp4;
1918             cnt += 16;
1919             break;
1920           case 5:
1921             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1922             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1923             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1924             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1925             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1926             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1927             x[row++] = tmp1;
1928             x[row++] = tmp2;
1929             x[row++] = tmp3;
1930             x[row++] = tmp4;
1931             x[row++] = tmp5;
1932             cnt += 25;
1933             break;
1934 	  default:
1935    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1936         }
1937       }
1938       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1939     }
1940     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1941 
1942       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1943       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1944         ibdiag -= sizes[i]*sizes[i];
1945         sz      = ii[row+1] - diag[row] - 1;
1946         v1      = a->a + diag[row] + 1;
1947         idx     = a->j + diag[row] + 1;
1948 
1949         /* see comments for MatMult_Inode() for how this is coded */
1950         switch (sizes[i]){
1951           case 1:
1952 
1953             sum1  = xb[row];
1954             for(n = 0; n<sz-1; n+=2) {
1955               i1   = idx[0];
1956               i2   = idx[1];
1957               idx += 2;
1958               tmp0 = x[i1];
1959               tmp1 = x[i2];
1960               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1961             }
1962 
1963             if (n == sz-1){
1964               tmp0  = x[*idx];
1965               sum1 -= *v1*tmp0;
1966             }
1967             x[row--] = sum1*(*ibdiag);
1968             break;
1969 
1970           case 2:
1971 
1972             sum1  = xb[row];
1973             sum2  = xb[row-1];
1974             /* note that sum1 is associated with the second of the two rows */
1975             v2    = a->a + diag[row-1] + 2;
1976             for(n = 0; n<sz-1; n+=2) {
1977               i1   = idx[0];
1978               i2   = idx[1];
1979               idx += 2;
1980               tmp0 = x[i1];
1981               tmp1 = x[i2];
1982               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1983               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1984             }
1985 
1986             if (n == sz-1){
1987               tmp0  = x[*idx];
1988               sum1 -= *v1*tmp0;
1989               sum2 -= *v2*tmp0;
1990             }
1991             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1992             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1993             break;
1994           case 3:
1995 
1996             sum1  = xb[row];
1997             sum2  = xb[row-1];
1998             sum3  = xb[row-2];
1999             v2    = a->a + diag[row-1] + 2;
2000             v3    = a->a + diag[row-2] + 3;
2001             for(n = 0; n<sz-1; n+=2) {
2002               i1   = idx[0];
2003               i2   = idx[1];
2004               idx += 2;
2005               tmp0 = x[i1];
2006               tmp1 = x[i2];
2007               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2008               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2009               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2010             }
2011 
2012             if (n == sz-1){
2013               tmp0  = x[*idx];
2014               sum1 -= *v1*tmp0;
2015               sum2 -= *v2*tmp0;
2016               sum3 -= *v3*tmp0;
2017             }
2018             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2019             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2020             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2021             break;
2022           case 4:
2023 
2024             sum1  = xb[row];
2025             sum2  = xb[row-1];
2026             sum3  = xb[row-2];
2027             sum4  = xb[row-3];
2028             v2    = a->a + diag[row-1] + 2;
2029             v3    = a->a + diag[row-2] + 3;
2030             v4    = a->a + diag[row-3] + 4;
2031             for(n = 0; n<sz-1; n+=2) {
2032               i1   = idx[0];
2033               i2   = idx[1];
2034               idx += 2;
2035               tmp0 = x[i1];
2036               tmp1 = x[i2];
2037               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2038               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2039               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2040               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2041             }
2042 
2043             if (n == sz-1){
2044               tmp0  = x[*idx];
2045               sum1 -= *v1*tmp0;
2046               sum2 -= *v2*tmp0;
2047               sum3 -= *v3*tmp0;
2048               sum4 -= *v4*tmp0;
2049             }
2050             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2051             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2052             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2053             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2054             break;
2055           case 5:
2056 
2057             sum1  = xb[row];
2058             sum2  = xb[row-1];
2059             sum3  = xb[row-2];
2060             sum4  = xb[row-3];
2061             sum5  = xb[row-4];
2062             v2    = a->a + diag[row-1] + 2;
2063             v3    = a->a + diag[row-2] + 3;
2064             v4    = a->a + diag[row-3] + 4;
2065             v5    = a->a + diag[row-4] + 5;
2066             for(n = 0; n<sz-1; n+=2) {
2067               i1   = idx[0];
2068               i2   = idx[1];
2069               idx += 2;
2070               tmp0 = x[i1];
2071               tmp1 = x[i2];
2072               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2073               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2074               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2075               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2076               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2077             }
2078 
2079             if (n == sz-1){
2080               tmp0  = x[*idx];
2081               sum1 -= *v1*tmp0;
2082               sum2 -= *v2*tmp0;
2083               sum3 -= *v3*tmp0;
2084               sum4 -= *v4*tmp0;
2085               sum5 -= *v5*tmp0;
2086             }
2087             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2088             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2089             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2090             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2091             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2092             break;
2093 	  default:
2094    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2095         }
2096       }
2097 
2098       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2099     }
2100     its--;
2101   }
2102   if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode");
2103   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2104   if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2105   PetscFunctionReturn(0);
2106 }
2107 
2108 
2109 /*
2110     samestructure indicates that the matrix has not changed its nonzero structure so we
2111     do not need to recompute the inodes
2112 */
2113 #undef __FUNCT__
2114 #define __FUNCT__ "Mat_CheckInode"
2115 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2116 {
2117   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2118   PetscErrorCode ierr;
2119   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2120   PetscTruth     flag;
2121 
2122   PetscFunctionBegin;
2123   if (!a->inode.use)                     PetscFunctionReturn(0);
2124   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2125 
2126 
2127   m = A->rmap->n;
2128   if (a->inode.size) {ns = a->inode.size;}
2129   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2130 
2131   i          = 0;
2132   node_count = 0;
2133   idx        = a->j;
2134   ii         = a->i;
2135   while (i < m){                /* For each row */
2136     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2137     /* Limits the number of elements in a node to 'a->inode.limit' */
2138     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2139       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2140       if (nzy != nzx) break;
2141       idy  += nzx;             /* Same nonzero pattern */
2142       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2143       if (!flag) break;
2144     }
2145     ns[node_count++] = blk_size;
2146     idx += blk_size*nzx;
2147     i    = j;
2148   }
2149   /* If not enough inodes found,, do not use inode version of the routines */
2150   if (!a->inode.size && m && node_count > .9*m) {
2151     ierr = PetscFree(ns);CHKERRQ(ierr);
2152     a->inode.node_count     = 0;
2153     a->inode.size           = PETSC_NULL;
2154     a->inode.use            = PETSC_FALSE;
2155     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2156   } else {
2157     A->ops->mult            = MatMult_Inode;
2158     A->ops->relax           = MatRelax_Inode;
2159     A->ops->multadd         = MatMultAdd_Inode;
2160     A->ops->getrowij        = MatGetRowIJ_Inode;
2161     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
2162     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
2163     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2164     A->ops->coloringpatch   = MatColoringPatch_Inode;
2165     a->inode.node_count     = node_count;
2166     a->inode.size           = ns;
2167     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2168   }
2169   PetscFunctionReturn(0);
2170 }
2171 
2172 /*
2173      This is really ugly. if inodes are used this replaces the
2174   permutations with ones that correspond to rows/cols of the matrix
2175   rather then inode blocks
2176 */
2177 #undef __FUNCT__
2178 #define __FUNCT__ "MatInodeAdjustForInodes"
2179 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2180 {
2181   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2182 
2183   PetscFunctionBegin;
2184   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2185   if (f) {
2186     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2187   }
2188   PetscFunctionReturn(0);
2189 }
2190 
2191 EXTERN_C_BEGIN
2192 #undef __FUNCT__
2193 #define __FUNCT__ "MatAdjustForInodes_Inode"
2194 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2195 {
2196   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2197   PetscErrorCode ierr;
2198   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
2199   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2200   PetscInt       nslim_col,*ns_col;
2201   IS             ris = *rperm,cis = *cperm;
2202 
2203   PetscFunctionBegin;
2204   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2205   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2206 
2207   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2208   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2209   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2210   permc = permr + m;
2211 
2212   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2213   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2214 
2215   /* Form the inode structure for the rows of permuted matric using inv perm*/
2216   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2217 
2218   /* Construct the permutations for rows*/
2219   for (i=0,row = 0; i<nslim_row; ++i){
2220     indx      = ridx[i];
2221     start_val = tns[indx];
2222     end_val   = tns[indx + 1];
2223     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2224   }
2225 
2226   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2227   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2228 
2229  /* Construct permutations for columns */
2230   for (i=0,col=0; i<nslim_col; ++i){
2231     indx      = cidx[i];
2232     start_val = tns[indx];
2233     end_val   = tns[indx + 1];
2234     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2235   }
2236 
2237   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2238   ISSetPermutation(*rperm);
2239   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2240   ISSetPermutation(*cperm);
2241 
2242   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2243   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2244 
2245   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2246   ierr = PetscFree(permr);CHKERRQ(ierr);
2247   ierr = ISDestroy(cis);CHKERRQ(ierr);
2248   ierr = ISDestroy(ris);CHKERRQ(ierr);
2249   ierr = PetscFree(tns);CHKERRQ(ierr);
2250   PetscFunctionReturn(0);
2251 }
2252 EXTERN_C_END
2253 
2254 #undef __FUNCT__
2255 #define __FUNCT__ "MatInodeGetInodeSizes"
2256 /*@C
2257    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2258 
2259    Collective on Mat
2260 
2261    Input Parameter:
2262 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2263 
2264    Output Parameter:
2265 +  node_count - no of inodes present in the matrix.
2266 .  sizes      - an array of size node_count,with sizes of each inode.
2267 -  limit      - the max size used to generate the inodes.
2268 
2269    Level: advanced
2270 
2271    Notes: This routine returns some internal storage information
2272    of the matrix, it is intended to be used by advanced users.
2273    It should be called after the matrix is assembled.
2274    The contents of the sizes[] array should not be changed.
2275    PETSC_NULL may be passed for information not requested.
2276 
2277 .keywords: matrix, seqaij, get, inode
2278 
2279 .seealso: MatGetInfo()
2280 @*/
2281 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2282 {
2283   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2284 
2285   PetscFunctionBegin;
2286   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2287   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2288   if (f) {
2289     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2290   }
2291   PetscFunctionReturn(0);
2292 }
2293 
2294 EXTERN_C_BEGIN
2295 #undef __FUNCT__
2296 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
2297 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2298 {
2299   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2300 
2301   PetscFunctionBegin;
2302   if (node_count) *node_count = a->inode.node_count;
2303   if (sizes)      *sizes      = a->inode.size;
2304   if (limit)      *limit      = a->inode.limit;
2305   PetscFunctionReturn(0);
2306 }
2307 EXTERN_C_END
2308