xref: /petsc/src/mat/impls/aij/seq/inode.c (revision d32f9abdbc052d6e1fd06679b17a55415c3aae30)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap.n;
19   m      = A->rmap.n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap.n;
72   n         = A->cmap.n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap.n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap.n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_Inode"
397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     sz   = n;                   /* No of non zeros in this row */
427                                 /* Switch on the size of Node */
428     switch (nsz){               /* Each loop in 'case' is unrolled */
429     case 1 :
430       sum1  = 0;
431 
432       for(n = 0; n< sz-1; n+=2) {
433         i1   = idx[0];          /* The instructions are ordered to */
434         i2   = idx[1];          /* make the compiler's job easy */
435         idx += 2;
436         tmp0 = x[i1];
437         tmp1 = x[i2];
438         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439        }
440 
441       if (n == sz-1){          /* Take care of the last nonzero  */
442         tmp0  = x[*idx++];
443         sum1 += *v1++ * tmp0;
444       }
445       y[row++]=sum1;
446       break;
447     case 2:
448       sum1  = 0;
449       sum2  = 0;
450       v2    = v1 + n;
451 
452       for (n = 0; n< sz-1; n+=2) {
453         i1   = idx[0];
454         i2   = idx[1];
455         idx += 2;
456         tmp0 = x[i1];
457         tmp1 = x[i2];
458         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460       }
461       if (n == sz-1){
462         tmp0  = x[*idx++];
463         sum1 += *v1++ * tmp0;
464         sum2 += *v2++ * tmp0;
465       }
466       y[row++]=sum1;
467       y[row++]=sum2;
468       v1      =v2;              /* Since the next block to be processed starts there*/
469       idx    +=sz;
470       break;
471     case 3:
472       sum1  = 0;
473       sum2  = 0;
474       sum3  = 0;
475       v2    = v1 + n;
476       v3    = v2 + n;
477 
478       for (n = 0; n< sz-1; n+=2) {
479         i1   = idx[0];
480         i2   = idx[1];
481         idx += 2;
482         tmp0 = x[i1];
483         tmp1 = x[i2];
484         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487       }
488       if (n == sz-1){
489         tmp0  = x[*idx++];
490         sum1 += *v1++ * tmp0;
491         sum2 += *v2++ * tmp0;
492         sum3 += *v3++ * tmp0;
493       }
494       y[row++]=sum1;
495       y[row++]=sum2;
496       y[row++]=sum3;
497       v1       =v3;             /* Since the next block to be processed starts there*/
498       idx     +=2*sz;
499       break;
500     case 4:
501       sum1  = 0;
502       sum2  = 0;
503       sum3  = 0;
504       sum4  = 0;
505       v2    = v1 + n;
506       v3    = v2 + n;
507       v4    = v3 + n;
508 
509       for (n = 0; n< sz-1; n+=2) {
510         i1   = idx[0];
511         i2   = idx[1];
512         idx += 2;
513         tmp0 = x[i1];
514         tmp1 = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519       }
520       if (n == sz-1){
521         tmp0  = x[*idx++];
522         sum1 += *v1++ * tmp0;
523         sum2 += *v2++ * tmp0;
524         sum3 += *v3++ * tmp0;
525         sum4 += *v4++ * tmp0;
526       }
527       y[row++]=sum1;
528       y[row++]=sum2;
529       y[row++]=sum3;
530       y[row++]=sum4;
531       v1      =v4;              /* Since the next block to be processed starts there*/
532       idx    +=3*sz;
533       break;
534     case 5:
535       sum1  = 0;
536       sum2  = 0;
537       sum3  = 0;
538       sum4  = 0;
539       sum5  = 0;
540       v2    = v1 + n;
541       v3    = v2 + n;
542       v4    = v3 + n;
543       v5    = v4 + n;
544 
545       for (n = 0; n<sz-1; n+=2) {
546         i1   = idx[0];
547         i2   = idx[1];
548         idx += 2;
549         tmp0 = x[i1];
550         tmp1 = x[i2];
551         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556       }
557       if (n == sz-1){
558         tmp0  = x[*idx++];
559         sum1 += *v1++ * tmp0;
560         sum2 += *v2++ * tmp0;
561         sum3 += *v3++ * tmp0;
562         sum4 += *v4++ * tmp0;
563         sum5 += *v5++ * tmp0;
564       }
565       y[row++]=sum1;
566       y[row++]=sum2;
567       y[row++]=sum3;
568       y[row++]=sum4;
569       y[row++]=sum5;
570       v1      =v5;       /* Since the next block to be processed starts there */
571       idx    +=4*sz;
572       break;
573     default :
574       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
575     }
576   }
577   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
578   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
579   ierr = PetscLogFlops(2*a->nz - nonzerorow);CHKERRQ(ierr);
580   PetscFunctionReturn(0);
581 }
582 /* ----------------------------------------------------------- */
583 /* Almost same code as the MatMult_Inode() */
584 #undef __FUNCT__
585 #define __FUNCT__ "MatMultAdd_Inode"
586 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587 {
588   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
589   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590   MatScalar      *v1,*v2,*v3,*v4,*v5;
591   PetscScalar    *x,*y,*z,*zt;
592   PetscErrorCode ierr;
593   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
594 
595   PetscFunctionBegin;
596   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
597   node_max = a->inode.node_count;
598   ns       = a->inode.size;     /* Node Size array */
599   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
600   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
601   if (zz != yy) {
602     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
603   } else {
604     z = y;
605   }
606   zt = z;
607 
608   idx  = a->j;
609   v1   = a->a;
610   ii   = a->i;
611 
612   for (i = 0,row = 0; i< node_max; ++i){
613     nsz  = ns[i];
614     n    = ii[1] - ii[0];
615     ii  += nsz;
616     sz   = n;                   /* No of non zeros in this row */
617                                 /* Switch on the size of Node */
618     switch (nsz){               /* Each loop in 'case' is unrolled */
619     case 1 :
620       sum1  = *zt++;
621 
622       for(n = 0; n< sz-1; n+=2) {
623         i1   = idx[0];          /* The instructions are ordered to */
624         i2   = idx[1];          /* make the compiler's job easy */
625         idx += 2;
626         tmp0 = x[i1];
627         tmp1 = x[i2];
628         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
629        }
630 
631       if(n   == sz-1){          /* Take care of the last nonzero  */
632         tmp0  = x[*idx++];
633         sum1 += *v1++ * tmp0;
634       }
635       y[row++]=sum1;
636       break;
637     case 2:
638       sum1  = *zt++;
639       sum2  = *zt++;
640       v2    = v1 + n;
641 
642       for(n = 0; n< sz-1; n+=2) {
643         i1   = idx[0];
644         i2   = idx[1];
645         idx += 2;
646         tmp0 = x[i1];
647         tmp1 = x[i2];
648         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
649         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
650       }
651       if(n   == sz-1){
652         tmp0  = x[*idx++];
653         sum1 += *v1++ * tmp0;
654         sum2 += *v2++ * tmp0;
655       }
656       y[row++]=sum1;
657       y[row++]=sum2;
658       v1      =v2;              /* Since the next block to be processed starts there*/
659       idx    +=sz;
660       break;
661     case 3:
662       sum1  = *zt++;
663       sum2  = *zt++;
664       sum3  = *zt++;
665       v2    = v1 + n;
666       v3    = v2 + n;
667 
668       for (n = 0; n< sz-1; n+=2) {
669         i1   = idx[0];
670         i2   = idx[1];
671         idx += 2;
672         tmp0 = x[i1];
673         tmp1 = x[i2];
674         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
675         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
676         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
677       }
678       if (n == sz-1){
679         tmp0  = x[*idx++];
680         sum1 += *v1++ * tmp0;
681         sum2 += *v2++ * tmp0;
682         sum3 += *v3++ * tmp0;
683       }
684       y[row++]=sum1;
685       y[row++]=sum2;
686       y[row++]=sum3;
687       v1       =v3;             /* Since the next block to be processed starts there*/
688       idx     +=2*sz;
689       break;
690     case 4:
691       sum1  = *zt++;
692       sum2  = *zt++;
693       sum3  = *zt++;
694       sum4  = *zt++;
695       v2    = v1 + n;
696       v3    = v2 + n;
697       v4    = v3 + n;
698 
699       for (n = 0; n< sz-1; n+=2) {
700         i1   = idx[0];
701         i2   = idx[1];
702         idx += 2;
703         tmp0 = x[i1];
704         tmp1 = x[i2];
705         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
706         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
707         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
708         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
709       }
710       if (n == sz-1){
711         tmp0  = x[*idx++];
712         sum1 += *v1++ * tmp0;
713         sum2 += *v2++ * tmp0;
714         sum3 += *v3++ * tmp0;
715         sum4 += *v4++ * tmp0;
716       }
717       y[row++]=sum1;
718       y[row++]=sum2;
719       y[row++]=sum3;
720       y[row++]=sum4;
721       v1      =v4;              /* Since the next block to be processed starts there*/
722       idx    +=3*sz;
723       break;
724     case 5:
725       sum1  = *zt++;
726       sum2  = *zt++;
727       sum3  = *zt++;
728       sum4  = *zt++;
729       sum5  = *zt++;
730       v2    = v1 + n;
731       v3    = v2 + n;
732       v4    = v3 + n;
733       v5    = v4 + n;
734 
735       for (n = 0; n<sz-1; n+=2) {
736         i1   = idx[0];
737         i2   = idx[1];
738         idx += 2;
739         tmp0 = x[i1];
740         tmp1 = x[i2];
741         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
742         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
743         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
744         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
745         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
746       }
747       if(n   == sz-1){
748         tmp0  = x[*idx++];
749         sum1 += *v1++ * tmp0;
750         sum2 += *v2++ * tmp0;
751         sum3 += *v3++ * tmp0;
752         sum4 += *v4++ * tmp0;
753         sum5 += *v5++ * tmp0;
754       }
755       y[row++]=sum1;
756       y[row++]=sum2;
757       y[row++]=sum3;
758       y[row++]=sum4;
759       y[row++]=sum5;
760       v1      =v5;       /* Since the next block to be processed starts there */
761       idx    +=4*sz;
762       break;
763     default :
764       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
765     }
766   }
767   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
768   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
769   if (zz != yy) {
770     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
771   }
772   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
773   PetscFunctionReturn(0);
774 }
775 
776 /* ----------------------------------------------------------- */
777 #undef __FUNCT__
778 #define __FUNCT__ "MatSolve_Inode"
779 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
780 {
781   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
782   IS                iscol = a->col,isrow = a->row;
783   PetscErrorCode    ierr;
784   PetscInt          *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
785   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
786   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
787   PetscScalar       sum1,sum2,sum3,sum4,sum5;
788   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
789   const PetscScalar *b;
790 
791   PetscFunctionBegin;
792   if (A->factor != MAT_FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
793   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
794   node_max = a->inode.node_count;
795   ns       = a->inode.size;     /* Node Size array */
796 
797   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   tmp  = a->solve_work;
800 
801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
803 
804   /* forward solve the lower triangular */
805   tmps = tmp ;
806   aa   = a_a ;
807   aj   = a_j ;
808   ad   = a->diag;
809 
810   for (i = 0,row = 0; i< node_max; ++i){
811     nsz = ns[i];
812     aii = ai[row];
813     v1  = aa + aii;
814     vi  = aj + aii;
815     nz  = ad[row]- aii;
816 
817     switch (nsz){               /* Each loop in 'case' is unrolled */
818     case 1 :
819       sum1 = b[*r++];
820       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
821       for(j=0; j<nz-1; j+=2){
822         i0   = vi[0];
823         i1   = vi[1];
824         vi  +=2;
825         tmp0 = tmps[i0];
826         tmp1 = tmps[i1];
827         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828       }
829       if(j == nz-1){
830         tmp0 = tmps[*vi++];
831         sum1 -= *v1++ *tmp0;
832       }
833       tmp[row ++]=sum1;
834       break;
835     case 2:
836       sum1 = b[*r++];
837       sum2 = b[*r++];
838       v2   = aa + ai[row+1];
839 
840       for(j=0; j<nz-1; j+=2){
841         i0   = vi[0];
842         i1   = vi[1];
843         vi  +=2;
844         tmp0 = tmps[i0];
845         tmp1 = tmps[i1];
846         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
847         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
848       }
849       if(j == nz-1){
850         tmp0 = tmps[*vi++];
851         sum1 -= *v1++ *tmp0;
852         sum2 -= *v2++ *tmp0;
853       }
854       sum2 -= *v2++ * sum1;
855       tmp[row ++]=sum1;
856       tmp[row ++]=sum2;
857       break;
858     case 3:
859       sum1 = b[*r++];
860       sum2 = b[*r++];
861       sum3 = b[*r++];
862       v2   = aa + ai[row+1];
863       v3   = aa + ai[row+2];
864 
865       for (j=0; j<nz-1; j+=2){
866         i0   = vi[0];
867         i1   = vi[1];
868         vi  +=2;
869         tmp0 = tmps[i0];
870         tmp1 = tmps[i1];
871         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
872         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
873         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
874       }
875       if (j == nz-1){
876         tmp0 = tmps[*vi++];
877         sum1 -= *v1++ *tmp0;
878         sum2 -= *v2++ *tmp0;
879         sum3 -= *v3++ *tmp0;
880       }
881       sum2 -= *v2++ * sum1;
882       sum3 -= *v3++ * sum1;
883       sum3 -= *v3++ * sum2;
884       tmp[row ++]=sum1;
885       tmp[row ++]=sum2;
886       tmp[row ++]=sum3;
887       break;
888 
889     case 4:
890       sum1 = b[*r++];
891       sum2 = b[*r++];
892       sum3 = b[*r++];
893       sum4 = b[*r++];
894       v2   = aa + ai[row+1];
895       v3   = aa + ai[row+2];
896       v4   = aa + ai[row+3];
897 
898       for (j=0; j<nz-1; j+=2){
899         i0   = vi[0];
900         i1   = vi[1];
901         vi  +=2;
902         tmp0 = tmps[i0];
903         tmp1 = tmps[i1];
904         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908       }
909       if (j == nz-1){
910         tmp0 = tmps[*vi++];
911         sum1 -= *v1++ *tmp0;
912         sum2 -= *v2++ *tmp0;
913         sum3 -= *v3++ *tmp0;
914         sum4 -= *v4++ *tmp0;
915       }
916       sum2 -= *v2++ * sum1;
917       sum3 -= *v3++ * sum1;
918       sum4 -= *v4++ * sum1;
919       sum3 -= *v3++ * sum2;
920       sum4 -= *v4++ * sum2;
921       sum4 -= *v4++ * sum3;
922 
923       tmp[row ++]=sum1;
924       tmp[row ++]=sum2;
925       tmp[row ++]=sum3;
926       tmp[row ++]=sum4;
927       break;
928     case 5:
929       sum1 = b[*r++];
930       sum2 = b[*r++];
931       sum3 = b[*r++];
932       sum4 = b[*r++];
933       sum5 = b[*r++];
934       v2   = aa + ai[row+1];
935       v3   = aa + ai[row+2];
936       v4   = aa + ai[row+3];
937       v5   = aa + ai[row+4];
938 
939       for (j=0; j<nz-1; j+=2){
940         i0   = vi[0];
941         i1   = vi[1];
942         vi  +=2;
943         tmp0 = tmps[i0];
944         tmp1 = tmps[i1];
945         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950       }
951       if (j == nz-1){
952         tmp0 = tmps[*vi++];
953         sum1 -= *v1++ *tmp0;
954         sum2 -= *v2++ *tmp0;
955         sum3 -= *v3++ *tmp0;
956         sum4 -= *v4++ *tmp0;
957         sum5 -= *v5++ *tmp0;
958       }
959 
960       sum2 -= *v2++ * sum1;
961       sum3 -= *v3++ * sum1;
962       sum4 -= *v4++ * sum1;
963       sum5 -= *v5++ * sum1;
964       sum3 -= *v3++ * sum2;
965       sum4 -= *v4++ * sum2;
966       sum5 -= *v5++ * sum2;
967       sum4 -= *v4++ * sum3;
968       sum5 -= *v5++ * sum3;
969       sum5 -= *v5++ * sum4;
970 
971       tmp[row ++]=sum1;
972       tmp[row ++]=sum2;
973       tmp[row ++]=sum3;
974       tmp[row ++]=sum4;
975       tmp[row ++]=sum5;
976       break;
977     default:
978       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
979     }
980   }
981   /* backward solve the upper triangular */
982   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
983     nsz = ns[i];
984     aii = ai[row+1] -1;
985     v1  = aa + aii;
986     vi  = aj + aii;
987     nz  = aii- ad[row];
988     switch (nsz){               /* Each loop in 'case' is unrolled */
989     case 1 :
990       sum1 = tmp[row];
991 
992       for(j=nz ; j>1; j-=2){
993         vi  -=2;
994         i0   = vi[2];
995         i1   = vi[1];
996         tmp0 = tmps[i0];
997         tmp1 = tmps[i1];
998         v1   -= 2;
999         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000       }
1001       if (j==1){
1002         tmp0  = tmps[*vi--];
1003         sum1 -= *v1-- * tmp0;
1004       }
1005       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006       break;
1007     case 2 :
1008       sum1 = tmp[row];
1009       sum2 = tmp[row -1];
1010       v2   = aa + ai[row]-1;
1011       for (j=nz ; j>1; j-=2){
1012         vi  -=2;
1013         i0   = vi[2];
1014         i1   = vi[1];
1015         tmp0 = tmps[i0];
1016         tmp1 = tmps[i1];
1017         v1   -= 2;
1018         v2   -= 2;
1019         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021       }
1022       if (j==1){
1023         tmp0  = tmps[*vi--];
1024         sum1 -= *v1-- * tmp0;
1025         sum2 -= *v2-- * tmp0;
1026       }
1027 
1028       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029       sum2   -= *v2-- * tmp0;
1030       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031       break;
1032     case 3 :
1033       sum1 = tmp[row];
1034       sum2 = tmp[row -1];
1035       sum3 = tmp[row -2];
1036       v2   = aa + ai[row]-1;
1037       v3   = aa + ai[row -1]-1;
1038       for (j=nz ; j>1; j-=2){
1039         vi  -=2;
1040         i0   = vi[2];
1041         i1   = vi[1];
1042         tmp0 = tmps[i0];
1043         tmp1 = tmps[i1];
1044         v1   -= 2;
1045         v2   -= 2;
1046         v3   -= 2;
1047         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050       }
1051       if (j==1){
1052         tmp0  = tmps[*vi--];
1053         sum1 -= *v1-- * tmp0;
1054         sum2 -= *v2-- * tmp0;
1055         sum3 -= *v3-- * tmp0;
1056       }
1057       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058       sum2   -= *v2-- * tmp0;
1059       sum3   -= *v3-- * tmp0;
1060       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061       sum3   -= *v3-- * tmp0;
1062       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1063 
1064       break;
1065     case 4 :
1066       sum1 = tmp[row];
1067       sum2 = tmp[row -1];
1068       sum3 = tmp[row -2];
1069       sum4 = tmp[row -3];
1070       v2   = aa + ai[row]-1;
1071       v3   = aa + ai[row -1]-1;
1072       v4   = aa + ai[row -2]-1;
1073 
1074       for (j=nz ; j>1; j-=2){
1075         vi  -=2;
1076         i0   = vi[2];
1077         i1   = vi[1];
1078         tmp0 = tmps[i0];
1079         tmp1 = tmps[i1];
1080         v1  -= 2;
1081         v2  -= 2;
1082         v3  -= 2;
1083         v4  -= 2;
1084         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088       }
1089       if (j==1){
1090         tmp0  = tmps[*vi--];
1091         sum1 -= *v1-- * tmp0;
1092         sum2 -= *v2-- * tmp0;
1093         sum3 -= *v3-- * tmp0;
1094         sum4 -= *v4-- * tmp0;
1095       }
1096 
1097       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098       sum2   -= *v2-- * tmp0;
1099       sum3   -= *v3-- * tmp0;
1100       sum4   -= *v4-- * tmp0;
1101       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102       sum3   -= *v3-- * tmp0;
1103       sum4   -= *v4-- * tmp0;
1104       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105       sum4   -= *v4-- * tmp0;
1106       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107       break;
1108     case 5 :
1109       sum1 = tmp[row];
1110       sum2 = tmp[row -1];
1111       sum3 = tmp[row -2];
1112       sum4 = tmp[row -3];
1113       sum5 = tmp[row -4];
1114       v2   = aa + ai[row]-1;
1115       v3   = aa + ai[row -1]-1;
1116       v4   = aa + ai[row -2]-1;
1117       v5   = aa + ai[row -3]-1;
1118       for (j=nz ; j>1; j-=2){
1119         vi  -= 2;
1120         i0   = vi[2];
1121         i1   = vi[1];
1122         tmp0 = tmps[i0];
1123         tmp1 = tmps[i1];
1124         v1   -= 2;
1125         v2   -= 2;
1126         v3   -= 2;
1127         v4   -= 2;
1128         v5   -= 2;
1129         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134       }
1135       if (j==1){
1136         tmp0  = tmps[*vi--];
1137         sum1 -= *v1-- * tmp0;
1138         sum2 -= *v2-- * tmp0;
1139         sum3 -= *v3-- * tmp0;
1140         sum4 -= *v4-- * tmp0;
1141         sum5 -= *v5-- * tmp0;
1142       }
1143 
1144       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145       sum2   -= *v2-- * tmp0;
1146       sum3   -= *v3-- * tmp0;
1147       sum4   -= *v4-- * tmp0;
1148       sum5   -= *v5-- * tmp0;
1149       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150       sum3   -= *v3-- * tmp0;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154       sum4   -= *v4-- * tmp0;
1155       sum5   -= *v5-- * tmp0;
1156       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157       sum5   -= *v5-- * tmp0;
1158       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159       break;
1160     default:
1161       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1162     }
1163   }
1164   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1165   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1166   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1168   ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr);
1169   PetscFunctionReturn(0);
1170 }
1171 
1172 #undef __FUNCT__
1173 #define __FUNCT__ "MatLUFactorNumeric_Inode"
1174 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1175 {
1176   Mat               C = *B;
1177   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1178   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1179   PetscErrorCode    ierr;
1180   PetscInt          *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1181   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1182   PetscInt          *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1183   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1184   PetscScalar       mul1,mul2,mul3,tmp;
1185   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1186   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1187   PetscReal         rs=0.0;
1188   LUShift_Ctx       sctx;
1189   PetscInt          newshift;
1190 
1191   PetscFunctionBegin;
1192   sctx.shift_top  = 0;
1193   sctx.nshift_max = 0;
1194   sctx.shift_lo   = 0;
1195   sctx.shift_hi   = 0;
1196 
1197   /* if both shift schemes are chosen by user, only use info->shiftpd */
1198   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1199   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1200     sctx.shift_top = 0;
1201     for (i=0; i<n; i++) {
1202       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1203       rs    = 0.0;
1204       ajtmp = aj + ai[i];
1205       rtmp1 = aa + ai[i];
1206       nz = ai[i+1] - ai[i];
1207       for (j=0; j<nz; j++){
1208         if (*ajtmp != i){
1209           rs += PetscAbsScalar(*rtmp1++);
1210         } else {
1211           rs -= PetscRealPart(*rtmp1++);
1212         }
1213         ajtmp++;
1214       }
1215       if (rs>sctx.shift_top) sctx.shift_top = rs;
1216     }
1217     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1218     sctx.shift_top *= 1.1;
1219     sctx.nshift_max = 5;
1220     sctx.shift_lo   = 0.;
1221     sctx.shift_hi   = 1.;
1222   }
1223   sctx.shift_amount = 0;
1224   sctx.nshift       = 0;
1225 
1226   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1227   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1228   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1229   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1230   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1231   ics   = ic ;
1232   rtmp22 = rtmp11 + n;
1233   rtmp33 = rtmp22 + n;
1234 
1235   node_max = a->inode.node_count;
1236   ns       = a->inode.size;
1237   if (!ns){
1238     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1239   }
1240 
1241   /* If max inode size > 3, split it into two inodes.*/
1242   /* also map the inode sizes according to the ordering */
1243   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1244   for (i=0,j=0; i<node_max; ++i,++j){
1245     if (ns[i]>3) {
1246       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1247       ++j;
1248       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1249     } else {
1250       tmp_vec1[j] = ns[i];
1251     }
1252   }
1253   /* Use the correct node_max */
1254   node_max = j;
1255 
1256   /* Now reorder the inode info based on mat re-ordering info */
1257   /* First create a row -> inode_size_array_index map */
1258   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1259   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1260   for (i=0,row=0; i<node_max; i++) {
1261     nodesz = tmp_vec1[i];
1262     for (j=0; j<nodesz; j++,row++) {
1263       nsmap[row] = i;
1264     }
1265   }
1266   /* Using nsmap, create a reordered ns structure */
1267   for (i=0,j=0; i< node_max; i++) {
1268     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1269     tmp_vec2[i]  = nodesz;
1270     j           += nodesz;
1271   }
1272   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1273   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1274   /* Now use the correct ns */
1275   ns = tmp_vec2;
1276 
1277   do {
1278     sctx.lushift = PETSC_FALSE;
1279     /* Now loop over each block-row, and do the factorization */
1280     for (i=0,row=0; i<node_max; i++) {
1281       nodesz = ns[i];
1282       nz     = bi[row+1] - bi[row];
1283       bjtmp  = bj + bi[row];
1284 
1285       switch (nodesz){
1286       case 1:
1287         for  (j=0; j<nz; j++){
1288           idx        = bjtmp[j];
1289           rtmp11[idx] = 0.0;
1290         }
1291 
1292         /* load in initial (unfactored row) */
1293         idx    = r[row];
1294         nz_tmp = ai[idx+1] - ai[idx];
1295         ajtmp  = aj + ai[idx];
1296         v1     = aa + ai[idx];
1297 
1298         for (j=0; j<nz_tmp; j++) {
1299           idx        = ics[ajtmp[j]];
1300           rtmp11[idx] = v1[j];
1301         }
1302         rtmp11[ics[r[row]]] += sctx.shift_amount;
1303 
1304         prow = *bjtmp++ ;
1305         while (prow < row) {
1306           pc1 = rtmp11 + prow;
1307           if (*pc1 != 0.0){
1308             pv   = ba + bd[prow];
1309             pj   = nbj + bd[prow];
1310             mul1 = *pc1 * *pv++;
1311             *pc1 = mul1;
1312             nz_tmp = bi[prow+1] - bd[prow] - 1;
1313             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1314             for (j=0; j<nz_tmp; j++) {
1315               tmp = pv[j];
1316               idx = pj[j];
1317               rtmp11[idx] -= mul1 * tmp;
1318             }
1319           }
1320           prow = *bjtmp++ ;
1321         }
1322         pj  = bj + bi[row];
1323         pc1 = ba + bi[row];
1324 
1325         sctx.pv    = rtmp11[row];
1326         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1327         rs         = 0.0;
1328         for (j=0; j<nz; j++) {
1329           idx    = pj[j];
1330           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1331           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1332         }
1333         sctx.rs  = rs;
1334         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1335         if (newshift == 1) goto endofwhile;
1336         break;
1337 
1338       case 2:
1339         for (j=0; j<nz; j++) {
1340           idx        = bjtmp[j];
1341           rtmp11[idx] = 0.0;
1342           rtmp22[idx] = 0.0;
1343         }
1344 
1345         /* load in initial (unfactored row) */
1346         idx    = r[row];
1347         nz_tmp = ai[idx+1] - ai[idx];
1348         ajtmp  = aj + ai[idx];
1349         v1     = aa + ai[idx];
1350         v2     = aa + ai[idx+1];
1351         for (j=0; j<nz_tmp; j++) {
1352           idx        = ics[ajtmp[j]];
1353           rtmp11[idx] = v1[j];
1354           rtmp22[idx] = v2[j];
1355         }
1356         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1357         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1358 
1359         prow = *bjtmp++ ;
1360         while (prow < row) {
1361           pc1 = rtmp11 + prow;
1362           pc2 = rtmp22 + prow;
1363           if (*pc1 != 0.0 || *pc2 != 0.0){
1364             pv   = ba + bd[prow];
1365             pj   = nbj + bd[prow];
1366             mul1 = *pc1 * *pv;
1367             mul2 = *pc2 * *pv;
1368             ++pv;
1369             *pc1 = mul1;
1370             *pc2 = mul2;
1371 
1372             nz_tmp = bi[prow+1] - bd[prow] - 1;
1373             for (j=0; j<nz_tmp; j++) {
1374               tmp = pv[j];
1375               idx = pj[j];
1376               rtmp11[idx] -= mul1 * tmp;
1377               rtmp22[idx] -= mul2 * tmp;
1378             }
1379             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1380           }
1381           prow = *bjtmp++ ;
1382         }
1383 
1384         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1385         pc1 = rtmp11 + prow;
1386         pc2 = rtmp22 + prow;
1387 
1388         sctx.pv = *pc1;
1389         pj      = bj + bi[prow];
1390         rs      = 0.0;
1391         for (j=0; j<nz; j++){
1392           idx = pj[j];
1393           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1394         }
1395         sctx.rs = rs;
1396         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1397         if (newshift == 1) goto endofwhile;
1398 
1399         if (*pc2 != 0.0){
1400           pj     = nbj + bd[prow];
1401           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1402           *pc2   = mul2;
1403           nz_tmp = bi[prow+1] - bd[prow] - 1;
1404           for (j=0; j<nz_tmp; j++) {
1405             idx = pj[j] ;
1406             tmp = rtmp11[idx];
1407             rtmp22[idx] -= mul2 * tmp;
1408           }
1409           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1410         }
1411 
1412         pj  = bj + bi[row];
1413         pc1 = ba + bi[row];
1414         pc2 = ba + bi[row+1];
1415 
1416         sctx.pv = rtmp22[row+1];
1417         rs = 0.0;
1418         rtmp11[row]   = 1.0/rtmp11[row];
1419         rtmp22[row+1] = 1.0/rtmp22[row+1];
1420         /* copy row entries from dense representation to sparse */
1421         for (j=0; j<nz; j++) {
1422           idx    = pj[j];
1423           pc1[j] = rtmp11[idx];
1424           pc2[j] = rtmp22[idx];
1425           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1426         }
1427         sctx.rs = rs;
1428         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1429         if (newshift == 1) goto endofwhile;
1430         break;
1431 
1432       case 3:
1433         for  (j=0; j<nz; j++) {
1434           idx        = bjtmp[j];
1435           rtmp11[idx] = 0.0;
1436           rtmp22[idx] = 0.0;
1437           rtmp33[idx] = 0.0;
1438         }
1439         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1440         idx    = r[row];
1441         nz_tmp = ai[idx+1] - ai[idx];
1442         ajtmp = aj + ai[idx];
1443         v1    = aa + ai[idx];
1444         v2    = aa + ai[idx+1];
1445         v3    = aa + ai[idx+2];
1446         for (j=0; j<nz_tmp; j++) {
1447           idx        = ics[ajtmp[j]];
1448           rtmp11[idx] = v1[j];
1449           rtmp22[idx] = v2[j];
1450           rtmp33[idx] = v3[j];
1451         }
1452         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1453         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1454         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1455 
1456         /* loop over all pivot row blocks above this row block */
1457         prow = *bjtmp++ ;
1458         while (prow < row) {
1459           pc1 = rtmp11 + prow;
1460           pc2 = rtmp22 + prow;
1461           pc3 = rtmp33 + prow;
1462           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1463             pv   = ba  + bd[prow];
1464             pj   = nbj + bd[prow];
1465             mul1 = *pc1 * *pv;
1466             mul2 = *pc2 * *pv;
1467             mul3 = *pc3 * *pv;
1468             ++pv;
1469             *pc1 = mul1;
1470             *pc2 = mul2;
1471             *pc3 = mul3;
1472 
1473             nz_tmp = bi[prow+1] - bd[prow] - 1;
1474             /* update this row based on pivot row */
1475             for (j=0; j<nz_tmp; j++) {
1476               tmp = pv[j];
1477               idx = pj[j];
1478               rtmp11[idx] -= mul1 * tmp;
1479               rtmp22[idx] -= mul2 * tmp;
1480               rtmp33[idx] -= mul3 * tmp;
1481             }
1482             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1483           }
1484           prow = *bjtmp++ ;
1485         }
1486 
1487         /* Now take care of diagonal 3x3 block in this set of rows */
1488         /* note: prow = row here */
1489         pc1 = rtmp11 + prow;
1490         pc2 = rtmp22 + prow;
1491         pc3 = rtmp33 + prow;
1492 
1493         sctx.pv = *pc1;
1494         pj      = bj + bi[prow];
1495         rs      = 0.0;
1496         for (j=0; j<nz; j++){
1497           idx = pj[j];
1498           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1499         }
1500         sctx.rs = rs;
1501         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1502         if (newshift == 1) goto endofwhile;
1503 
1504         if (*pc2 != 0.0 || *pc3 != 0.0){
1505           mul2 = (*pc2)/(*pc1);
1506           mul3 = (*pc3)/(*pc1);
1507           *pc2 = mul2;
1508           *pc3 = mul3;
1509           nz_tmp = bi[prow+1] - bd[prow] - 1;
1510           pj     = nbj + bd[prow];
1511           for (j=0; j<nz_tmp; j++) {
1512             idx = pj[j] ;
1513             tmp = rtmp11[idx];
1514             rtmp22[idx] -= mul2 * tmp;
1515             rtmp33[idx] -= mul3 * tmp;
1516           }
1517           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1518         }
1519         ++prow;
1520 
1521         pc2 = rtmp22 + prow;
1522         pc3 = rtmp33 + prow;
1523         sctx.pv = *pc2;
1524         pj      = bj + bi[prow];
1525         rs      = 0.0;
1526         for (j=0; j<nz; j++){
1527           idx = pj[j];
1528           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1529         }
1530         sctx.rs = rs;
1531         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1532         if (newshift == 1) goto endofwhile;
1533 
1534         if (*pc3 != 0.0){
1535           mul3   = (*pc3)/(*pc2);
1536           *pc3   = mul3;
1537           pj     = nbj + bd[prow];
1538           nz_tmp = bi[prow+1] - bd[prow] - 1;
1539           for (j=0; j<nz_tmp; j++) {
1540             idx = pj[j] ;
1541             tmp = rtmp22[idx];
1542             rtmp33[idx] -= mul3 * tmp;
1543           }
1544           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1545         }
1546 
1547         pj  = bj + bi[row];
1548         pc1 = ba + bi[row];
1549         pc2 = ba + bi[row+1];
1550         pc3 = ba + bi[row+2];
1551 
1552         sctx.pv = rtmp33[row+2];
1553         rs = 0.0;
1554         rtmp11[row]   = 1.0/rtmp11[row];
1555         rtmp22[row+1] = 1.0/rtmp22[row+1];
1556         rtmp33[row+2] = 1.0/rtmp33[row+2];
1557         /* copy row entries from dense representation to sparse */
1558         for (j=0; j<nz; j++) {
1559           idx    = pj[j];
1560           pc1[j] = rtmp11[idx];
1561           pc2[j] = rtmp22[idx];
1562           pc3[j] = rtmp33[idx];
1563           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1564         }
1565 
1566         sctx.rs = rs;
1567         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1568         if (newshift == 1) goto endofwhile;
1569         break;
1570 
1571       default:
1572         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1573       }
1574       row += nodesz;                 /* Update the row */
1575     }
1576     endofwhile:;
1577   } while (sctx.lushift);
1578   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
1579   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1580   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1581   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1582   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1583   C->factor      = MAT_FACTOR_LU;
1584   C->assembled   = PETSC_TRUE;
1585   C->preallocated = PETSC_TRUE;
1586   if (sctx.nshift) {
1587     if (info->shiftnz) {
1588       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1589     } else if (info->shiftpd) {
1590       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1591     }
1592   }
1593   ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr);
1594   PetscFunctionReturn(0);
1595 }
1596 
1597 /*
1598      Makes a longer coloring[] array and calls the usual code with that
1599 */
1600 #undef __FUNCT__
1601 #define __FUNCT__ "MatColoringPatch_Inode"
1602 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1603 {
1604   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1605   PetscErrorCode  ierr;
1606   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1607   PetscInt        *colorused,i;
1608   ISColoringValue *newcolor;
1609 
1610   PetscFunctionBegin;
1611   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1612   /* loop over inodes, marking a color for each column*/
1613   row = 0;
1614   for (i=0; i<m; i++){
1615     for (j=0; j<ns[i]; j++) {
1616       newcolor[row++] = coloring[i] + j*ncolors;
1617     }
1618   }
1619 
1620   /* eliminate unneeded colors */
1621   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1622   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1623   for (i=0; i<n; i++) {
1624     colorused[newcolor[i]] = 1;
1625   }
1626 
1627   for (i=1; i<5*ncolors; i++) {
1628     colorused[i] += colorused[i-1];
1629   }
1630   ncolors = colorused[5*ncolors-1];
1631   for (i=0; i<n; i++) {
1632     newcolor[i] = colorused[newcolor[i]]-1;
1633   }
1634   ierr = PetscFree(colorused);CHKERRQ(ierr);
1635   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1636   ierr = PetscFree(coloring);CHKERRQ(ierr);
1637   PetscFunctionReturn(0);
1638 }
1639 
1640 #include "src/inline/ilu.h"
1641 
1642 #undef __FUNCT__
1643 #define __FUNCT__ "MatRelax_Inode"
1644 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1645 {
1646   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1647   PetscScalar        *x,*xs,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1648   MatScalar          *ibdiag,*bdiag;
1649   PetscScalar        *b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1650   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
1651   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1652   PetscErrorCode     ierr;
1653   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1654   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;
1655 
1656   PetscFunctionBegin;
1657   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1658   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1659   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode");
1660 
1661   if (!a->inode.ibdiagvalid) {
1662     if (!a->inode.ibdiag) {
1663       /* calculate space needed for diagonal blocks */
1664       for (i=0; i<m; i++) {
1665 	cnt += sizes[i]*sizes[i];
1666       }
1667       a->inode.bdiagsize = cnt;
1668       ierr   = PetscMalloc2(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag);CHKERRQ(ierr);
1669     }
1670 
1671     /* copy over the diagonal blocks and invert them */
1672     ibdiag = a->inode.ibdiag;
1673     bdiag  = a->inode.bdiag;
1674     cnt = 0;
1675     for (i=0, row = 0; i<m; i++) {
1676       for (j=0; j<sizes[i]; j++) {
1677         for (k=0; k<sizes[i]; k++) {
1678           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1679         }
1680       }
1681       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
1682 
1683       switch(sizes[i]) {
1684         case 1:
1685           /* Create matrix data structure */
1686           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1687           ibdiag[cnt] = 1.0/ibdiag[cnt];
1688           break;
1689         case 2:
1690           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
1691           break;
1692         case 3:
1693           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
1694           break;
1695         case 4:
1696           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
1697           break;
1698         case 5:
1699           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr);
1700           break;
1701        default:
1702 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1703       }
1704       cnt += sizes[i]*sizes[i];
1705       row += sizes[i];
1706     }
1707     a->inode.ibdiagvalid = PETSC_TRUE;
1708   }
1709   ibdiag = a->inode.ibdiag;
1710   bdiag  = a->inode.bdiag;
1711 
1712   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1713   if (xx != bb) {
1714     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1715   } else {
1716     b = x;
1717   }
1718 
1719   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1720   xs   = x;
1721   if (flag & SOR_ZERO_INITIAL_GUESS) {
1722     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1723 
1724       for (i=0, row=0; i<m; i++) {
1725         sz  = diag[row] - ii[row];
1726         v1  = a->a + ii[row];
1727         idx = a->j + ii[row];
1728 
1729         /* see comments for MatMult_Inode() for how this is coded */
1730         switch (sizes[i]){
1731           case 1:
1732 
1733             sum1  = b[row];
1734             for(n = 0; n<sz-1; n+=2) {
1735               i1   = idx[0];
1736               i2   = idx[1];
1737               idx += 2;
1738               tmp0 = x[i1];
1739               tmp1 = x[i2];
1740               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1741             }
1742 
1743             if (n == sz-1){
1744               tmp0  = x[*idx];
1745               sum1 -= *v1 * tmp0;
1746             }
1747             x[row++] = sum1*(*ibdiag++);
1748             break;
1749           case 2:
1750             v2    = a->a + ii[row+1];
1751             sum1  = b[row];
1752             sum2  = b[row+1];
1753             for(n = 0; n<sz-1; n+=2) {
1754               i1   = idx[0];
1755               i2   = idx[1];
1756               idx += 2;
1757               tmp0 = x[i1];
1758               tmp1 = x[i2];
1759               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1760               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1761             }
1762 
1763             if (n == sz-1){
1764               tmp0  = x[*idx];
1765               sum1 -= v1[0] * tmp0;
1766               sum2 -= v2[0] * tmp0;
1767             }
1768             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1769             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1770             ibdiag  += 4;
1771             break;
1772           case 3:
1773             v2    = a->a + ii[row+1];
1774             v3    = a->a + ii[row+2];
1775             sum1  = b[row];
1776             sum2  = b[row+1];
1777             sum3  = b[row+2];
1778             for(n = 0; n<sz-1; n+=2) {
1779               i1   = idx[0];
1780               i2   = idx[1];
1781               idx += 2;
1782               tmp0 = x[i1];
1783               tmp1 = x[i2];
1784               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1785               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1786               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1787             }
1788 
1789             if (n == sz-1){
1790               tmp0  = x[*idx];
1791               sum1 -= v1[0] * tmp0;
1792               sum2 -= v2[0] * tmp0;
1793               sum3 -= v3[0] * tmp0;
1794             }
1795             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1796             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1797             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1798             ibdiag  += 9;
1799             break;
1800           case 4:
1801             v2    = a->a + ii[row+1];
1802             v3    = a->a + ii[row+2];
1803             v4    = a->a + ii[row+3];
1804             sum1  = b[row];
1805             sum2  = b[row+1];
1806             sum3  = b[row+2];
1807             sum4  = b[row+3];
1808             for(n = 0; n<sz-1; n+=2) {
1809               i1   = idx[0];
1810               i2   = idx[1];
1811               idx += 2;
1812               tmp0 = x[i1];
1813               tmp1 = x[i2];
1814               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1815               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1816               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1817               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1818             }
1819 
1820             if (n == sz-1){
1821               tmp0  = x[*idx];
1822               sum1 -= v1[0] * tmp0;
1823               sum2 -= v2[0] * tmp0;
1824               sum3 -= v3[0] * tmp0;
1825               sum4 -= v4[0] * tmp0;
1826             }
1827             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1828             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1829             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1830             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1831             ibdiag  += 16;
1832             break;
1833           case 5:
1834             v2    = a->a + ii[row+1];
1835             v3    = a->a + ii[row+2];
1836             v4    = a->a + ii[row+3];
1837             v5    = a->a + ii[row+4];
1838             sum1  = b[row];
1839             sum2  = b[row+1];
1840             sum3  = b[row+2];
1841             sum4  = b[row+3];
1842             sum5  = b[row+4];
1843             for(n = 0; n<sz-1; n+=2) {
1844               i1   = idx[0];
1845               i2   = idx[1];
1846               idx += 2;
1847               tmp0 = x[i1];
1848               tmp1 = x[i2];
1849               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1850               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1851               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1852               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1853               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1854             }
1855 
1856             if (n == sz-1){
1857               tmp0  = x[*idx];
1858               sum1 -= v1[0] * tmp0;
1859               sum2 -= v2[0] * tmp0;
1860               sum3 -= v3[0] * tmp0;
1861               sum4 -= v4[0] * tmp0;
1862               sum5 -= v5[0] * tmp0;
1863             }
1864             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1865             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1866             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1867             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1868             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1869             ibdiag  += 25;
1870             break;
1871 	  default:
1872    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1873         }
1874       }
1875 
1876       xb = x;
1877       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1878     } else xb = b;
1879     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1880         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1881       cnt = 0;
1882       for (i=0, row=0; i<m; i++) {
1883 
1884         switch (sizes[i]){
1885           case 1:
1886             x[row++] *= bdiag[cnt++];
1887             break;
1888           case 2:
1889             x1   = x[row]; x2 = x[row+1];
1890             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1891             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1892             x[row++] = tmp1;
1893             x[row++] = tmp2;
1894             cnt += 4;
1895             break;
1896           case 3:
1897             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1898             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1899             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1900             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1901             x[row++] = tmp1;
1902             x[row++] = tmp2;
1903             x[row++] = tmp3;
1904             cnt += 9;
1905             break;
1906           case 4:
1907             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1908             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1909             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1910             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1911             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1912             x[row++] = tmp1;
1913             x[row++] = tmp2;
1914             x[row++] = tmp3;
1915             x[row++] = tmp4;
1916             cnt += 16;
1917             break;
1918           case 5:
1919             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1920             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1921             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1922             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1923             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1924             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1925             x[row++] = tmp1;
1926             x[row++] = tmp2;
1927             x[row++] = tmp3;
1928             x[row++] = tmp4;
1929             x[row++] = tmp5;
1930             cnt += 25;
1931             break;
1932 	  default:
1933    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1934         }
1935       }
1936       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1937     }
1938     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1939 
1940       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1941       for (i=m-1, row=A->rmap.n-1; i>=0; i--) {
1942         ibdiag -= sizes[i]*sizes[i];
1943         sz      = ii[row+1] - diag[row] - 1;
1944         v1      = a->a + diag[row] + 1;
1945         idx     = a->j + diag[row] + 1;
1946 
1947         /* see comments for MatMult_Inode() for how this is coded */
1948         switch (sizes[i]){
1949           case 1:
1950 
1951             sum1  = xb[row];
1952             for(n = 0; n<sz-1; n+=2) {
1953               i1   = idx[0];
1954               i2   = idx[1];
1955               idx += 2;
1956               tmp0 = x[i1];
1957               tmp1 = x[i2];
1958               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1959             }
1960 
1961             if (n == sz-1){
1962               tmp0  = x[*idx];
1963               sum1 -= *v1*tmp0;
1964             }
1965             x[row--] = sum1*(*ibdiag);
1966             break;
1967 
1968           case 2:
1969 
1970             sum1  = xb[row];
1971             sum2  = xb[row-1];
1972             /* note that sum1 is associated with the second of the two rows */
1973             v2    = a->a + diag[row-1] + 2;
1974             for(n = 0; n<sz-1; n+=2) {
1975               i1   = idx[0];
1976               i2   = idx[1];
1977               idx += 2;
1978               tmp0 = x[i1];
1979               tmp1 = x[i2];
1980               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1981               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1982             }
1983 
1984             if (n == sz-1){
1985               tmp0  = x[*idx];
1986               sum1 -= *v1*tmp0;
1987               sum2 -= *v2*tmp0;
1988             }
1989             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1990             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1991             break;
1992           case 3:
1993 
1994             sum1  = xb[row];
1995             sum2  = xb[row-1];
1996             sum3  = xb[row-2];
1997             v2    = a->a + diag[row-1] + 2;
1998             v3    = a->a + diag[row-2] + 3;
1999             for(n = 0; n<sz-1; n+=2) {
2000               i1   = idx[0];
2001               i2   = idx[1];
2002               idx += 2;
2003               tmp0 = x[i1];
2004               tmp1 = x[i2];
2005               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2006               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2007               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2008             }
2009 
2010             if (n == sz-1){
2011               tmp0  = x[*idx];
2012               sum1 -= *v1*tmp0;
2013               sum2 -= *v2*tmp0;
2014               sum3 -= *v3*tmp0;
2015             }
2016             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2017             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2018             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2019             break;
2020           case 4:
2021 
2022             sum1  = xb[row];
2023             sum2  = xb[row-1];
2024             sum3  = xb[row-2];
2025             sum4  = xb[row-3];
2026             v2    = a->a + diag[row-1] + 2;
2027             v3    = a->a + diag[row-2] + 3;
2028             v4    = a->a + diag[row-3] + 4;
2029             for(n = 0; n<sz-1; n+=2) {
2030               i1   = idx[0];
2031               i2   = idx[1];
2032               idx += 2;
2033               tmp0 = x[i1];
2034               tmp1 = x[i2];
2035               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2036               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2037               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2038               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2039             }
2040 
2041             if (n == sz-1){
2042               tmp0  = x[*idx];
2043               sum1 -= *v1*tmp0;
2044               sum2 -= *v2*tmp0;
2045               sum3 -= *v3*tmp0;
2046               sum4 -= *v4*tmp0;
2047             }
2048             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2049             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2050             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2051             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2052             break;
2053           case 5:
2054 
2055             sum1  = xb[row];
2056             sum2  = xb[row-1];
2057             sum3  = xb[row-2];
2058             sum4  = xb[row-3];
2059             sum5  = xb[row-4];
2060             v2    = a->a + diag[row-1] + 2;
2061             v3    = a->a + diag[row-2] + 3;
2062             v4    = a->a + diag[row-3] + 4;
2063             v5    = a->a + diag[row-4] + 5;
2064             for(n = 0; n<sz-1; n+=2) {
2065               i1   = idx[0];
2066               i2   = idx[1];
2067               idx += 2;
2068               tmp0 = x[i1];
2069               tmp1 = x[i2];
2070               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2071               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2072               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2073               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2074               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2075             }
2076 
2077             if (n == sz-1){
2078               tmp0  = x[*idx];
2079               sum1 -= *v1*tmp0;
2080               sum2 -= *v2*tmp0;
2081               sum3 -= *v3*tmp0;
2082               sum4 -= *v4*tmp0;
2083               sum5 -= *v5*tmp0;
2084             }
2085             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2086             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2087             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2088             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2089             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2090             break;
2091 	  default:
2092    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2093         }
2094       }
2095 
2096       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2097     }
2098     its--;
2099   }
2100   if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode");
2101   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2102   if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2103   PetscFunctionReturn(0);
2104 }
2105 
2106 
2107 /*
2108     samestructure indicates that the matrix has not changed its nonzero structure so we
2109     do not need to recompute the inodes
2110 */
2111 #undef __FUNCT__
2112 #define __FUNCT__ "Mat_CheckInode"
2113 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2114 {
2115   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2116   PetscErrorCode ierr;
2117   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2118   PetscTruth     flag;
2119 
2120   PetscFunctionBegin;
2121   if (!a->inode.use)                     PetscFunctionReturn(0);
2122   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2123 
2124 
2125   m = A->rmap.n;
2126   if (a->inode.size) {ns = a->inode.size;}
2127   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2128 
2129   i          = 0;
2130   node_count = 0;
2131   idx        = a->j;
2132   ii         = a->i;
2133   while (i < m){                /* For each row */
2134     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2135     /* Limits the number of elements in a node to 'a->inode.limit' */
2136     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2137       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2138       if (nzy != nzx) break;
2139       idy  += nzx;             /* Same nonzero pattern */
2140       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2141       if (!flag) break;
2142     }
2143     ns[node_count++] = blk_size;
2144     idx += blk_size*nzx;
2145     i    = j;
2146   }
2147   /* If not enough inodes found,, do not use inode version of the routines */
2148   if (!a->inode.size && m && node_count > .9*m) {
2149     ierr = PetscFree(ns);CHKERRQ(ierr);
2150     a->inode.node_count     = 0;
2151     a->inode.size           = PETSC_NULL;
2152     a->inode.use            = PETSC_FALSE;
2153     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2154   } else {
2155     A->ops->mult            = MatMult_Inode;
2156     A->ops->relax           = MatRelax_Inode;
2157     A->ops->multadd         = MatMultAdd_Inode;
2158     A->ops->solve           = MatSolve_Inode;
2159     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
2160     A->ops->getrowij        = MatGetRowIJ_Inode;
2161     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
2162     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
2163     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2164     A->ops->coloringpatch   = MatColoringPatch_Inode;
2165     a->inode.node_count     = node_count;
2166     a->inode.size           = ns;
2167     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2168   }
2169   PetscFunctionReturn(0);
2170 }
2171 
2172 /*
2173      This is really ugly. if inodes are used this replaces the
2174   permutations with ones that correspond to rows/cols of the matrix
2175   rather then inode blocks
2176 */
2177 #undef __FUNCT__
2178 #define __FUNCT__ "MatInodeAdjustForInodes"
2179 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2180 {
2181   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2182 
2183   PetscFunctionBegin;
2184   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2185   if (f) {
2186     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2187   }
2188   PetscFunctionReturn(0);
2189 }
2190 
2191 EXTERN_C_BEGIN
2192 #undef __FUNCT__
2193 #define __FUNCT__ "MatAdjustForInodes_Inode"
2194 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2195 {
2196   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2197   PetscErrorCode ierr;
2198   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
2199   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2200   PetscInt       nslim_col,*ns_col;
2201   IS             ris = *rperm,cis = *cperm;
2202 
2203   PetscFunctionBegin;
2204   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2205   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2206 
2207   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2208   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2209   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2210   permc = permr + m;
2211 
2212   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2213   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2214 
2215   /* Form the inode structure for the rows of permuted matric using inv perm*/
2216   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2217 
2218   /* Construct the permutations for rows*/
2219   for (i=0,row = 0; i<nslim_row; ++i){
2220     indx      = ridx[i];
2221     start_val = tns[indx];
2222     end_val   = tns[indx + 1];
2223     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2224   }
2225 
2226   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2227   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2228 
2229  /* Construct permutations for columns */
2230   for (i=0,col=0; i<nslim_col; ++i){
2231     indx      = cidx[i];
2232     start_val = tns[indx];
2233     end_val   = tns[indx + 1];
2234     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2235   }
2236 
2237   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2238   ISSetPermutation(*rperm);
2239   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2240   ISSetPermutation(*cperm);
2241 
2242   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2243   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2244 
2245   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2246   ierr = PetscFree(permr);CHKERRQ(ierr);
2247   ierr = ISDestroy(cis);CHKERRQ(ierr);
2248   ierr = ISDestroy(ris);CHKERRQ(ierr);
2249   ierr = PetscFree(tns);CHKERRQ(ierr);
2250   PetscFunctionReturn(0);
2251 }
2252 EXTERN_C_END
2253 
2254 #undef __FUNCT__
2255 #define __FUNCT__ "MatInodeGetInodeSizes"
2256 /*@C
2257    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2258 
2259    Collective on Mat
2260 
2261    Input Parameter:
2262 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2263 
2264    Output Parameter:
2265 +  node_count - no of inodes present in the matrix.
2266 .  sizes      - an array of size node_count,with sizes of each inode.
2267 -  limit      - the max size used to generate the inodes.
2268 
2269    Level: advanced
2270 
2271    Notes: This routine returns some internal storage information
2272    of the matrix, it is intended to be used by advanced users.
2273    It should be called after the matrix is assembled.
2274    The contents of the sizes[] array should not be changed.
2275    PETSC_NULL may be passed for information not requested.
2276 
2277 .keywords: matrix, seqaij, get, inode
2278 
2279 .seealso: MatGetInfo()
2280 @*/
2281 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2282 {
2283   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2284 
2285   PetscFunctionBegin;
2286   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2287   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2288   if (f) {
2289     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2290   }
2291   PetscFunctionReturn(0);
2292 }
2293 
2294 EXTERN_C_BEGIN
2295 #undef __FUNCT__
2296 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
2297 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2298 {
2299   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2300 
2301   PetscFunctionBegin;
2302   if (node_count) *node_count = a->inode.node_count;
2303   if (sizes)      *sizes      = a->inode.size;
2304   if (limit)      *limit      = a->inode.limit;
2305   PetscFunctionReturn(0);
2306 }
2307 EXTERN_C_END
2308