xref: /petsc/src/mat/impls/aij/seq/inode.c (revision b29801fcf26fbf6d208dd8c8ba676caeae7da2a8)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap.n;
19   m      = A->rmap.n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap.n;
72   n         = A->cmap.n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap.n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242 
243   if (symmetric) {
244     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
245   } else {
246     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
247   }
248   PetscFunctionReturn(0);
249 }
250 
251 #undef __FUNCT__
252 #define __FUNCT__ "MatRestoreRowIJ_Inode"
253 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
254 {
255   PetscErrorCode ierr;
256 
257   PetscFunctionBegin;
258   if (!ia) PetscFunctionReturn(0);
259   ierr = PetscFree(*ia);CHKERRQ(ierr);
260   ierr = PetscFree(*ja);CHKERRQ(ierr);
261   PetscFunctionReturn(0);
262 }
263 
264 /* ----------------------------------------------------------- */
265 
266 #undef __FUNCT__
267 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
268 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
269 {
270   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
271   PetscErrorCode ierr;
272   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
273   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
274 
275   PetscFunctionBegin;
276   nslim_row = a->inode.node_count;
277   n         = A->cmap.n;
278 
279   /* Create The column_inode for this matrix */
280   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
281 
282   /* allocate space for reformated column_inode structure */
283   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
284   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
285   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
286 
287   for (i1=0,col=0; i1<nslim_col; ++i1){
288     nsz = ns_col[i1];
289     for (i2=0; i2<nsz; ++i2,++col)
290       tvc[col] = i1;
291   }
292   /* allocate space for column pointers */
293   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
294   *iia = ia;
295   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
296   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
297 
298   /* determine the number of columns in each row */
299   ia[0] = oshift;
300   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
301     j   = aj + ai[row] + ishift;
302     col = *j++ + ishift;
303     i2  = tvc[col];
304     nz  = ai[row+1] - ai[row];
305     while (nz-- > 0) {           /* off-diagonal elemets */
306       /* ia[i1+1]++; */
307       ia[i2+1]++;
308       i2++;
309       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
310       if (nz > 0) i2 = tvc[col];
311     }
312   }
313 
314   /* shift ia[i] to point to next col */
315   for (i1=1; i1<nslim_col+1; i1++) {
316     col        = ia[i1-1];
317     ia[i1]    += col;
318     work[i1-1] = col - oshift;
319   }
320 
321   /* allocate space for column pointers */
322   nz   = ia[nslim_col] + (!ishift);
323   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
324   *jja = ja;
325 
326  /* loop over matrix putting into ja */
327   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
328     j   = aj + ai[row] + ishift;
329     i2  = 0;                     /* Col inode index */
330     col = *j++ + ishift;
331     i2  = tvc[col];
332     nz  = ai[row+1] - ai[row];
333     while (nz-- > 0) {
334       /* ja[work[i1]++] = i2 + oshift; */
335       ja[work[i2]++] = i1 + oshift;
336       i2++;
337       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
338       if (nz > 0) i2 = tvc[col];
339     }
340   }
341   ierr = PetscFree(ns_col);CHKERRQ(ierr);
342   ierr = PetscFree(work);CHKERRQ(ierr);
343   ierr = PetscFree(tns);CHKERRQ(ierr);
344   ierr = PetscFree(tvc);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatGetColumnIJ_Inode"
350 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
356   if (!ia) PetscFunctionReturn(0);
357 
358   if (symmetric) {
359     /* Since the indices are symmetric it does'nt matter */
360     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
361   } else {
362     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   }
364   PetscFunctionReturn(0);
365 }
366 
367 #undef __FUNCT__
368 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
369 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
370 {
371   PetscErrorCode ierr;
372 
373   PetscFunctionBegin;
374   if (!ia) PetscFunctionReturn(0);
375   ierr = PetscFree(*ia);CHKERRQ(ierr);
376   ierr = PetscFree(*ja);CHKERRQ(ierr);
377   PetscFunctionReturn(0);
378 }
379 
380 /* ----------------------------------------------------------- */
381 
382 #undef __FUNCT__
383 #define __FUNCT__ "MatMult_Inode"
384 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
385 {
386   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
387   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
388   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y;
389   PetscErrorCode ierr;
390   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
391 
392 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
393 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
394 #endif
395 
396   PetscFunctionBegin;
397   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
398   node_max = a->inode.node_count;
399   ns       = a->inode.size;     /* Node Size array */
400   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
401   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
402   idx  = a->j;
403   v1   = a->a;
404   ii   = a->i;
405 
406   for (i = 0,row = 0; i< node_max; ++i){
407     nsz  = ns[i];
408     n    = ii[1] - ii[0];
409     ii  += nsz;
410     sz   = n;                   /* No of non zeros in this row */
411                                 /* Switch on the size of Node */
412     switch (nsz){               /* Each loop in 'case' is unrolled */
413     case 1 :
414       sum1  = 0;
415 
416       for(n = 0; n< sz-1; n+=2) {
417         i1   = idx[0];          /* The instructions are ordered to */
418         i2   = idx[1];          /* make the compiler's job easy */
419         idx += 2;
420         tmp0 = x[i1];
421         tmp1 = x[i2];
422         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
423        }
424 
425       if (n == sz-1){          /* Take care of the last nonzero  */
426         tmp0  = x[*idx++];
427         sum1 += *v1++ * tmp0;
428       }
429       y[row++]=sum1;
430       break;
431     case 2:
432       sum1  = 0;
433       sum2  = 0;
434       v2    = v1 + n;
435 
436       for (n = 0; n< sz-1; n+=2) {
437         i1   = idx[0];
438         i2   = idx[1];
439         idx += 2;
440         tmp0 = x[i1];
441         tmp1 = x[i2];
442         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
443         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
444       }
445       if (n == sz-1){
446         tmp0  = x[*idx++];
447         sum1 += *v1++ * tmp0;
448         sum2 += *v2++ * tmp0;
449       }
450       y[row++]=sum1;
451       y[row++]=sum2;
452       v1      =v2;              /* Since the next block to be processed starts there*/
453       idx    +=sz;
454       break;
455     case 3:
456       sum1  = 0;
457       sum2  = 0;
458       sum3  = 0;
459       v2    = v1 + n;
460       v3    = v2 + n;
461 
462       for (n = 0; n< sz-1; n+=2) {
463         i1   = idx[0];
464         i2   = idx[1];
465         idx += 2;
466         tmp0 = x[i1];
467         tmp1 = x[i2];
468         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
469         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
470         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
471       }
472       if (n == sz-1){
473         tmp0  = x[*idx++];
474         sum1 += *v1++ * tmp0;
475         sum2 += *v2++ * tmp0;
476         sum3 += *v3++ * tmp0;
477       }
478       y[row++]=sum1;
479       y[row++]=sum2;
480       y[row++]=sum3;
481       v1       =v3;             /* Since the next block to be processed starts there*/
482       idx     +=2*sz;
483       break;
484     case 4:
485       sum1  = 0;
486       sum2  = 0;
487       sum3  = 0;
488       sum4  = 0;
489       v2    = v1 + n;
490       v3    = v2 + n;
491       v4    = v3 + n;
492 
493       for (n = 0; n< sz-1; n+=2) {
494         i1   = idx[0];
495         i2   = idx[1];
496         idx += 2;
497         tmp0 = x[i1];
498         tmp1 = x[i2];
499         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
500         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
501         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
502         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
503       }
504       if (n == sz-1){
505         tmp0  = x[*idx++];
506         sum1 += *v1++ * tmp0;
507         sum2 += *v2++ * tmp0;
508         sum3 += *v3++ * tmp0;
509         sum4 += *v4++ * tmp0;
510       }
511       y[row++]=sum1;
512       y[row++]=sum2;
513       y[row++]=sum3;
514       y[row++]=sum4;
515       v1      =v4;              /* Since the next block to be processed starts there*/
516       idx    +=3*sz;
517       break;
518     case 5:
519       sum1  = 0;
520       sum2  = 0;
521       sum3  = 0;
522       sum4  = 0;
523       sum5  = 0;
524       v2    = v1 + n;
525       v3    = v2 + n;
526       v4    = v3 + n;
527       v5    = v4 + n;
528 
529       for (n = 0; n<sz-1; n+=2) {
530         i1   = idx[0];
531         i2   = idx[1];
532         idx += 2;
533         tmp0 = x[i1];
534         tmp1 = x[i2];
535         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
536         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
537         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
538         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
539         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
540       }
541       if (n == sz-1){
542         tmp0  = x[*idx++];
543         sum1 += *v1++ * tmp0;
544         sum2 += *v2++ * tmp0;
545         sum3 += *v3++ * tmp0;
546         sum4 += *v4++ * tmp0;
547         sum5 += *v5++ * tmp0;
548       }
549       y[row++]=sum1;
550       y[row++]=sum2;
551       y[row++]=sum3;
552       y[row++]=sum4;
553       y[row++]=sum5;
554       v1      =v5;       /* Since the next block to be processed starts there */
555       idx    +=4*sz;
556       break;
557     default :
558       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
559     }
560   }
561   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
562   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
563   ierr = PetscLogFlops(2*a->nz - A->rmap.n);CHKERRQ(ierr);
564   PetscFunctionReturn(0);
565 }
566 /* ----------------------------------------------------------- */
567 /* Almost same code as the MatMult_Inode() */
568 #undef __FUNCT__
569 #define __FUNCT__ "MatMultAdd_Inode"
570 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
571 {
572   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
573   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
574   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
575   PetscErrorCode ierr;
576   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
577 
578   PetscFunctionBegin;
579   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
580   node_max = a->inode.node_count;
581   ns       = a->inode.size;     /* Node Size array */
582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
584   if (zz != yy) {
585     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
586   } else {
587     z = y;
588   }
589   zt = z;
590 
591   idx  = a->j;
592   v1   = a->a;
593   ii   = a->i;
594 
595   for (i = 0,row = 0; i< node_max; ++i){
596     nsz  = ns[i];
597     n    = ii[1] - ii[0];
598     ii  += nsz;
599     sz   = n;                   /* No of non zeros in this row */
600                                 /* Switch on the size of Node */
601     switch (nsz){               /* Each loop in 'case' is unrolled */
602     case 1 :
603       sum1  = *zt++;
604 
605       for(n = 0; n< sz-1; n+=2) {
606         i1   = idx[0];          /* The instructions are ordered to */
607         i2   = idx[1];          /* make the compiler's job easy */
608         idx += 2;
609         tmp0 = x[i1];
610         tmp1 = x[i2];
611         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
612        }
613 
614       if(n   == sz-1){          /* Take care of the last nonzero  */
615         tmp0  = x[*idx++];
616         sum1 += *v1++ * tmp0;
617       }
618       y[row++]=sum1;
619       break;
620     case 2:
621       sum1  = *zt++;
622       sum2  = *zt++;
623       v2    = v1 + n;
624 
625       for(n = 0; n< sz-1; n+=2) {
626         i1   = idx[0];
627         i2   = idx[1];
628         idx += 2;
629         tmp0 = x[i1];
630         tmp1 = x[i2];
631         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
632         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
633       }
634       if(n   == sz-1){
635         tmp0  = x[*idx++];
636         sum1 += *v1++ * tmp0;
637         sum2 += *v2++ * tmp0;
638       }
639       y[row++]=sum1;
640       y[row++]=sum2;
641       v1      =v2;              /* Since the next block to be processed starts there*/
642       idx    +=sz;
643       break;
644     case 3:
645       sum1  = *zt++;
646       sum2  = *zt++;
647       sum3  = *zt++;
648       v2    = v1 + n;
649       v3    = v2 + n;
650 
651       for (n = 0; n< sz-1; n+=2) {
652         i1   = idx[0];
653         i2   = idx[1];
654         idx += 2;
655         tmp0 = x[i1];
656         tmp1 = x[i2];
657         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
658         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
659         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
660       }
661       if (n == sz-1){
662         tmp0  = x[*idx++];
663         sum1 += *v1++ * tmp0;
664         sum2 += *v2++ * tmp0;
665         sum3 += *v3++ * tmp0;
666       }
667       y[row++]=sum1;
668       y[row++]=sum2;
669       y[row++]=sum3;
670       v1       =v3;             /* Since the next block to be processed starts there*/
671       idx     +=2*sz;
672       break;
673     case 4:
674       sum1  = *zt++;
675       sum2  = *zt++;
676       sum3  = *zt++;
677       sum4  = *zt++;
678       v2    = v1 + n;
679       v3    = v2 + n;
680       v4    = v3 + n;
681 
682       for (n = 0; n< sz-1; n+=2) {
683         i1   = idx[0];
684         i2   = idx[1];
685         idx += 2;
686         tmp0 = x[i1];
687         tmp1 = x[i2];
688         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
689         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
690         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
691         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
692       }
693       if (n == sz-1){
694         tmp0  = x[*idx++];
695         sum1 += *v1++ * tmp0;
696         sum2 += *v2++ * tmp0;
697         sum3 += *v3++ * tmp0;
698         sum4 += *v4++ * tmp0;
699       }
700       y[row++]=sum1;
701       y[row++]=sum2;
702       y[row++]=sum3;
703       y[row++]=sum4;
704       v1      =v4;              /* Since the next block to be processed starts there*/
705       idx    +=3*sz;
706       break;
707     case 5:
708       sum1  = *zt++;
709       sum2  = *zt++;
710       sum3  = *zt++;
711       sum4  = *zt++;
712       sum5  = *zt++;
713       v2    = v1 + n;
714       v3    = v2 + n;
715       v4    = v3 + n;
716       v5    = v4 + n;
717 
718       for (n = 0; n<sz-1; n+=2) {
719         i1   = idx[0];
720         i2   = idx[1];
721         idx += 2;
722         tmp0 = x[i1];
723         tmp1 = x[i2];
724         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
725         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
726         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
727         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
728         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
729       }
730       if(n   == sz-1){
731         tmp0  = x[*idx++];
732         sum1 += *v1++ * tmp0;
733         sum2 += *v2++ * tmp0;
734         sum3 += *v3++ * tmp0;
735         sum4 += *v4++ * tmp0;
736         sum5 += *v5++ * tmp0;
737       }
738       y[row++]=sum1;
739       y[row++]=sum2;
740       y[row++]=sum3;
741       y[row++]=sum4;
742       y[row++]=sum5;
743       v1      =v5;       /* Since the next block to be processed starts there */
744       idx    +=4*sz;
745       break;
746     default :
747       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
748     }
749   }
750   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
751   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
752   if (zz != yy) {
753     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
754   }
755   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
756   PetscFunctionReturn(0);
757 }
758 
759 /* ----------------------------------------------------------- */
760 #undef __FUNCT__
761 #define __FUNCT__ "MatSolve_Inode"
762 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
763 {
764   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
765   IS             iscol = a->col,isrow = a->row;
766   PetscErrorCode ierr;
767   PetscInt       *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
768   PetscInt       node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
769   PetscScalar    *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
770   PetscScalar    sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
771 
772   PetscFunctionBegin;
773   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
774   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
775   node_max = a->inode.node_count;
776   ns       = a->inode.size;     /* Node Size array */
777 
778   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
779   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
780   tmp  = a->solve_work;
781 
782   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
783   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
784 
785   /* forward solve the lower triangular */
786   tmps = tmp ;
787   aa   = a_a ;
788   aj   = a_j ;
789   ad   = a->diag;
790 
791   for (i = 0,row = 0; i< node_max; ++i){
792     nsz = ns[i];
793     aii = ai[row];
794     v1  = aa + aii;
795     vi  = aj + aii;
796     nz  = ad[row]- aii;
797 
798     switch (nsz){               /* Each loop in 'case' is unrolled */
799     case 1 :
800       sum1 = b[*r++];
801       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
802       for(j=0; j<nz-1; j+=2){
803         i0   = vi[0];
804         i1   = vi[1];
805         vi  +=2;
806         tmp0 = tmps[i0];
807         tmp1 = tmps[i1];
808         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
809       }
810       if(j == nz-1){
811         tmp0 = tmps[*vi++];
812         sum1 -= *v1++ *tmp0;
813       }
814       tmp[row ++]=sum1;
815       break;
816     case 2:
817       sum1 = b[*r++];
818       sum2 = b[*r++];
819       v2   = aa + ai[row+1];
820 
821       for(j=0; j<nz-1; j+=2){
822         i0   = vi[0];
823         i1   = vi[1];
824         vi  +=2;
825         tmp0 = tmps[i0];
826         tmp1 = tmps[i1];
827         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
829       }
830       if(j == nz-1){
831         tmp0 = tmps[*vi++];
832         sum1 -= *v1++ *tmp0;
833         sum2 -= *v2++ *tmp0;
834       }
835       sum2 -= *v2++ * sum1;
836       tmp[row ++]=sum1;
837       tmp[row ++]=sum2;
838       break;
839     case 3:
840       sum1 = b[*r++];
841       sum2 = b[*r++];
842       sum3 = b[*r++];
843       v2   = aa + ai[row+1];
844       v3   = aa + ai[row+2];
845 
846       for (j=0; j<nz-1; j+=2){
847         i0   = vi[0];
848         i1   = vi[1];
849         vi  +=2;
850         tmp0 = tmps[i0];
851         tmp1 = tmps[i1];
852         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
853         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
854         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
855       }
856       if (j == nz-1){
857         tmp0 = tmps[*vi++];
858         sum1 -= *v1++ *tmp0;
859         sum2 -= *v2++ *tmp0;
860         sum3 -= *v3++ *tmp0;
861       }
862       sum2 -= *v2++ * sum1;
863       sum3 -= *v3++ * sum1;
864       sum3 -= *v3++ * sum2;
865       tmp[row ++]=sum1;
866       tmp[row ++]=sum2;
867       tmp[row ++]=sum3;
868       break;
869 
870     case 4:
871       sum1 = b[*r++];
872       sum2 = b[*r++];
873       sum3 = b[*r++];
874       sum4 = b[*r++];
875       v2   = aa + ai[row+1];
876       v3   = aa + ai[row+2];
877       v4   = aa + ai[row+3];
878 
879       for (j=0; j<nz-1; j+=2){
880         i0   = vi[0];
881         i1   = vi[1];
882         vi  +=2;
883         tmp0 = tmps[i0];
884         tmp1 = tmps[i1];
885         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
886         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
887         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
888         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
889       }
890       if (j == nz-1){
891         tmp0 = tmps[*vi++];
892         sum1 -= *v1++ *tmp0;
893         sum2 -= *v2++ *tmp0;
894         sum3 -= *v3++ *tmp0;
895         sum4 -= *v4++ *tmp0;
896       }
897       sum2 -= *v2++ * sum1;
898       sum3 -= *v3++ * sum1;
899       sum4 -= *v4++ * sum1;
900       sum3 -= *v3++ * sum2;
901       sum4 -= *v4++ * sum2;
902       sum4 -= *v4++ * sum3;
903 
904       tmp[row ++]=sum1;
905       tmp[row ++]=sum2;
906       tmp[row ++]=sum3;
907       tmp[row ++]=sum4;
908       break;
909     case 5:
910       sum1 = b[*r++];
911       sum2 = b[*r++];
912       sum3 = b[*r++];
913       sum4 = b[*r++];
914       sum5 = b[*r++];
915       v2   = aa + ai[row+1];
916       v3   = aa + ai[row+2];
917       v4   = aa + ai[row+3];
918       v5   = aa + ai[row+4];
919 
920       for (j=0; j<nz-1; j+=2){
921         i0   = vi[0];
922         i1   = vi[1];
923         vi  +=2;
924         tmp0 = tmps[i0];
925         tmp1 = tmps[i1];
926         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
927         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
928         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
929         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
930         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
931       }
932       if (j == nz-1){
933         tmp0 = tmps[*vi++];
934         sum1 -= *v1++ *tmp0;
935         sum2 -= *v2++ *tmp0;
936         sum3 -= *v3++ *tmp0;
937         sum4 -= *v4++ *tmp0;
938         sum5 -= *v5++ *tmp0;
939       }
940 
941       sum2 -= *v2++ * sum1;
942       sum3 -= *v3++ * sum1;
943       sum4 -= *v4++ * sum1;
944       sum5 -= *v5++ * sum1;
945       sum3 -= *v3++ * sum2;
946       sum4 -= *v4++ * sum2;
947       sum5 -= *v5++ * sum2;
948       sum4 -= *v4++ * sum3;
949       sum5 -= *v5++ * sum3;
950       sum5 -= *v5++ * sum4;
951 
952       tmp[row ++]=sum1;
953       tmp[row ++]=sum2;
954       tmp[row ++]=sum3;
955       tmp[row ++]=sum4;
956       tmp[row ++]=sum5;
957       break;
958     default:
959       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
960     }
961   }
962   /* backward solve the upper triangular */
963   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
964     nsz = ns[i];
965     aii = ai[row+1] -1;
966     v1  = aa + aii;
967     vi  = aj + aii;
968     nz  = aii- ad[row];
969     switch (nsz){               /* Each loop in 'case' is unrolled */
970     case 1 :
971       sum1 = tmp[row];
972 
973       for(j=nz ; j>1; j-=2){
974         vi  -=2;
975         i0   = vi[2];
976         i1   = vi[1];
977         tmp0 = tmps[i0];
978         tmp1 = tmps[i1];
979         v1   -= 2;
980         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
981       }
982       if (j==1){
983         tmp0  = tmps[*vi--];
984         sum1 -= *v1-- * tmp0;
985       }
986       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
987       break;
988     case 2 :
989       sum1 = tmp[row];
990       sum2 = tmp[row -1];
991       v2   = aa + ai[row]-1;
992       for (j=nz ; j>1; j-=2){
993         vi  -=2;
994         i0   = vi[2];
995         i1   = vi[1];
996         tmp0 = tmps[i0];
997         tmp1 = tmps[i1];
998         v1   -= 2;
999         v2   -= 2;
1000         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1001         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1002       }
1003       if (j==1){
1004         tmp0  = tmps[*vi--];
1005         sum1 -= *v1-- * tmp0;
1006         sum2 -= *v2-- * tmp0;
1007       }
1008 
1009       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1010       sum2   -= *v2-- * tmp0;
1011       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1012       break;
1013     case 3 :
1014       sum1 = tmp[row];
1015       sum2 = tmp[row -1];
1016       sum3 = tmp[row -2];
1017       v2   = aa + ai[row]-1;
1018       v3   = aa + ai[row -1]-1;
1019       for (j=nz ; j>1; j-=2){
1020         vi  -=2;
1021         i0   = vi[2];
1022         i1   = vi[1];
1023         tmp0 = tmps[i0];
1024         tmp1 = tmps[i1];
1025         v1   -= 2;
1026         v2   -= 2;
1027         v3   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1031       }
1032       if (j==1){
1033         tmp0  = tmps[*vi--];
1034         sum1 -= *v1-- * tmp0;
1035         sum2 -= *v2-- * tmp0;
1036         sum3 -= *v3-- * tmp0;
1037       }
1038       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1039       sum2   -= *v2-- * tmp0;
1040       sum3   -= *v3-- * tmp0;
1041       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1042       sum3   -= *v3-- * tmp0;
1043       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1044 
1045       break;
1046     case 4 :
1047       sum1 = tmp[row];
1048       sum2 = tmp[row -1];
1049       sum3 = tmp[row -2];
1050       sum4 = tmp[row -3];
1051       v2   = aa + ai[row]-1;
1052       v3   = aa + ai[row -1]-1;
1053       v4   = aa + ai[row -2]-1;
1054 
1055       for (j=nz ; j>1; j-=2){
1056         vi  -=2;
1057         i0   = vi[2];
1058         i1   = vi[1];
1059         tmp0 = tmps[i0];
1060         tmp1 = tmps[i1];
1061         v1  -= 2;
1062         v2  -= 2;
1063         v3  -= 2;
1064         v4  -= 2;
1065         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1066         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1067         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1068         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1069       }
1070       if (j==1){
1071         tmp0  = tmps[*vi--];
1072         sum1 -= *v1-- * tmp0;
1073         sum2 -= *v2-- * tmp0;
1074         sum3 -= *v3-- * tmp0;
1075         sum4 -= *v4-- * tmp0;
1076       }
1077 
1078       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1079       sum2   -= *v2-- * tmp0;
1080       sum3   -= *v3-- * tmp0;
1081       sum4   -= *v4-- * tmp0;
1082       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1083       sum3   -= *v3-- * tmp0;
1084       sum4   -= *v4-- * tmp0;
1085       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1086       sum4   -= *v4-- * tmp0;
1087       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1088       break;
1089     case 5 :
1090       sum1 = tmp[row];
1091       sum2 = tmp[row -1];
1092       sum3 = tmp[row -2];
1093       sum4 = tmp[row -3];
1094       sum5 = tmp[row -4];
1095       v2   = aa + ai[row]-1;
1096       v3   = aa + ai[row -1]-1;
1097       v4   = aa + ai[row -2]-1;
1098       v5   = aa + ai[row -3]-1;
1099       for (j=nz ; j>1; j-=2){
1100         vi  -= 2;
1101         i0   = vi[2];
1102         i1   = vi[1];
1103         tmp0 = tmps[i0];
1104         tmp1 = tmps[i1];
1105         v1   -= 2;
1106         v2   -= 2;
1107         v3   -= 2;
1108         v4   -= 2;
1109         v5   -= 2;
1110         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1111         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1112         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1113         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1114         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1115       }
1116       if (j==1){
1117         tmp0  = tmps[*vi--];
1118         sum1 -= *v1-- * tmp0;
1119         sum2 -= *v2-- * tmp0;
1120         sum3 -= *v3-- * tmp0;
1121         sum4 -= *v4-- * tmp0;
1122         sum5 -= *v5-- * tmp0;
1123       }
1124 
1125       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1126       sum2   -= *v2-- * tmp0;
1127       sum3   -= *v3-- * tmp0;
1128       sum4   -= *v4-- * tmp0;
1129       sum5   -= *v5-- * tmp0;
1130       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1131       sum3   -= *v3-- * tmp0;
1132       sum4   -= *v4-- * tmp0;
1133       sum5   -= *v5-- * tmp0;
1134       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1135       sum4   -= *v4-- * tmp0;
1136       sum5   -= *v5-- * tmp0;
1137       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1138       sum5   -= *v5-- * tmp0;
1139       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1140       break;
1141     default:
1142       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1143     }
1144   }
1145   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1146   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1147   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1148   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1149   ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr);
1150   PetscFunctionReturn(0);
1151 }
1152 
1153 #undef __FUNCT__
1154 #define __FUNCT__ "MatLUFactorNumeric_Inode"
1155 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1156 {
1157   Mat            C = *B;
1158   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1159   IS             iscol = b->col,isrow = b->row,isicol = b->icol;
1160   PetscErrorCode ierr;
1161   PetscInt       *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1162   PetscInt       *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1163   PetscInt       *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1164   PetscInt       *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1165   PetscScalar    *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1166   PetscScalar    tmp,*ba = b->a,*aa = a->a,*pv;
1167   PetscReal      rs=0.0;
1168   LUShift_Ctx    sctx;
1169   PetscInt       newshift;
1170 
1171   PetscFunctionBegin;
1172   sctx.shift_top  = 0;
1173   sctx.nshift_max = 0;
1174   sctx.shift_lo   = 0;
1175   sctx.shift_hi   = 0;
1176 
1177   /* if both shift schemes are chosen by user, only use info->shiftpd */
1178   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1179   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1180     sctx.shift_top = 0;
1181     for (i=0; i<n; i++) {
1182       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1183       rs    = 0.0;
1184       ajtmp = aj + ai[i];
1185       rtmp1 = aa + ai[i];
1186       nz = ai[i+1] - ai[i];
1187       for (j=0; j<nz; j++){
1188         if (*ajtmp != i){
1189           rs += PetscAbsScalar(*rtmp1++);
1190         } else {
1191           rs -= PetscRealPart(*rtmp1++);
1192         }
1193         ajtmp++;
1194       }
1195       if (rs>sctx.shift_top) sctx.shift_top = rs;
1196     }
1197     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1198     sctx.shift_top *= 1.1;
1199     sctx.nshift_max = 5;
1200     sctx.shift_lo   = 0.;
1201     sctx.shift_hi   = 1.;
1202   }
1203   sctx.shift_amount = 0;
1204   sctx.nshift       = 0;
1205 
1206   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1207   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1208   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1209   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1210   ierr  = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1211   ics   = ic ;
1212   rtmp2 = rtmp1 + n;
1213   rtmp3 = rtmp2 + n;
1214 
1215   node_max = a->inode.node_count;
1216   ns       = a->inode.size ;
1217   if (!ns){
1218     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1219   }
1220 
1221   /* If max inode size > 3, split it into two inodes.*/
1222   /* also map the inode sizes according to the ordering */
1223   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1224   for (i=0,j=0; i<node_max; ++i,++j){
1225     if (ns[i]>3) {
1226       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1227       ++j;
1228       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1229     } else {
1230       tmp_vec1[j] = ns[i];
1231     }
1232   }
1233   /* Use the correct node_max */
1234   node_max = j;
1235 
1236   /* Now reorder the inode info based on mat re-ordering info */
1237   /* First create a row -> inode_size_array_index map */
1238   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1239   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1240   for (i=0,row=0; i<node_max; i++) {
1241     nodesz = tmp_vec1[i];
1242     for (j=0; j<nodesz; j++,row++) {
1243       nsmap[row] = i;
1244     }
1245   }
1246   /* Using nsmap, create a reordered ns structure */
1247   for (i=0,j=0; i< node_max; i++) {
1248     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1249     tmp_vec2[i]  = nodesz;
1250     j           += nodesz;
1251   }
1252   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1253   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1254   /* Now use the correct ns */
1255   ns = tmp_vec2;
1256 
1257   do {
1258     sctx.lushift = PETSC_FALSE;
1259     /* Now loop over each block-row, and do the factorization */
1260     for (i=0,row=0; i<node_max; i++) {
1261       nodesz = ns[i];
1262       nz     = bi[row+1] - bi[row];
1263       bjtmp  = bj + bi[row];
1264 
1265       switch (nodesz){
1266       case 1:
1267         for  (j=0; j<nz; j++){
1268           idx        = bjtmp[j];
1269           rtmp1[idx] = 0.0;
1270         }
1271 
1272         /* load in initial (unfactored row) */
1273         idx    = r[row];
1274         nz_tmp = ai[idx+1] - ai[idx];
1275         ajtmp  = aj + ai[idx];
1276         v1     = aa + ai[idx];
1277 
1278         for (j=0; j<nz_tmp; j++) {
1279           idx        = ics[ajtmp[j]];
1280           rtmp1[idx] = v1[j];
1281         }
1282         rtmp1[ics[r[row]]] += sctx.shift_amount;
1283 
1284         prow = *bjtmp++ ;
1285         while (prow < row) {
1286           pc1 = rtmp1 + prow;
1287           if (*pc1 != 0.0){
1288             pv   = ba + bd[prow];
1289             pj   = nbj + bd[prow];
1290             mul1 = *pc1 * *pv++;
1291             *pc1 = mul1;
1292             nz_tmp = bi[prow+1] - bd[prow] - 1;
1293             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1294             for (j=0; j<nz_tmp; j++) {
1295               tmp = pv[j];
1296               idx = pj[j];
1297               rtmp1[idx] -= mul1 * tmp;
1298             }
1299           }
1300           prow = *bjtmp++ ;
1301         }
1302         pj  = bj + bi[row];
1303         pc1 = ba + bi[row];
1304 
1305         sctx.pv    = rtmp1[row];
1306         rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */
1307         rs         = 0.0;
1308         for (j=0; j<nz; j++) {
1309           idx    = pj[j];
1310           pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */
1311           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1312         }
1313         sctx.rs  = rs;
1314         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1315         if (newshift == 1) goto endofwhile;
1316         break;
1317 
1318       case 2:
1319         for (j=0; j<nz; j++) {
1320           idx        = bjtmp[j];
1321           rtmp1[idx] = 0.0;
1322           rtmp2[idx] = 0.0;
1323         }
1324 
1325         /* load in initial (unfactored row) */
1326         idx    = r[row];
1327         nz_tmp = ai[idx+1] - ai[idx];
1328         ajtmp  = aj + ai[idx];
1329         v1     = aa + ai[idx];
1330         v2     = aa + ai[idx+1];
1331         for (j=0; j<nz_tmp; j++) {
1332           idx        = ics[ajtmp[j]];
1333           rtmp1[idx] = v1[j];
1334           rtmp2[idx] = v2[j];
1335         }
1336         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1337         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1338 
1339         prow = *bjtmp++ ;
1340         while (prow < row) {
1341           pc1 = rtmp1 + prow;
1342           pc2 = rtmp2 + prow;
1343           if (*pc1 != 0.0 || *pc2 != 0.0){
1344             pv   = ba + bd[prow];
1345             pj   = nbj + bd[prow];
1346             mul1 = *pc1 * *pv;
1347             mul2 = *pc2 * *pv;
1348             ++pv;
1349             *pc1 = mul1;
1350             *pc2 = mul2;
1351 
1352             nz_tmp = bi[prow+1] - bd[prow] - 1;
1353             for (j=0; j<nz_tmp; j++) {
1354               tmp = pv[j];
1355               idx = pj[j];
1356               rtmp1[idx] -= mul1 * tmp;
1357               rtmp2[idx] -= mul2 * tmp;
1358             }
1359             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1360           }
1361           prow = *bjtmp++ ;
1362         }
1363 
1364         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1365         pc1 = rtmp1 + prow;
1366         pc2 = rtmp2 + prow;
1367 
1368         sctx.pv = *pc1;
1369         pj      = bj + bi[prow];
1370         rs      = 0.0;
1371         for (j=0; j<nz; j++){
1372           idx = pj[j];
1373           if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]);
1374         }
1375         sctx.rs = rs;
1376         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1377         if (newshift == 1) goto endofwhile;
1378 
1379         if (*pc2 != 0.0){
1380           pj     = nbj + bd[prow];
1381           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1382           *pc2   = mul2;
1383           nz_tmp = bi[prow+1] - bd[prow] - 1;
1384           for (j=0; j<nz_tmp; j++) {
1385             idx = pj[j] ;
1386             tmp = rtmp1[idx];
1387             rtmp2[idx] -= mul2 * tmp;
1388           }
1389           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1390         }
1391 
1392         pj  = bj + bi[row];
1393         pc1 = ba + bi[row];
1394         pc2 = ba + bi[row+1];
1395 
1396         sctx.pv = rtmp2[row+1];
1397         rs = 0.0;
1398         rtmp1[row]   = 1.0/rtmp1[row];
1399         rtmp2[row+1] = 1.0/rtmp2[row+1];
1400         /* copy row entries from dense representation to sparse */
1401         for (j=0; j<nz; j++) {
1402           idx    = pj[j];
1403           pc1[j] = rtmp1[idx];
1404           pc2[j] = rtmp2[idx];
1405           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1406         }
1407         sctx.rs = rs;
1408         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1409         if (newshift == 1) goto endofwhile;
1410         break;
1411 
1412       case 3:
1413         for  (j=0; j<nz; j++) {
1414           idx        = bjtmp[j];
1415           rtmp1[idx] = 0.0;
1416           rtmp2[idx] = 0.0;
1417           rtmp3[idx] = 0.0;
1418         }
1419         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1420         idx    = r[row];
1421         nz_tmp = ai[idx+1] - ai[idx];
1422         ajtmp = aj + ai[idx];
1423         v1    = aa + ai[idx];
1424         v2    = aa + ai[idx+1];
1425         v3    = aa + ai[idx+2];
1426         for (j=0; j<nz_tmp; j++) {
1427           idx        = ics[ajtmp[j]];
1428           rtmp1[idx] = v1[j];
1429           rtmp2[idx] = v2[j];
1430           rtmp3[idx] = v3[j];
1431         }
1432         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1433         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1434         rtmp3[ics[r[row+2]]] += sctx.shift_amount;
1435 
1436         /* loop over all pivot row blocks above this row block */
1437         prow = *bjtmp++ ;
1438         while (prow < row) {
1439           pc1 = rtmp1 + prow;
1440           pc2 = rtmp2 + prow;
1441           pc3 = rtmp3 + prow;
1442           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1443             pv   = ba  + bd[prow];
1444             pj   = nbj + bd[prow];
1445             mul1 = *pc1 * *pv;
1446             mul2 = *pc2 * *pv;
1447             mul3 = *pc3 * *pv;
1448             ++pv;
1449             *pc1 = mul1;
1450             *pc2 = mul2;
1451             *pc3 = mul3;
1452 
1453             nz_tmp = bi[prow+1] - bd[prow] - 1;
1454             /* update this row based on pivot row */
1455             for (j=0; j<nz_tmp; j++) {
1456               tmp = pv[j];
1457               idx = pj[j];
1458               rtmp1[idx] -= mul1 * tmp;
1459               rtmp2[idx] -= mul2 * tmp;
1460               rtmp3[idx] -= mul3 * tmp;
1461             }
1462             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1463           }
1464           prow = *bjtmp++ ;
1465         }
1466 
1467         /* Now take care of diagonal 3x3 block in this set of rows */
1468         /* note: prow = row here */
1469         pc1 = rtmp1 + prow;
1470         pc2 = rtmp2 + prow;
1471         pc3 = rtmp3 + prow;
1472 
1473         sctx.pv = *pc1;
1474         pj      = bj + bi[prow];
1475         rs      = 0.0;
1476         for (j=0; j<nz; j++){
1477           idx = pj[j];
1478           if (idx != row) rs += PetscAbsScalar(rtmp1[idx]);
1479         }
1480         sctx.rs = rs;
1481         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1482         if (newshift == 1) goto endofwhile;
1483 
1484         if (*pc2 != 0.0 || *pc3 != 0.0){
1485           mul2 = (*pc2)/(*pc1);
1486           mul3 = (*pc3)/(*pc1);
1487           *pc2 = mul2;
1488           *pc3 = mul3;
1489           nz_tmp = bi[prow+1] - bd[prow] - 1;
1490           pj     = nbj + bd[prow];
1491           for (j=0; j<nz_tmp; j++) {
1492             idx = pj[j] ;
1493             tmp = rtmp1[idx];
1494             rtmp2[idx] -= mul2 * tmp;
1495             rtmp3[idx] -= mul3 * tmp;
1496           }
1497           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1498         }
1499         ++prow;
1500 
1501         pc2 = rtmp2 + prow;
1502         pc3 = rtmp3 + prow;
1503         sctx.pv = *pc2;
1504         pj      = bj + bi[prow];
1505         rs      = 0.0;
1506         for (j=0; j<nz; j++){
1507           idx = pj[j];
1508           if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1509         }
1510         sctx.rs = rs;
1511         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1512         if (newshift == 1) goto endofwhile;
1513 
1514         if (*pc3 != 0.0){
1515           mul3   = (*pc3)/(*pc2);
1516           *pc3   = mul3;
1517           pj     = nbj + bd[prow];
1518           nz_tmp = bi[prow+1] - bd[prow] - 1;
1519           for (j=0; j<nz_tmp; j++) {
1520             idx = pj[j] ;
1521             tmp = rtmp2[idx];
1522             rtmp3[idx] -= mul3 * tmp;
1523           }
1524           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1525         }
1526 
1527         pj  = bj + bi[row];
1528         pc1 = ba + bi[row];
1529         pc2 = ba + bi[row+1];
1530         pc3 = ba + bi[row+2];
1531 
1532         sctx.pv = rtmp3[row+2];
1533         rs = 0.0;
1534         rtmp1[row]   = 1.0/rtmp1[row];
1535         rtmp2[row+1] = 1.0/rtmp2[row+1];
1536         rtmp3[row+2] = 1.0/rtmp3[row+2];
1537         /* copy row entries from dense representation to sparse */
1538         for (j=0; j<nz; j++) {
1539           idx    = pj[j];
1540           pc1[j] = rtmp1[idx];
1541           pc2[j] = rtmp2[idx];
1542           pc3[j] = rtmp3[idx];
1543           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1544         }
1545 
1546         sctx.rs = rs;
1547         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1548         if (newshift == 1) goto endofwhile;
1549         break;
1550 
1551       default:
1552         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1553       }
1554       row += nodesz;                 /* Update the row */
1555     }
1556     endofwhile:;
1557   } while (sctx.lushift);
1558   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1559   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1560   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1561   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1562   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1563   C->factor      = FACTOR_LU;
1564   C->assembled   = PETSC_TRUE;
1565   if (sctx.nshift) {
1566     if (info->shiftnz) {
1567       ierr = PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1568     } else if (info->shiftpd) {
1569       ierr = PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1570     }
1571   }
1572   ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr);
1573   PetscFunctionReturn(0);
1574 }
1575 
1576 /*
1577      Makes a longer coloring[] array and calls the usual code with that
1578 */
1579 #undef __FUNCT__
1580 #define __FUNCT__ "MatColoringPatch_Inode"
1581 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1582 {
1583   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1584   PetscErrorCode  ierr;
1585   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1586   PetscInt        *colorused,i;
1587   ISColoringValue *newcolor;
1588 
1589   PetscFunctionBegin;
1590   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1591   /* loop over inodes, marking a color for each column*/
1592   row = 0;
1593   for (i=0; i<m; i++){
1594     for (j=0; j<ns[i]; j++) {
1595       newcolor[row++] = coloring[i] + j*ncolors;
1596     }
1597   }
1598 
1599   /* eliminate unneeded colors */
1600   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1601   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1602   for (i=0; i<n; i++) {
1603     colorused[newcolor[i]] = 1;
1604   }
1605 
1606   for (i=1; i<5*ncolors; i++) {
1607     colorused[i] += colorused[i-1];
1608   }
1609   ncolors = colorused[5*ncolors-1];
1610   for (i=0; i<n; i++) {
1611     newcolor[i] = colorused[newcolor[i]];
1612   }
1613   ierr = PetscFree(colorused);CHKERRQ(ierr);
1614   ierr = ISColoringCreate(mat->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1615   ierr = PetscFree(coloring);CHKERRQ(ierr);
1616   PetscFunctionReturn(0);
1617 }
1618 
1619 /*
1620     samestructure indicates that the matrix has not changed its nonzero structure so we
1621     do not need to recompute the inodes
1622 */
1623 #undef __FUNCT__
1624 #define __FUNCT__ "Mat_CheckInode"
1625 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
1626 {
1627   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1628   PetscErrorCode ierr;
1629   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
1630   PetscTruth     flag;
1631 
1632   PetscFunctionBegin;
1633   if (!a->inode.use)                     PetscFunctionReturn(0);
1634   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
1635 
1636 
1637   m = A->rmap.n;
1638   if (a->inode.size) {ns = a->inode.size;}
1639   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
1640 
1641   i          = 0;
1642   node_count = 0;
1643   idx        = a->j;
1644   ii         = a->i;
1645   while (i < m){                /* For each row */
1646     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
1647     /* Limits the number of elements in a node to 'a->inode.limit' */
1648     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
1649       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
1650       if (nzy != nzx) break;
1651       idy  += nzx;             /* Same nonzero pattern */
1652       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1653       if (!flag) break;
1654     }
1655     ns[node_count++] = blk_size;
1656     idx += blk_size*nzx;
1657     i    = j;
1658   }
1659   /* If not enough inodes found,, do not use inode version of the routines */
1660   if (!a->inode.size && m && node_count > 0.9*m) {
1661     ierr = PetscFree(ns);CHKERRQ(ierr);
1662     a->inode.node_count     = 0;
1663     a->inode.size           = PETSC_NULL;
1664     a->inode.use            = PETSC_FALSE;
1665     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
1666   } else {
1667     A->ops->mult            = MatMult_Inode;
1668     A->ops->multadd         = MatMultAdd_Inode;
1669     A->ops->solve           = MatSolve_Inode;
1670     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
1671     A->ops->getrowij        = MatGetRowIJ_Inode;
1672     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
1673     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
1674     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
1675     A->ops->coloringpatch   = MatColoringPatch_Inode;
1676     a->inode.node_count     = node_count;
1677     a->inode.size           = ns;
1678     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
1679   }
1680   PetscFunctionReturn(0);
1681 }
1682 
1683 /*
1684      This is really ugly. if inodes are used this replaces the
1685   permutations with ones that correspond to rows/cols of the matrix
1686   rather then inode blocks
1687 */
1688 #undef __FUNCT__
1689 #define __FUNCT__ "MatInodeAdjustForInodes"
1690 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1691 {
1692   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
1693 
1694   PetscFunctionBegin;
1695   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
1696   if (f) {
1697     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
1698   }
1699   PetscFunctionReturn(0);
1700 }
1701 
1702 EXTERN_C_BEGIN
1703 #undef __FUNCT__
1704 #define __FUNCT__ "MatAdjustForInodes_Inode"
1705 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
1706 {
1707   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
1708   PetscErrorCode ierr;
1709   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1710   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
1711   PetscInt       nslim_col,*ns_col;
1712   IS             ris = *rperm,cis = *cperm;
1713 
1714   PetscFunctionBegin;
1715   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
1716   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
1717 
1718   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
1719   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
1720   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
1721   permc = permr + m;
1722 
1723   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
1724   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
1725 
1726   /* Form the inode structure for the rows of permuted matric using inv perm*/
1727   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
1728 
1729   /* Construct the permutations for rows*/
1730   for (i=0,row = 0; i<nslim_row; ++i){
1731     indx      = ridx[i];
1732     start_val = tns[indx];
1733     end_val   = tns[indx + 1];
1734     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1735   }
1736 
1737   /* Form the inode structure for the columns of permuted matrix using inv perm*/
1738   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
1739 
1740  /* Construct permutations for columns */
1741   for (i=0,col=0; i<nslim_col; ++i){
1742     indx      = cidx[i];
1743     start_val = tns[indx];
1744     end_val   = tns[indx + 1];
1745     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1746   }
1747 
1748   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
1749   ISSetPermutation(*rperm);
1750   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
1751   ISSetPermutation(*cperm);
1752 
1753   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
1754   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
1755 
1756   ierr = PetscFree(ns_col);CHKERRQ(ierr);
1757   ierr = PetscFree(permr);CHKERRQ(ierr);
1758   ierr = ISDestroy(cis);CHKERRQ(ierr);
1759   ierr = ISDestroy(ris);CHKERRQ(ierr);
1760   ierr = PetscFree(tns);CHKERRQ(ierr);
1761   PetscFunctionReturn(0);
1762 }
1763 EXTERN_C_END
1764 
1765 #undef __FUNCT__
1766 #define __FUNCT__ "MatInodeGetInodeSizes"
1767 /*@C
1768    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
1769 
1770    Collective on Mat
1771 
1772    Input Parameter:
1773 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
1774 
1775    Output Parameter:
1776 +  node_count - no of inodes present in the matrix.
1777 .  sizes      - an array of size node_count,with sizes of each inode.
1778 -  limit      - the max size used to generate the inodes.
1779 
1780    Level: advanced
1781 
1782    Notes: This routine returns some internal storage information
1783    of the matrix, it is intended to be used by advanced users.
1784    It should be called after the matrix is assembled.
1785    The contents of the sizes[] array should not be changed.
1786    PETSC_NULL may be passed for information not requested.
1787 
1788 .keywords: matrix, seqaij, get, inode
1789 
1790 .seealso: MatGetInfo()
1791 @*/
1792 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1793 {
1794   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
1795 
1796   PetscFunctionBegin;
1797   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1798   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
1799   if (f) {
1800     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
1801   }
1802   PetscFunctionReturn(0);
1803 }
1804 
1805 EXTERN_C_BEGIN
1806 #undef __FUNCT__
1807 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
1808 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1809 {
1810   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1811 
1812   PetscFunctionBegin;
1813   if (node_count) *node_count = a->inode.node_count;
1814   if (sizes)      *sizes      = a->inode.size;
1815   if (limit)      *limit      = a->inode.limit;
1816   PetscFunctionReturn(0);
1817 }
1818 EXTERN_C_END
1819