xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 780a600e80509eba75ebd5b520fae1ebfeb16014)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap.n;
19   m      = A->rmap.n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap.n;
72   n         = A->cmap.n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap.n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap.n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_Inode"
397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y;
402   PetscErrorCode ierr;
403   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
404 
405 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
406 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
407 #endif
408 
409   PetscFunctionBegin;
410   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
411   node_max = a->inode.node_count;
412   ns       = a->inode.size;     /* Node Size array */
413   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
414   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
415   idx  = a->j;
416   v1   = a->a;
417   ii   = a->i;
418 
419   for (i = 0,row = 0; i< node_max; ++i){
420     nsz  = ns[i];
421     n    = ii[1] - ii[0];
422     ii  += nsz;
423     sz   = n;                   /* No of non zeros in this row */
424                                 /* Switch on the size of Node */
425     switch (nsz){               /* Each loop in 'case' is unrolled */
426     case 1 :
427       sum1  = 0;
428 
429       for(n = 0; n< sz-1; n+=2) {
430         i1   = idx[0];          /* The instructions are ordered to */
431         i2   = idx[1];          /* make the compiler's job easy */
432         idx += 2;
433         tmp0 = x[i1];
434         tmp1 = x[i2];
435         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
436        }
437 
438       if (n == sz-1){          /* Take care of the last nonzero  */
439         tmp0  = x[*idx++];
440         sum1 += *v1++ * tmp0;
441       }
442       y[row++]=sum1;
443       break;
444     case 2:
445       sum1  = 0;
446       sum2  = 0;
447       v2    = v1 + n;
448 
449       for (n = 0; n< sz-1; n+=2) {
450         i1   = idx[0];
451         i2   = idx[1];
452         idx += 2;
453         tmp0 = x[i1];
454         tmp1 = x[i2];
455         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
456         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
457       }
458       if (n == sz-1){
459         tmp0  = x[*idx++];
460         sum1 += *v1++ * tmp0;
461         sum2 += *v2++ * tmp0;
462       }
463       y[row++]=sum1;
464       y[row++]=sum2;
465       v1      =v2;              /* Since the next block to be processed starts there*/
466       idx    +=sz;
467       break;
468     case 3:
469       sum1  = 0;
470       sum2  = 0;
471       sum3  = 0;
472       v2    = v1 + n;
473       v3    = v2 + n;
474 
475       for (n = 0; n< sz-1; n+=2) {
476         i1   = idx[0];
477         i2   = idx[1];
478         idx += 2;
479         tmp0 = x[i1];
480         tmp1 = x[i2];
481         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
482         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
483         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
484       }
485       if (n == sz-1){
486         tmp0  = x[*idx++];
487         sum1 += *v1++ * tmp0;
488         sum2 += *v2++ * tmp0;
489         sum3 += *v3++ * tmp0;
490       }
491       y[row++]=sum1;
492       y[row++]=sum2;
493       y[row++]=sum3;
494       v1       =v3;             /* Since the next block to be processed starts there*/
495       idx     +=2*sz;
496       break;
497     case 4:
498       sum1  = 0;
499       sum2  = 0;
500       sum3  = 0;
501       sum4  = 0;
502       v2    = v1 + n;
503       v3    = v2 + n;
504       v4    = v3 + n;
505 
506       for (n = 0; n< sz-1; n+=2) {
507         i1   = idx[0];
508         i2   = idx[1];
509         idx += 2;
510         tmp0 = x[i1];
511         tmp1 = x[i2];
512         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
513         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
514         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
515         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
516       }
517       if (n == sz-1){
518         tmp0  = x[*idx++];
519         sum1 += *v1++ * tmp0;
520         sum2 += *v2++ * tmp0;
521         sum3 += *v3++ * tmp0;
522         sum4 += *v4++ * tmp0;
523       }
524       y[row++]=sum1;
525       y[row++]=sum2;
526       y[row++]=sum3;
527       y[row++]=sum4;
528       v1      =v4;              /* Since the next block to be processed starts there*/
529       idx    +=3*sz;
530       break;
531     case 5:
532       sum1  = 0;
533       sum2  = 0;
534       sum3  = 0;
535       sum4  = 0;
536       sum5  = 0;
537       v2    = v1 + n;
538       v3    = v2 + n;
539       v4    = v3 + n;
540       v5    = v4 + n;
541 
542       for (n = 0; n<sz-1; n+=2) {
543         i1   = idx[0];
544         i2   = idx[1];
545         idx += 2;
546         tmp0 = x[i1];
547         tmp1 = x[i2];
548         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
549         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
550         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
551         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
552         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
553       }
554       if (n == sz-1){
555         tmp0  = x[*idx++];
556         sum1 += *v1++ * tmp0;
557         sum2 += *v2++ * tmp0;
558         sum3 += *v3++ * tmp0;
559         sum4 += *v4++ * tmp0;
560         sum5 += *v5++ * tmp0;
561       }
562       y[row++]=sum1;
563       y[row++]=sum2;
564       y[row++]=sum3;
565       y[row++]=sum4;
566       y[row++]=sum5;
567       v1      =v5;       /* Since the next block to be processed starts there */
568       idx    +=4*sz;
569       break;
570     default :
571       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
572     }
573   }
574   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
575   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
576   ierr = PetscLogFlops(2*a->nz - A->rmap.n);CHKERRQ(ierr);
577   PetscFunctionReturn(0);
578 }
579 /* ----------------------------------------------------------- */
580 /* Almost same code as the MatMult_Inode() */
581 #undef __FUNCT__
582 #define __FUNCT__ "MatMultAdd_Inode"
583 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
584 {
585   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
586   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
587   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
588   PetscErrorCode ierr;
589   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
590 
591   PetscFunctionBegin;
592   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
593   node_max = a->inode.node_count;
594   ns       = a->inode.size;     /* Node Size array */
595   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
596   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
597   if (zz != yy) {
598     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
599   } else {
600     z = y;
601   }
602   zt = z;
603 
604   idx  = a->j;
605   v1   = a->a;
606   ii   = a->i;
607 
608   for (i = 0,row = 0; i< node_max; ++i){
609     nsz  = ns[i];
610     n    = ii[1] - ii[0];
611     ii  += nsz;
612     sz   = n;                   /* No of non zeros in this row */
613                                 /* Switch on the size of Node */
614     switch (nsz){               /* Each loop in 'case' is unrolled */
615     case 1 :
616       sum1  = *zt++;
617 
618       for(n = 0; n< sz-1; n+=2) {
619         i1   = idx[0];          /* The instructions are ordered to */
620         i2   = idx[1];          /* make the compiler's job easy */
621         idx += 2;
622         tmp0 = x[i1];
623         tmp1 = x[i2];
624         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
625        }
626 
627       if(n   == sz-1){          /* Take care of the last nonzero  */
628         tmp0  = x[*idx++];
629         sum1 += *v1++ * tmp0;
630       }
631       y[row++]=sum1;
632       break;
633     case 2:
634       sum1  = *zt++;
635       sum2  = *zt++;
636       v2    = v1 + n;
637 
638       for(n = 0; n< sz-1; n+=2) {
639         i1   = idx[0];
640         i2   = idx[1];
641         idx += 2;
642         tmp0 = x[i1];
643         tmp1 = x[i2];
644         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
645         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
646       }
647       if(n   == sz-1){
648         tmp0  = x[*idx++];
649         sum1 += *v1++ * tmp0;
650         sum2 += *v2++ * tmp0;
651       }
652       y[row++]=sum1;
653       y[row++]=sum2;
654       v1      =v2;              /* Since the next block to be processed starts there*/
655       idx    +=sz;
656       break;
657     case 3:
658       sum1  = *zt++;
659       sum2  = *zt++;
660       sum3  = *zt++;
661       v2    = v1 + n;
662       v3    = v2 + n;
663 
664       for (n = 0; n< sz-1; n+=2) {
665         i1   = idx[0];
666         i2   = idx[1];
667         idx += 2;
668         tmp0 = x[i1];
669         tmp1 = x[i2];
670         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
671         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
672         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
673       }
674       if (n == sz-1){
675         tmp0  = x[*idx++];
676         sum1 += *v1++ * tmp0;
677         sum2 += *v2++ * tmp0;
678         sum3 += *v3++ * tmp0;
679       }
680       y[row++]=sum1;
681       y[row++]=sum2;
682       y[row++]=sum3;
683       v1       =v3;             /* Since the next block to be processed starts there*/
684       idx     +=2*sz;
685       break;
686     case 4:
687       sum1  = *zt++;
688       sum2  = *zt++;
689       sum3  = *zt++;
690       sum4  = *zt++;
691       v2    = v1 + n;
692       v3    = v2 + n;
693       v4    = v3 + n;
694 
695       for (n = 0; n< sz-1; n+=2) {
696         i1   = idx[0];
697         i2   = idx[1];
698         idx += 2;
699         tmp0 = x[i1];
700         tmp1 = x[i2];
701         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
702         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
703         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
704         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
705       }
706       if (n == sz-1){
707         tmp0  = x[*idx++];
708         sum1 += *v1++ * tmp0;
709         sum2 += *v2++ * tmp0;
710         sum3 += *v3++ * tmp0;
711         sum4 += *v4++ * tmp0;
712       }
713       y[row++]=sum1;
714       y[row++]=sum2;
715       y[row++]=sum3;
716       y[row++]=sum4;
717       v1      =v4;              /* Since the next block to be processed starts there*/
718       idx    +=3*sz;
719       break;
720     case 5:
721       sum1  = *zt++;
722       sum2  = *zt++;
723       sum3  = *zt++;
724       sum4  = *zt++;
725       sum5  = *zt++;
726       v2    = v1 + n;
727       v3    = v2 + n;
728       v4    = v3 + n;
729       v5    = v4 + n;
730 
731       for (n = 0; n<sz-1; n+=2) {
732         i1   = idx[0];
733         i2   = idx[1];
734         idx += 2;
735         tmp0 = x[i1];
736         tmp1 = x[i2];
737         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
738         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
739         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
740         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
741         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
742       }
743       if(n   == sz-1){
744         tmp0  = x[*idx++];
745         sum1 += *v1++ * tmp0;
746         sum2 += *v2++ * tmp0;
747         sum3 += *v3++ * tmp0;
748         sum4 += *v4++ * tmp0;
749         sum5 += *v5++ * tmp0;
750       }
751       y[row++]=sum1;
752       y[row++]=sum2;
753       y[row++]=sum3;
754       y[row++]=sum4;
755       y[row++]=sum5;
756       v1      =v5;       /* Since the next block to be processed starts there */
757       idx    +=4*sz;
758       break;
759     default :
760       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
761     }
762   }
763   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
764   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
765   if (zz != yy) {
766     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
767   }
768   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
769   PetscFunctionReturn(0);
770 }
771 
772 /* ----------------------------------------------------------- */
773 #undef __FUNCT__
774 #define __FUNCT__ "MatSolve_Inode"
775 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
776 {
777   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
778   IS             iscol = a->col,isrow = a->row;
779   PetscErrorCode ierr;
780   PetscInt       *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
781   PetscInt       node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
782   PetscScalar    *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
783   PetscScalar    sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
784 
785   PetscFunctionBegin;
786   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
787   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
788   node_max = a->inode.node_count;
789   ns       = a->inode.size;     /* Node Size array */
790 
791   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
792   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
793   tmp  = a->solve_work;
794 
795   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
796   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
797 
798   /* forward solve the lower triangular */
799   tmps = tmp ;
800   aa   = a_a ;
801   aj   = a_j ;
802   ad   = a->diag;
803 
804   for (i = 0,row = 0; i< node_max; ++i){
805     nsz = ns[i];
806     aii = ai[row];
807     v1  = aa + aii;
808     vi  = aj + aii;
809     nz  = ad[row]- aii;
810 
811     switch (nsz){               /* Each loop in 'case' is unrolled */
812     case 1 :
813       sum1 = b[*r++];
814       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
815       for(j=0; j<nz-1; j+=2){
816         i0   = vi[0];
817         i1   = vi[1];
818         vi  +=2;
819         tmp0 = tmps[i0];
820         tmp1 = tmps[i1];
821         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
822       }
823       if(j == nz-1){
824         tmp0 = tmps[*vi++];
825         sum1 -= *v1++ *tmp0;
826       }
827       tmp[row ++]=sum1;
828       break;
829     case 2:
830       sum1 = b[*r++];
831       sum2 = b[*r++];
832       v2   = aa + ai[row+1];
833 
834       for(j=0; j<nz-1; j+=2){
835         i0   = vi[0];
836         i1   = vi[1];
837         vi  +=2;
838         tmp0 = tmps[i0];
839         tmp1 = tmps[i1];
840         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
841         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
842       }
843       if(j == nz-1){
844         tmp0 = tmps[*vi++];
845         sum1 -= *v1++ *tmp0;
846         sum2 -= *v2++ *tmp0;
847       }
848       sum2 -= *v2++ * sum1;
849       tmp[row ++]=sum1;
850       tmp[row ++]=sum2;
851       break;
852     case 3:
853       sum1 = b[*r++];
854       sum2 = b[*r++];
855       sum3 = b[*r++];
856       v2   = aa + ai[row+1];
857       v3   = aa + ai[row+2];
858 
859       for (j=0; j<nz-1; j+=2){
860         i0   = vi[0];
861         i1   = vi[1];
862         vi  +=2;
863         tmp0 = tmps[i0];
864         tmp1 = tmps[i1];
865         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
866         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
867         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
868       }
869       if (j == nz-1){
870         tmp0 = tmps[*vi++];
871         sum1 -= *v1++ *tmp0;
872         sum2 -= *v2++ *tmp0;
873         sum3 -= *v3++ *tmp0;
874       }
875       sum2 -= *v2++ * sum1;
876       sum3 -= *v3++ * sum1;
877       sum3 -= *v3++ * sum2;
878       tmp[row ++]=sum1;
879       tmp[row ++]=sum2;
880       tmp[row ++]=sum3;
881       break;
882 
883     case 4:
884       sum1 = b[*r++];
885       sum2 = b[*r++];
886       sum3 = b[*r++];
887       sum4 = b[*r++];
888       v2   = aa + ai[row+1];
889       v3   = aa + ai[row+2];
890       v4   = aa + ai[row+3];
891 
892       for (j=0; j<nz-1; j+=2){
893         i0   = vi[0];
894         i1   = vi[1];
895         vi  +=2;
896         tmp0 = tmps[i0];
897         tmp1 = tmps[i1];
898         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
899         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
900         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
901         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
902       }
903       if (j == nz-1){
904         tmp0 = tmps[*vi++];
905         sum1 -= *v1++ *tmp0;
906         sum2 -= *v2++ *tmp0;
907         sum3 -= *v3++ *tmp0;
908         sum4 -= *v4++ *tmp0;
909       }
910       sum2 -= *v2++ * sum1;
911       sum3 -= *v3++ * sum1;
912       sum4 -= *v4++ * sum1;
913       sum3 -= *v3++ * sum2;
914       sum4 -= *v4++ * sum2;
915       sum4 -= *v4++ * sum3;
916 
917       tmp[row ++]=sum1;
918       tmp[row ++]=sum2;
919       tmp[row ++]=sum3;
920       tmp[row ++]=sum4;
921       break;
922     case 5:
923       sum1 = b[*r++];
924       sum2 = b[*r++];
925       sum3 = b[*r++];
926       sum4 = b[*r++];
927       sum5 = b[*r++];
928       v2   = aa + ai[row+1];
929       v3   = aa + ai[row+2];
930       v4   = aa + ai[row+3];
931       v5   = aa + ai[row+4];
932 
933       for (j=0; j<nz-1; j+=2){
934         i0   = vi[0];
935         i1   = vi[1];
936         vi  +=2;
937         tmp0 = tmps[i0];
938         tmp1 = tmps[i1];
939         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
940         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
941         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
942         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
943         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
944       }
945       if (j == nz-1){
946         tmp0 = tmps[*vi++];
947         sum1 -= *v1++ *tmp0;
948         sum2 -= *v2++ *tmp0;
949         sum3 -= *v3++ *tmp0;
950         sum4 -= *v4++ *tmp0;
951         sum5 -= *v5++ *tmp0;
952       }
953 
954       sum2 -= *v2++ * sum1;
955       sum3 -= *v3++ * sum1;
956       sum4 -= *v4++ * sum1;
957       sum5 -= *v5++ * sum1;
958       sum3 -= *v3++ * sum2;
959       sum4 -= *v4++ * sum2;
960       sum5 -= *v5++ * sum2;
961       sum4 -= *v4++ * sum3;
962       sum5 -= *v5++ * sum3;
963       sum5 -= *v5++ * sum4;
964 
965       tmp[row ++]=sum1;
966       tmp[row ++]=sum2;
967       tmp[row ++]=sum3;
968       tmp[row ++]=sum4;
969       tmp[row ++]=sum5;
970       break;
971     default:
972       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
973     }
974   }
975   /* backward solve the upper triangular */
976   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
977     nsz = ns[i];
978     aii = ai[row+1] -1;
979     v1  = aa + aii;
980     vi  = aj + aii;
981     nz  = aii- ad[row];
982     switch (nsz){               /* Each loop in 'case' is unrolled */
983     case 1 :
984       sum1 = tmp[row];
985 
986       for(j=nz ; j>1; j-=2){
987         vi  -=2;
988         i0   = vi[2];
989         i1   = vi[1];
990         tmp0 = tmps[i0];
991         tmp1 = tmps[i1];
992         v1   -= 2;
993         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
994       }
995       if (j==1){
996         tmp0  = tmps[*vi--];
997         sum1 -= *v1-- * tmp0;
998       }
999       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1000       break;
1001     case 2 :
1002       sum1 = tmp[row];
1003       sum2 = tmp[row -1];
1004       v2   = aa + ai[row]-1;
1005       for (j=nz ; j>1; j-=2){
1006         vi  -=2;
1007         i0   = vi[2];
1008         i1   = vi[1];
1009         tmp0 = tmps[i0];
1010         tmp1 = tmps[i1];
1011         v1   -= 2;
1012         v2   -= 2;
1013         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1014         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1015       }
1016       if (j==1){
1017         tmp0  = tmps[*vi--];
1018         sum1 -= *v1-- * tmp0;
1019         sum2 -= *v2-- * tmp0;
1020       }
1021 
1022       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1023       sum2   -= *v2-- * tmp0;
1024       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1025       break;
1026     case 3 :
1027       sum1 = tmp[row];
1028       sum2 = tmp[row -1];
1029       sum3 = tmp[row -2];
1030       v2   = aa + ai[row]-1;
1031       v3   = aa + ai[row -1]-1;
1032       for (j=nz ; j>1; j-=2){
1033         vi  -=2;
1034         i0   = vi[2];
1035         i1   = vi[1];
1036         tmp0 = tmps[i0];
1037         tmp1 = tmps[i1];
1038         v1   -= 2;
1039         v2   -= 2;
1040         v3   -= 2;
1041         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1042         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1043         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1044       }
1045       if (j==1){
1046         tmp0  = tmps[*vi--];
1047         sum1 -= *v1-- * tmp0;
1048         sum2 -= *v2-- * tmp0;
1049         sum3 -= *v3-- * tmp0;
1050       }
1051       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1052       sum2   -= *v2-- * tmp0;
1053       sum3   -= *v3-- * tmp0;
1054       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1055       sum3   -= *v3-- * tmp0;
1056       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1057 
1058       break;
1059     case 4 :
1060       sum1 = tmp[row];
1061       sum2 = tmp[row -1];
1062       sum3 = tmp[row -2];
1063       sum4 = tmp[row -3];
1064       v2   = aa + ai[row]-1;
1065       v3   = aa + ai[row -1]-1;
1066       v4   = aa + ai[row -2]-1;
1067 
1068       for (j=nz ; j>1; j-=2){
1069         vi  -=2;
1070         i0   = vi[2];
1071         i1   = vi[1];
1072         tmp0 = tmps[i0];
1073         tmp1 = tmps[i1];
1074         v1  -= 2;
1075         v2  -= 2;
1076         v3  -= 2;
1077         v4  -= 2;
1078         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1079         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1080         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1081         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1082       }
1083       if (j==1){
1084         tmp0  = tmps[*vi--];
1085         sum1 -= *v1-- * tmp0;
1086         sum2 -= *v2-- * tmp0;
1087         sum3 -= *v3-- * tmp0;
1088         sum4 -= *v4-- * tmp0;
1089       }
1090 
1091       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1092       sum2   -= *v2-- * tmp0;
1093       sum3   -= *v3-- * tmp0;
1094       sum4   -= *v4-- * tmp0;
1095       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1096       sum3   -= *v3-- * tmp0;
1097       sum4   -= *v4-- * tmp0;
1098       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1099       sum4   -= *v4-- * tmp0;
1100       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1101       break;
1102     case 5 :
1103       sum1 = tmp[row];
1104       sum2 = tmp[row -1];
1105       sum3 = tmp[row -2];
1106       sum4 = tmp[row -3];
1107       sum5 = tmp[row -4];
1108       v2   = aa + ai[row]-1;
1109       v3   = aa + ai[row -1]-1;
1110       v4   = aa + ai[row -2]-1;
1111       v5   = aa + ai[row -3]-1;
1112       for (j=nz ; j>1; j-=2){
1113         vi  -= 2;
1114         i0   = vi[2];
1115         i1   = vi[1];
1116         tmp0 = tmps[i0];
1117         tmp1 = tmps[i1];
1118         v1   -= 2;
1119         v2   -= 2;
1120         v3   -= 2;
1121         v4   -= 2;
1122         v5   -= 2;
1123         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1124         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1125         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1126         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1127         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1128       }
1129       if (j==1){
1130         tmp0  = tmps[*vi--];
1131         sum1 -= *v1-- * tmp0;
1132         sum2 -= *v2-- * tmp0;
1133         sum3 -= *v3-- * tmp0;
1134         sum4 -= *v4-- * tmp0;
1135         sum5 -= *v5-- * tmp0;
1136       }
1137 
1138       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1139       sum2   -= *v2-- * tmp0;
1140       sum3   -= *v3-- * tmp0;
1141       sum4   -= *v4-- * tmp0;
1142       sum5   -= *v5-- * tmp0;
1143       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1144       sum3   -= *v3-- * tmp0;
1145       sum4   -= *v4-- * tmp0;
1146       sum5   -= *v5-- * tmp0;
1147       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1148       sum4   -= *v4-- * tmp0;
1149       sum5   -= *v5-- * tmp0;
1150       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1151       sum5   -= *v5-- * tmp0;
1152       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1153       break;
1154     default:
1155       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1156     }
1157   }
1158   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1159   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1160   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1161   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1162   ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr);
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 #undef __FUNCT__
1167 #define __FUNCT__ "MatLUFactorNumeric_Inode"
1168 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1169 {
1170   Mat            C = *B;
1171   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1172   IS             iscol = b->col,isrow = b->row,isicol = b->icol;
1173   PetscErrorCode ierr;
1174   PetscInt       *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1175   PetscInt       *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1176   PetscInt       *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1177   PetscInt       *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1178   PetscScalar    *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1179   PetscScalar    tmp,*ba = b->a,*aa = a->a,*pv;
1180   PetscReal      rs=0.0;
1181   LUShift_Ctx    sctx;
1182   PetscInt       newshift;
1183 
1184   PetscFunctionBegin;
1185   sctx.shift_top  = 0;
1186   sctx.nshift_max = 0;
1187   sctx.shift_lo   = 0;
1188   sctx.shift_hi   = 0;
1189 
1190   /* if both shift schemes are chosen by user, only use info->shiftpd */
1191   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1192   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1193     sctx.shift_top = 0;
1194     for (i=0; i<n; i++) {
1195       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1196       rs    = 0.0;
1197       ajtmp = aj + ai[i];
1198       rtmp1 = aa + ai[i];
1199       nz = ai[i+1] - ai[i];
1200       for (j=0; j<nz; j++){
1201         if (*ajtmp != i){
1202           rs += PetscAbsScalar(*rtmp1++);
1203         } else {
1204           rs -= PetscRealPart(*rtmp1++);
1205         }
1206         ajtmp++;
1207       }
1208       if (rs>sctx.shift_top) sctx.shift_top = rs;
1209     }
1210     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1211     sctx.shift_top *= 1.1;
1212     sctx.nshift_max = 5;
1213     sctx.shift_lo   = 0.;
1214     sctx.shift_hi   = 1.;
1215   }
1216   sctx.shift_amount = 0;
1217   sctx.nshift       = 0;
1218 
1219   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1220   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1221   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1222   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1223   ierr  = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1224   ics   = ic ;
1225   rtmp2 = rtmp1 + n;
1226   rtmp3 = rtmp2 + n;
1227 
1228   node_max = a->inode.node_count;
1229   ns       = a->inode.size ;
1230   if (!ns){
1231     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1232   }
1233 
1234   /* If max inode size > 3, split it into two inodes.*/
1235   /* also map the inode sizes according to the ordering */
1236   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1237   for (i=0,j=0; i<node_max; ++i,++j){
1238     if (ns[i]>3) {
1239       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1240       ++j;
1241       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1242     } else {
1243       tmp_vec1[j] = ns[i];
1244     }
1245   }
1246   /* Use the correct node_max */
1247   node_max = j;
1248 
1249   /* Now reorder the inode info based on mat re-ordering info */
1250   /* First create a row -> inode_size_array_index map */
1251   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1252   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1253   for (i=0,row=0; i<node_max; i++) {
1254     nodesz = tmp_vec1[i];
1255     for (j=0; j<nodesz; j++,row++) {
1256       nsmap[row] = i;
1257     }
1258   }
1259   /* Using nsmap, create a reordered ns structure */
1260   for (i=0,j=0; i< node_max; i++) {
1261     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1262     tmp_vec2[i]  = nodesz;
1263     j           += nodesz;
1264   }
1265   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1266   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1267   /* Now use the correct ns */
1268   ns = tmp_vec2;
1269 
1270   do {
1271     sctx.lushift = PETSC_FALSE;
1272     /* Now loop over each block-row, and do the factorization */
1273     for (i=0,row=0; i<node_max; i++) {
1274       nodesz = ns[i];
1275       nz     = bi[row+1] - bi[row];
1276       bjtmp  = bj + bi[row];
1277 
1278       switch (nodesz){
1279       case 1:
1280         for  (j=0; j<nz; j++){
1281           idx        = bjtmp[j];
1282           rtmp1[idx] = 0.0;
1283         }
1284 
1285         /* load in initial (unfactored row) */
1286         idx    = r[row];
1287         nz_tmp = ai[idx+1] - ai[idx];
1288         ajtmp  = aj + ai[idx];
1289         v1     = aa + ai[idx];
1290 
1291         for (j=0; j<nz_tmp; j++) {
1292           idx        = ics[ajtmp[j]];
1293           rtmp1[idx] = v1[j];
1294         }
1295         rtmp1[ics[r[row]]] += sctx.shift_amount;
1296 
1297         prow = *bjtmp++ ;
1298         while (prow < row) {
1299           pc1 = rtmp1 + prow;
1300           if (*pc1 != 0.0){
1301             pv   = ba + bd[prow];
1302             pj   = nbj + bd[prow];
1303             mul1 = *pc1 * *pv++;
1304             *pc1 = mul1;
1305             nz_tmp = bi[prow+1] - bd[prow] - 1;
1306             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1307             for (j=0; j<nz_tmp; j++) {
1308               tmp = pv[j];
1309               idx = pj[j];
1310               rtmp1[idx] -= mul1 * tmp;
1311             }
1312           }
1313           prow = *bjtmp++ ;
1314         }
1315         pj  = bj + bi[row];
1316         pc1 = ba + bi[row];
1317 
1318         sctx.pv    = rtmp1[row];
1319         rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */
1320         rs         = 0.0;
1321         for (j=0; j<nz; j++) {
1322           idx    = pj[j];
1323           pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */
1324           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1325         }
1326         sctx.rs  = rs;
1327         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1328         if (newshift == 1) goto endofwhile;
1329         break;
1330 
1331       case 2:
1332         for (j=0; j<nz; j++) {
1333           idx        = bjtmp[j];
1334           rtmp1[idx] = 0.0;
1335           rtmp2[idx] = 0.0;
1336         }
1337 
1338         /* load in initial (unfactored row) */
1339         idx    = r[row];
1340         nz_tmp = ai[idx+1] - ai[idx];
1341         ajtmp  = aj + ai[idx];
1342         v1     = aa + ai[idx];
1343         v2     = aa + ai[idx+1];
1344         for (j=0; j<nz_tmp; j++) {
1345           idx        = ics[ajtmp[j]];
1346           rtmp1[idx] = v1[j];
1347           rtmp2[idx] = v2[j];
1348         }
1349         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1350         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1351 
1352         prow = *bjtmp++ ;
1353         while (prow < row) {
1354           pc1 = rtmp1 + prow;
1355           pc2 = rtmp2 + prow;
1356           if (*pc1 != 0.0 || *pc2 != 0.0){
1357             pv   = ba + bd[prow];
1358             pj   = nbj + bd[prow];
1359             mul1 = *pc1 * *pv;
1360             mul2 = *pc2 * *pv;
1361             ++pv;
1362             *pc1 = mul1;
1363             *pc2 = mul2;
1364 
1365             nz_tmp = bi[prow+1] - bd[prow] - 1;
1366             for (j=0; j<nz_tmp; j++) {
1367               tmp = pv[j];
1368               idx = pj[j];
1369               rtmp1[idx] -= mul1 * tmp;
1370               rtmp2[idx] -= mul2 * tmp;
1371             }
1372             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1373           }
1374           prow = *bjtmp++ ;
1375         }
1376 
1377         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1378         pc1 = rtmp1 + prow;
1379         pc2 = rtmp2 + prow;
1380 
1381         sctx.pv = *pc1;
1382         pj      = bj + bi[prow];
1383         rs      = 0.0;
1384         for (j=0; j<nz; j++){
1385           idx = pj[j];
1386           if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]);
1387         }
1388         sctx.rs = rs;
1389         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1390         if (newshift == 1) goto endofwhile;
1391 
1392         if (*pc2 != 0.0){
1393           pj     = nbj + bd[prow];
1394           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1395           *pc2   = mul2;
1396           nz_tmp = bi[prow+1] - bd[prow] - 1;
1397           for (j=0; j<nz_tmp; j++) {
1398             idx = pj[j] ;
1399             tmp = rtmp1[idx];
1400             rtmp2[idx] -= mul2 * tmp;
1401           }
1402           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1403         }
1404 
1405         pj  = bj + bi[row];
1406         pc1 = ba + bi[row];
1407         pc2 = ba + bi[row+1];
1408 
1409         sctx.pv = rtmp2[row+1];
1410         rs = 0.0;
1411         rtmp1[row]   = 1.0/rtmp1[row];
1412         rtmp2[row+1] = 1.0/rtmp2[row+1];
1413         /* copy row entries from dense representation to sparse */
1414         for (j=0; j<nz; j++) {
1415           idx    = pj[j];
1416           pc1[j] = rtmp1[idx];
1417           pc2[j] = rtmp2[idx];
1418           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1419         }
1420         sctx.rs = rs;
1421         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1422         if (newshift == 1) goto endofwhile;
1423         break;
1424 
1425       case 3:
1426         for  (j=0; j<nz; j++) {
1427           idx        = bjtmp[j];
1428           rtmp1[idx] = 0.0;
1429           rtmp2[idx] = 0.0;
1430           rtmp3[idx] = 0.0;
1431         }
1432         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1433         idx    = r[row];
1434         nz_tmp = ai[idx+1] - ai[idx];
1435         ajtmp = aj + ai[idx];
1436         v1    = aa + ai[idx];
1437         v2    = aa + ai[idx+1];
1438         v3    = aa + ai[idx+2];
1439         for (j=0; j<nz_tmp; j++) {
1440           idx        = ics[ajtmp[j]];
1441           rtmp1[idx] = v1[j];
1442           rtmp2[idx] = v2[j];
1443           rtmp3[idx] = v3[j];
1444         }
1445         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1446         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1447         rtmp3[ics[r[row+2]]] += sctx.shift_amount;
1448 
1449         /* loop over all pivot row blocks above this row block */
1450         prow = *bjtmp++ ;
1451         while (prow < row) {
1452           pc1 = rtmp1 + prow;
1453           pc2 = rtmp2 + prow;
1454           pc3 = rtmp3 + prow;
1455           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1456             pv   = ba  + bd[prow];
1457             pj   = nbj + bd[prow];
1458             mul1 = *pc1 * *pv;
1459             mul2 = *pc2 * *pv;
1460             mul3 = *pc3 * *pv;
1461             ++pv;
1462             *pc1 = mul1;
1463             *pc2 = mul2;
1464             *pc3 = mul3;
1465 
1466             nz_tmp = bi[prow+1] - bd[prow] - 1;
1467             /* update this row based on pivot row */
1468             for (j=0; j<nz_tmp; j++) {
1469               tmp = pv[j];
1470               idx = pj[j];
1471               rtmp1[idx] -= mul1 * tmp;
1472               rtmp2[idx] -= mul2 * tmp;
1473               rtmp3[idx] -= mul3 * tmp;
1474             }
1475             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
1476           }
1477           prow = *bjtmp++ ;
1478         }
1479 
1480         /* Now take care of diagonal 3x3 block in this set of rows */
1481         /* note: prow = row here */
1482         pc1 = rtmp1 + prow;
1483         pc2 = rtmp2 + prow;
1484         pc3 = rtmp3 + prow;
1485 
1486         sctx.pv = *pc1;
1487         pj      = bj + bi[prow];
1488         rs      = 0.0;
1489         for (j=0; j<nz; j++){
1490           idx = pj[j];
1491           if (idx != row) rs += PetscAbsScalar(rtmp1[idx]);
1492         }
1493         sctx.rs = rs;
1494         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1495         if (newshift == 1) goto endofwhile;
1496 
1497         if (*pc2 != 0.0 || *pc3 != 0.0){
1498           mul2 = (*pc2)/(*pc1);
1499           mul3 = (*pc3)/(*pc1);
1500           *pc2 = mul2;
1501           *pc3 = mul3;
1502           nz_tmp = bi[prow+1] - bd[prow] - 1;
1503           pj     = nbj + bd[prow];
1504           for (j=0; j<nz_tmp; j++) {
1505             idx = pj[j] ;
1506             tmp = rtmp1[idx];
1507             rtmp2[idx] -= mul2 * tmp;
1508             rtmp3[idx] -= mul3 * tmp;
1509           }
1510           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1511         }
1512         ++prow;
1513 
1514         pc2 = rtmp2 + prow;
1515         pc3 = rtmp3 + prow;
1516         sctx.pv = *pc2;
1517         pj      = bj + bi[prow];
1518         rs      = 0.0;
1519         for (j=0; j<nz; j++){
1520           idx = pj[j];
1521           if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1522         }
1523         sctx.rs = rs;
1524         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1525         if (newshift == 1) goto endofwhile;
1526 
1527         if (*pc3 != 0.0){
1528           mul3   = (*pc3)/(*pc2);
1529           *pc3   = mul3;
1530           pj     = nbj + bd[prow];
1531           nz_tmp = bi[prow+1] - bd[prow] - 1;
1532           for (j=0; j<nz_tmp; j++) {
1533             idx = pj[j] ;
1534             tmp = rtmp2[idx];
1535             rtmp3[idx] -= mul3 * tmp;
1536           }
1537           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1538         }
1539 
1540         pj  = bj + bi[row];
1541         pc1 = ba + bi[row];
1542         pc2 = ba + bi[row+1];
1543         pc3 = ba + bi[row+2];
1544 
1545         sctx.pv = rtmp3[row+2];
1546         rs = 0.0;
1547         rtmp1[row]   = 1.0/rtmp1[row];
1548         rtmp2[row+1] = 1.0/rtmp2[row+1];
1549         rtmp3[row+2] = 1.0/rtmp3[row+2];
1550         /* copy row entries from dense representation to sparse */
1551         for (j=0; j<nz; j++) {
1552           idx    = pj[j];
1553           pc1[j] = rtmp1[idx];
1554           pc2[j] = rtmp2[idx];
1555           pc3[j] = rtmp3[idx];
1556           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1557         }
1558 
1559         sctx.rs = rs;
1560         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1561         if (newshift == 1) goto endofwhile;
1562         break;
1563 
1564       default:
1565         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1566       }
1567       row += nodesz;                 /* Update the row */
1568     }
1569     endofwhile:;
1570   } while (sctx.lushift);
1571   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1572   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1573   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1574   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1575   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1576   C->factor      = FACTOR_LU;
1577   C->assembled   = PETSC_TRUE;
1578   if (sctx.nshift) {
1579     if (info->shiftnz) {
1580       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1581     } else if (info->shiftpd) {
1582       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1583     }
1584   }
1585   ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr);
1586   PetscFunctionReturn(0);
1587 }
1588 
1589 /*
1590      Makes a longer coloring[] array and calls the usual code with that
1591 */
1592 #undef __FUNCT__
1593 #define __FUNCT__ "MatColoringPatch_Inode"
1594 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1595 {
1596   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1597   PetscErrorCode  ierr;
1598   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1599   PetscInt        *colorused,i;
1600   ISColoringValue *newcolor;
1601 
1602   PetscFunctionBegin;
1603   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1604   /* loop over inodes, marking a color for each column*/
1605   row = 0;
1606   for (i=0; i<m; i++){
1607     for (j=0; j<ns[i]; j++) {
1608       newcolor[row++] = coloring[i] + j*ncolors;
1609     }
1610   }
1611 
1612   /* eliminate unneeded colors */
1613   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1614   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1615   for (i=0; i<n; i++) {
1616     colorused[newcolor[i]] = 1;
1617   }
1618 
1619   for (i=1; i<5*ncolors; i++) {
1620     colorused[i] += colorused[i-1];
1621   }
1622   ncolors = colorused[5*ncolors-1];
1623   for (i=0; i<n; i++) {
1624     newcolor[i] = colorused[newcolor[i]]-1;
1625   }
1626   ierr = PetscFree(colorused);CHKERRQ(ierr);
1627   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1628   ierr = PetscFree(coloring);CHKERRQ(ierr);
1629   PetscFunctionReturn(0);
1630 }
1631 
1632 #include "src/inline/ilu.h"
1633 
1634 #undef __FUNCT__
1635 #define __FUNCT__ "MatRelax_Inode"
1636 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1637 {
1638   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
1639   PetscScalar     *x,*xs,*ibdiag,*bdiag,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3,*v1,*v2,*v3,*v4,*v5;
1640   PetscScalar     *v = a->a,*b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1641   PetscReal       zeropivot = 1.0e-15;
1642   PetscErrorCode  ierr;
1643   PetscInt        n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1644   PetscInt        *idx,*diag = a->diag,*ii = a->i,sz,k;
1645 
1646   PetscFunctionBegin;
1647   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1648   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1649   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode");
1650 
1651   if (!a->inode.ibdiagvalid) {
1652     if (!a->inode.ibdiag) {
1653       /* calculate space needed for diagonal blocks */
1654       for (i=0; i<m; i++) {
1655 	cnt += sizes[i]*sizes[i];
1656       }
1657       a->inode.bdiagsize = cnt;
1658       ierr   = PetscMalloc2(cnt,PetscScalar,&a->inode.ibdiag,cnt,PetscScalar,&a->inode.bdiag);CHKERRQ(ierr);
1659     }
1660 
1661     /* copy over the diagonal blocks and invert them */
1662     ibdiag = a->inode.ibdiag;
1663     bdiag  = a->inode.bdiag;
1664     cnt = 0;
1665     for (i=0, row = 0; i<m; i++) {
1666       for (j=0; j<sizes[i]; j++) {
1667         for (k=0; k<sizes[i]; k++) {
1668           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1669         }
1670       }
1671       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(PetscScalar));CHKERRQ(ierr);
1672 
1673       switch(sizes[i]) {
1674         case 1:
1675           /* Create matrix data structure */
1676           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1677           ibdiag[cnt] = 1.0/ibdiag[cnt];
1678           break;
1679         case 2:
1680           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt);CHKERRQ(ierr);
1681           break;
1682         case 3:
1683           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt);CHKERRQ(ierr);
1684           break;
1685         case 4:
1686           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt);CHKERRQ(ierr);
1687           break;
1688         case 5:
1689           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt);CHKERRQ(ierr);
1690           break;
1691        default:
1692 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1693       }
1694       cnt += sizes[i]*sizes[i];
1695       row += sizes[i];
1696     }
1697     a->inode.ibdiagvalid = PETSC_TRUE;
1698   }
1699   ibdiag = a->inode.ibdiag;
1700   bdiag  = a->inode.bdiag;
1701 
1702   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1703   if (xx != bb) {
1704     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1705   } else {
1706     b = x;
1707   }
1708 
1709   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1710   xs   = x;
1711   if (flag & SOR_ZERO_INITIAL_GUESS) {
1712     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1713 
1714       for (i=0, row=0; i<m; i++) {
1715         sz  = diag[row] - ii[row];
1716         v1  = a->a + ii[row];
1717         idx = a->j + ii[row];
1718 
1719         /* see comments for MatMult_Inode() for how this is coded */
1720         switch (sizes[i]){
1721           case 1:
1722 
1723             sum1  = b[row];
1724             for(n = 0; n<sz-1; n+=2) {
1725               i1   = idx[0];
1726               i2   = idx[1];
1727               idx += 2;
1728               tmp0 = x[i1];
1729               tmp1 = x[i2];
1730               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1731             }
1732 
1733             if (n == sz-1){
1734               tmp0  = x[*idx];
1735               sum1 -= *v1 * tmp0;
1736             }
1737             x[row++] = sum1*(*ibdiag++);
1738             break;
1739           case 2:
1740             v2    = a->a + ii[row+1];
1741             sum1  = b[row];
1742             sum2  = b[row+1];
1743             for(n = 0; n<sz-1; n+=2) {
1744               i1   = idx[0];
1745               i2   = idx[1];
1746               idx += 2;
1747               tmp0 = x[i1];
1748               tmp1 = x[i2];
1749               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1750               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1751             }
1752 
1753             if (n == sz-1){
1754               tmp0  = x[*idx];
1755               sum1 -= v1[0] * tmp0;
1756               sum2 -= v2[0] * tmp0;
1757             }
1758             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1759             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1760             ibdiag  += 4;
1761             break;
1762           case 3:
1763             v2    = a->a + ii[row+1];
1764             v3    = a->a + ii[row+2];
1765             sum1  = b[row];
1766             sum2  = b[row+1];
1767             sum3  = b[row+2];
1768             for(n = 0; n<sz-1; n+=2) {
1769               i1   = idx[0];
1770               i2   = idx[1];
1771               idx += 2;
1772               tmp0 = x[i1];
1773               tmp1 = x[i2];
1774               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1775               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1776               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1777             }
1778 
1779             if (n == sz-1){
1780               tmp0  = x[*idx];
1781               sum1 -= v1[0] * tmp0;
1782               sum2 -= v2[0] * tmp0;
1783               sum3 -= v3[0] * tmp0;
1784             }
1785             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1786             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1787             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1788             ibdiag  += 9;
1789             break;
1790           case 4:
1791             v2    = a->a + ii[row+1];
1792             v3    = a->a + ii[row+2];
1793             v4    = a->a + ii[row+3];
1794             sum1  = b[row];
1795             sum2  = b[row+1];
1796             sum3  = b[row+2];
1797             sum4  = b[row+3];
1798             for(n = 0; n<sz-1; n+=2) {
1799               i1   = idx[0];
1800               i2   = idx[1];
1801               idx += 2;
1802               tmp0 = x[i1];
1803               tmp1 = x[i2];
1804               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1805               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1806               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1807               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1808             }
1809 
1810             if (n == sz-1){
1811               tmp0  = x[*idx];
1812               sum1 -= v1[0] * tmp0;
1813               sum2 -= v2[0] * tmp0;
1814               sum3 -= v3[0] * tmp0;
1815               sum4 -= v4[0] * tmp0;
1816             }
1817             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1818             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1819             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1820             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1821             ibdiag  += 16;
1822             break;
1823           case 5:
1824             v2    = a->a + ii[row+1];
1825             v3    = a->a + ii[row+2];
1826             v4    = a->a + ii[row+3];
1827             v5    = a->a + ii[row+4];
1828             sum1  = b[row];
1829             sum2  = b[row+1];
1830             sum3  = b[row+2];
1831             sum4  = b[row+3];
1832             sum5  = b[row+4];
1833             for(n = 0; n<sz-1; n+=2) {
1834               i1   = idx[0];
1835               i2   = idx[1];
1836               idx += 2;
1837               tmp0 = x[i1];
1838               tmp1 = x[i2];
1839               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1840               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1841               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1842               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1843               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1844             }
1845 
1846             if (n == sz-1){
1847               tmp0  = x[*idx];
1848               sum1 -= v1[0] * tmp0;
1849               sum2 -= v2[0] * tmp0;
1850               sum3 -= v3[0] * tmp0;
1851               sum4 -= v4[0] * tmp0;
1852               sum5 -= v5[0] * tmp0;
1853             }
1854             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1855             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1856             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1857             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1858             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1859             ibdiag  += 25;
1860             break;
1861 	  default:
1862    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1863         }
1864       }
1865 
1866       xb = x;
1867       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1868     } else xb = b;
1869     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1870         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1871       cnt = 0;
1872       for (i=0, row=0; i<m; i++) {
1873 
1874         switch (sizes[i]){
1875           case 1:
1876             x[row++] *= bdiag[cnt++];
1877             break;
1878           case 2:
1879             x1   = x[row]; x2 = x[row+1];
1880             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1881             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1882             x[row++] = tmp1;
1883             x[row++] = tmp2;
1884             cnt += 4;
1885             break;
1886           case 3:
1887             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1888             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1889             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1890             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1891             x[row++] = tmp1;
1892             x[row++] = tmp2;
1893             x[row++] = tmp3;
1894             cnt += 9;
1895             break;
1896           case 4:
1897             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1898             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1899             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1900             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1901             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1902             x[row++] = tmp1;
1903             x[row++] = tmp2;
1904             x[row++] = tmp3;
1905             x[row++] = tmp4;
1906             cnt += 16;
1907             break;
1908           case 5:
1909             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1910             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1911             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1912             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1913             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1914             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1915             x[row++] = tmp1;
1916             x[row++] = tmp2;
1917             x[row++] = tmp3;
1918             x[row++] = tmp4;
1919             x[row++] = tmp5;
1920             cnt += 25;
1921             break;
1922 	  default:
1923    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1924         }
1925       }
1926       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1927     }
1928     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1929 
1930       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1931       for (i=m-1, row=A->rmap.n-1; i>=0; i--) {
1932         ibdiag -= sizes[i]*sizes[i];
1933         sz      = ii[row+1] - diag[row] - 1;
1934         v1      = a->a + diag[row] + 1;
1935         idx     = a->j + diag[row] + 1;
1936 
1937         /* see comments for MatMult_Inode() for how this is coded */
1938         switch (sizes[i]){
1939           case 1:
1940 
1941             sum1  = xb[row];
1942             for(n = 0; n<sz-1; n+=2) {
1943               i1   = idx[0];
1944               i2   = idx[1];
1945               idx += 2;
1946               tmp0 = x[i1];
1947               tmp1 = x[i2];
1948               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1949             }
1950 
1951             if (n == sz-1){
1952               tmp0  = x[*idx];
1953               sum1 -= *v1*tmp0;
1954             }
1955             x[row--] = sum1*(*ibdiag);
1956             break;
1957 
1958           case 2:
1959 
1960             sum1  = xb[row];
1961             sum2  = xb[row-1];
1962             /* note that sum1 is associated with the second of the two rows */
1963             v2    = a->a + diag[row-1] + 2;
1964             for(n = 0; n<sz-1; n+=2) {
1965               i1   = idx[0];
1966               i2   = idx[1];
1967               idx += 2;
1968               tmp0 = x[i1];
1969               tmp1 = x[i2];
1970               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1971               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1972             }
1973 
1974             if (n == sz-1){
1975               tmp0  = x[*idx];
1976               sum1 -= *v1*tmp0;
1977               sum2 -= *v2*tmp0;
1978             }
1979             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1980             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1981             break;
1982           case 3:
1983 
1984             sum1  = xb[row];
1985             sum2  = xb[row-1];
1986             sum3  = xb[row-2];
1987             v2    = a->a + diag[row-1] + 2;
1988             v3    = a->a + diag[row-2] + 3;
1989             for(n = 0; n<sz-1; n+=2) {
1990               i1   = idx[0];
1991               i2   = idx[1];
1992               idx += 2;
1993               tmp0 = x[i1];
1994               tmp1 = x[i2];
1995               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1996               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1997               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1998             }
1999 
2000             if (n == sz-1){
2001               tmp0  = x[*idx];
2002               sum1 -= *v1*tmp0;
2003               sum2 -= *v2*tmp0;
2004               sum3 -= *v3*tmp0;
2005             }
2006             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2007             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2008             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2009             break;
2010           case 4:
2011 
2012             sum1  = xb[row];
2013             sum2  = xb[row-1];
2014             sum3  = xb[row-2];
2015             sum4  = xb[row-3];
2016             v2    = a->a + diag[row-1] + 2;
2017             v3    = a->a + diag[row-2] + 3;
2018             v4    = a->a + diag[row-3] + 4;
2019             for(n = 0; n<sz-1; n+=2) {
2020               i1   = idx[0];
2021               i2   = idx[1];
2022               idx += 2;
2023               tmp0 = x[i1];
2024               tmp1 = x[i2];
2025               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2026               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2027               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2028               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2029             }
2030 
2031             if (n == sz-1){
2032               tmp0  = x[*idx];
2033               sum1 -= *v1*tmp0;
2034               sum2 -= *v2*tmp0;
2035               sum3 -= *v3*tmp0;
2036               sum4 -= *v4*tmp0;
2037             }
2038             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2039             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2040             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2041             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2042             break;
2043           case 5:
2044 
2045             sum1  = xb[row];
2046             sum2  = xb[row-1];
2047             sum3  = xb[row-2];
2048             sum4  = xb[row-3];
2049             sum5  = xb[row-4];
2050             v2    = a->a + diag[row-1] + 2;
2051             v3    = a->a + diag[row-2] + 3;
2052             v4    = a->a + diag[row-3] + 4;
2053             v5    = a->a + diag[row-4] + 5;
2054             for(n = 0; n<sz-1; n+=2) {
2055               i1   = idx[0];
2056               i2   = idx[1];
2057               idx += 2;
2058               tmp0 = x[i1];
2059               tmp1 = x[i2];
2060               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2061               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2062               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2063               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2064               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2065             }
2066 
2067             if (n == sz-1){
2068               tmp0  = x[*idx];
2069               sum1 -= *v1*tmp0;
2070               sum2 -= *v2*tmp0;
2071               sum3 -= *v3*tmp0;
2072               sum4 -= *v4*tmp0;
2073               sum5 -= *v5*tmp0;
2074             }
2075             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2076             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2077             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2078             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2079             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2080             break;
2081 	  default:
2082    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2083         }
2084       }
2085 
2086       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2087     }
2088     its--;
2089   }
2090   if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode");
2091   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2092   if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2093   PetscFunctionReturn(0);
2094 }
2095 
2096 
2097 /*
2098     samestructure indicates that the matrix has not changed its nonzero structure so we
2099     do not need to recompute the inodes
2100 */
2101 #undef __FUNCT__
2102 #define __FUNCT__ "Mat_CheckInode"
2103 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2104 {
2105   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2106   PetscErrorCode ierr;
2107   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2108   PetscTruth     flag;
2109 
2110   PetscFunctionBegin;
2111   if (!a->inode.use)                     PetscFunctionReturn(0);
2112   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2113 
2114 
2115   m = A->rmap.n;
2116   if (a->inode.size) {ns = a->inode.size;}
2117   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2118 
2119   i          = 0;
2120   node_count = 0;
2121   idx        = a->j;
2122   ii         = a->i;
2123   while (i < m){                /* For each row */
2124     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2125     /* Limits the number of elements in a node to 'a->inode.limit' */
2126     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2127       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2128       if (nzy != nzx) break;
2129       idy  += nzx;             /* Same nonzero pattern */
2130       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2131       if (!flag) break;
2132     }
2133     ns[node_count++] = blk_size;
2134     idx += blk_size*nzx;
2135     i    = j;
2136   }
2137   /* If not enough inodes found,, do not use inode version of the routines */
2138   if (!a->inode.size && m && node_count > .9*m) {
2139     ierr = PetscFree(ns);CHKERRQ(ierr);
2140     a->inode.node_count     = 0;
2141     a->inode.size           = PETSC_NULL;
2142     a->inode.use            = PETSC_FALSE;
2143     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2144   } else {
2145     A->ops->mult            = MatMult_Inode;
2146     A->ops->relax           = MatRelax_Inode;
2147     A->ops->multadd         = MatMultAdd_Inode;
2148     A->ops->solve           = MatSolve_Inode;
2149     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
2150     A->ops->getrowij        = MatGetRowIJ_Inode;
2151     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
2152     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
2153     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
2154     A->ops->coloringpatch   = MatColoringPatch_Inode;
2155     a->inode.node_count     = node_count;
2156     a->inode.size           = ns;
2157     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2158   }
2159   PetscFunctionReturn(0);
2160 }
2161 
2162 /*
2163      This is really ugly. if inodes are used this replaces the
2164   permutations with ones that correspond to rows/cols of the matrix
2165   rather then inode blocks
2166 */
2167 #undef __FUNCT__
2168 #define __FUNCT__ "MatInodeAdjustForInodes"
2169 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2170 {
2171   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2172 
2173   PetscFunctionBegin;
2174   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2175   if (f) {
2176     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2177   }
2178   PetscFunctionReturn(0);
2179 }
2180 
2181 EXTERN_C_BEGIN
2182 #undef __FUNCT__
2183 #define __FUNCT__ "MatAdjustForInodes_Inode"
2184 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2185 {
2186   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2187   PetscErrorCode ierr;
2188   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
2189   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2190   PetscInt       nslim_col,*ns_col;
2191   IS             ris = *rperm,cis = *cperm;
2192 
2193   PetscFunctionBegin;
2194   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2195   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2196 
2197   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2198   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2199   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2200   permc = permr + m;
2201 
2202   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2203   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2204 
2205   /* Form the inode structure for the rows of permuted matric using inv perm*/
2206   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2207 
2208   /* Construct the permutations for rows*/
2209   for (i=0,row = 0; i<nslim_row; ++i){
2210     indx      = ridx[i];
2211     start_val = tns[indx];
2212     end_val   = tns[indx + 1];
2213     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2214   }
2215 
2216   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2217   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2218 
2219  /* Construct permutations for columns */
2220   for (i=0,col=0; i<nslim_col; ++i){
2221     indx      = cidx[i];
2222     start_val = tns[indx];
2223     end_val   = tns[indx + 1];
2224     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2225   }
2226 
2227   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2228   ISSetPermutation(*rperm);
2229   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2230   ISSetPermutation(*cperm);
2231 
2232   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2233   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2234 
2235   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2236   ierr = PetscFree(permr);CHKERRQ(ierr);
2237   ierr = ISDestroy(cis);CHKERRQ(ierr);
2238   ierr = ISDestroy(ris);CHKERRQ(ierr);
2239   ierr = PetscFree(tns);CHKERRQ(ierr);
2240   PetscFunctionReturn(0);
2241 }
2242 EXTERN_C_END
2243 
2244 #undef __FUNCT__
2245 #define __FUNCT__ "MatInodeGetInodeSizes"
2246 /*@C
2247    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2248 
2249    Collective on Mat
2250 
2251    Input Parameter:
2252 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2253 
2254    Output Parameter:
2255 +  node_count - no of inodes present in the matrix.
2256 .  sizes      - an array of size node_count,with sizes of each inode.
2257 -  limit      - the max size used to generate the inodes.
2258 
2259    Level: advanced
2260 
2261    Notes: This routine returns some internal storage information
2262    of the matrix, it is intended to be used by advanced users.
2263    It should be called after the matrix is assembled.
2264    The contents of the sizes[] array should not be changed.
2265    PETSC_NULL may be passed for information not requested.
2266 
2267 .keywords: matrix, seqaij, get, inode
2268 
2269 .seealso: MatGetInfo()
2270 @*/
2271 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2272 {
2273   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2274 
2275   PetscFunctionBegin;
2276   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2277   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2278   if (f) {
2279     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2280   }
2281   PetscFunctionReturn(0);
2282 }
2283 
2284 EXTERN_C_BEGIN
2285 #undef __FUNCT__
2286 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
2287 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2288 {
2289   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2290 
2291   PetscFunctionBegin;
2292   if (node_count) *node_count = a->inode.node_count;
2293   if (sizes)      *sizes      = a->inode.size;
2294   if (limit)      *limit      = a->inode.limit;
2295   PetscFunctionReturn(0);
2296 }
2297 EXTERN_C_END
2298