xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 5ed2d596fd9914de76abbaed56c65f4792ae57ae)
1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/vec/vecimpl.h"
5 #include "src/inline/spops.h"
6 #include "src/inline/ilu.h"
7 #include "petscbt.h"
8 #include "src/mat/impls/sbaij/seq/sbaij.h"
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
12 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov)
13 {
14   PetscFunctionBegin;
15   SETERRQ(1,"Function not yet written for SBAIJ format");
16   /* PetscFunctionReturn(0); */
17 }
18 
19 #undef __FUNCT__
20 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
21 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
22 {
23   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
24   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
25   int          row,mat_i,*mat_j,tcol,*mat_ilen;
26   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
27   int          *aj = a->j,*ai = a->i;
28   MatScalar    *mat_a;
29   Mat          C;
30   PetscTruth   flag;
31 
32   PetscFunctionBegin;
33 
34   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
35   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
36   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
37 
38   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
39   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
40 
41   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
42   ssmap = smap;
43   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
44   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
45   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
46   /* determine lens of each row */
47   for (i=0; i<nrows; i++) {
48     kstart  = ai[irow[i]];
49     kend    = kstart + a->ilen[irow[i]];
50     lens[i] = 0;
51       for (k=kstart; k<kend; k++) {
52         if (ssmap[aj[k]]) {
53           lens[i]++;
54         }
55       }
56     }
57   /* Create and fill new matrix */
58   if (scall == MAT_REUSE_MATRIX) {
59     c = (Mat_SeqSBAIJ *)((*B)->data);
60 
61     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
62     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
63     if (flag == PETSC_FALSE) {
64       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
65     }
66     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
67     C = *B;
68   } else {
69     ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr);
70   }
71   c = (Mat_SeqSBAIJ *)(C->data);
72   for (i=0; i<nrows; i++) {
73     row    = irow[i];
74     kstart = ai[row];
75     kend   = kstart + a->ilen[row];
76     mat_i  = c->i[i];
77     mat_j  = c->j + mat_i;
78     mat_a  = c->a + mat_i*bs2;
79     mat_ilen = c->ilen + i;
80     for (k=kstart; k<kend; k++) {
81       if ((tcol=ssmap[a->j[k]])) {
82         *mat_j++ = tcol - 1;
83         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
84         mat_a   += bs2;
85         (*mat_ilen)++;
86       }
87     }
88   }
89 
90   /* Free work space */
91   ierr = PetscFree(smap);CHKERRQ(ierr);
92   ierr = PetscFree(lens);CHKERRQ(ierr);
93   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
94   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
95 
96   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
97   *B = C;
98   PetscFunctionReturn(0);
99 }
100 
101 #undef __FUNCT__
102 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
103 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
104 {
105   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
106   IS          is1;
107   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
108 
109   PetscFunctionBegin;
110   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
111 
112   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
113   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
114 
115   /* Verify if the indices corespond to each element in a block
116    and form the IS with compressed IS */
117   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
118   iary = vary + a->mbs;
119   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
120   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
121 
122   count = 0;
123   for (i=0; i<a->mbs; i++) {
124     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
125     if (vary[i]==bs) iary[count++] = i;
126   }
127   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
128 
129   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
130   ierr = PetscFree(vary);CHKERRQ(ierr);
131 
132   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
133   ISDestroy(is1);
134   PetscFunctionReturn(0);
135 }
136 
137 #undef __FUNCT__
138 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
139 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
140 {
141   int ierr,i;
142 
143   PetscFunctionBegin;
144   if (scall == MAT_INITIAL_MATRIX) {
145     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
146   }
147 
148   for (i=0; i<n; i++) {
149     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
150   }
151   PetscFunctionReturn(0);
152 }
153 
154 /* -------------------------------------------------------*/
155 /* Should check that shapes of vectors and matrices match */
156 /* -------------------------------------------------------*/
157 #include "petscblaslapack.h"
158 
159 #undef __FUNCT__
160 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
161 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
162 {
163   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
164   PetscScalar     *x,*z,*xb,x1,zero=0.0;
165   MatScalar       *v;
166   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
167 
168   PetscFunctionBegin;
169   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
170   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
171   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
172 
173   v  = a->a;
174   xb = x;
175 
176   for (i=0; i<mbs; i++) {
177     n  = ai[1] - ai[0];  /* length of i_th row of A */
178     x1 = xb[0];
179     ib = aj + *ai;
180     jmin = 0;
181     if (*ib == i) {      /* (diag of A)*x */
182       z[i] += *v++ * x[*ib++];
183       jmin++;
184     }
185     for (j=jmin; j<n; j++) {
186       cval    = *ib;
187       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
188       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
189     }
190     xb++; ai++;
191   }
192 
193   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
194   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
195   PetscLogFlops(2*(a->s_nz*2 - A->m) - A->m);  /* s_nz = (nz+m)/2 */
196   PetscFunctionReturn(0);
197 }
198 
199 #undef __FUNCT__
200 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
201 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
202 {
203   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
204   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
205   MatScalar        *v;
206   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
207 
208 
209   PetscFunctionBegin;
210   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
211   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
212   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
213 
214   v     = a->a;
215   xb = x;
216 
217   for (i=0; i<mbs; i++) {
218     n  = ai[1] - ai[0]; /* length of i_th block row of A */
219     x1 = xb[0]; x2 = xb[1];
220     ib = aj + *ai;
221     jmin = 0;
222     if (*ib == i){     /* (diag of A)*x */
223       z[2*i]   += v[0]*x1 + v[2]*x2;
224       z[2*i+1] += v[2]*x1 + v[3]*x2;
225       v += 4; jmin++;
226     }
227     for (j=jmin; j<n; j++) {
228       /* (strict lower triangular part of A)*x  */
229       cval       = ib[j]*2;
230       z[cval]     += v[0]*x1 + v[1]*x2;
231       z[cval+1]   += v[2]*x1 + v[3]*x2;
232       /* (strict upper triangular part of A)*x  */
233       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
234       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
235       v  += 4;
236     }
237     xb +=2; ai++;
238   }
239 
240   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
241   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
242   PetscLogFlops(8*(a->s_nz*2 - A->m) - A->m);
243   PetscFunctionReturn(0);
244 }
245 
246 #undef __FUNCT__
247 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
248 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
249 {
250   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
251   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
252   MatScalar     *v;
253   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
254 
255 
256   PetscFunctionBegin;
257   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
258   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
259   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
260 
261   v     = a->a;
262   xb = x;
263 
264   for (i=0; i<mbs; i++) {
265     n  = ai[1] - ai[0]; /* length of i_th block row of A */
266     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
267     ib = aj + *ai;
268     jmin = 0;
269     if (*ib == i){     /* (diag of A)*x */
270       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
271       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
272       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
273       v += 9; jmin++;
274     }
275     for (j=jmin; j<n; j++) {
276       /* (strict lower triangular part of A)*x  */
277       cval       = ib[j]*3;
278       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
279       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
280       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
281       /* (strict upper triangular part of A)*x  */
282       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
283       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
284       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
285       v  += 9;
286     }
287     xb +=3; ai++;
288   }
289 
290   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
291   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
292   PetscLogFlops(18*(a->s_nz*2 - A->m) - A->m);
293   PetscFunctionReturn(0);
294 }
295 
296 #undef __FUNCT__
297 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
298 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
299 {
300   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
301   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
302   MatScalar        *v;
303   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
304 
305   PetscFunctionBegin;
306   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
307   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
308   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
309 
310   v     = a->a;
311   xb = x;
312 
313   for (i=0; i<mbs; i++) {
314     n  = ai[1] - ai[0]; /* length of i_th block row of A */
315     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
316     ib = aj + *ai;
317     jmin = 0;
318     if (*ib == i){     /* (diag of A)*x */
319       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
320       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
321       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
322       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
323       v += 16; jmin++;
324     }
325     for (j=jmin; j<n; j++) {
326       /* (strict lower triangular part of A)*x  */
327       cval       = ib[j]*4;
328       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
329       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
330       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
331       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
332       /* (strict upper triangular part of A)*x  */
333       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
334       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
335       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
336       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
337       v  += 16;
338     }
339     xb +=4; ai++;
340   }
341 
342   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
343   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
344   PetscLogFlops(32*(a->s_nz*2 - A->m) - A->m);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
350 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
351 {
352   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
353   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
354   MatScalar       *v;
355   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
356 
357   PetscFunctionBegin;
358   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
359   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
360   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
361 
362   v     = a->a;
363   xb = x;
364 
365   for (i=0; i<mbs; i++) {
366     n  = ai[1] - ai[0]; /* length of i_th block row of A */
367     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
368     ib = aj + *ai;
369     jmin = 0;
370     if (*ib == i){      /* (diag of A)*x */
371       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
372       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
373       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
374       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
375       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
376       v += 25; jmin++;
377     }
378     for (j=jmin; j<n; j++) {
379       /* (strict lower triangular part of A)*x  */
380       cval       = ib[j]*5;
381       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
382       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
383       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
384       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
385       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
386       /* (strict upper triangular part of A)*x  */
387       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
388       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
389       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
390       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
391       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
392       v  += 25;
393     }
394     xb +=5; ai++;
395   }
396 
397   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
398   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
399   PetscLogFlops(50*(a->s_nz*2 - A->m) - A->m);
400   PetscFunctionReturn(0);
401 }
402 
403 
404 #undef __FUNCT__
405 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
406 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
407 {
408   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
409   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
410   MatScalar       *v;
411   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
412 
413   PetscFunctionBegin;
414   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
415   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
416   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
417 
418   v     = a->a;
419   xb = x;
420 
421   for (i=0; i<mbs; i++) {
422     n  = ai[1] - ai[0]; /* length of i_th block row of A */
423     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
424     ib = aj + *ai;
425     jmin = 0;
426     if (*ib == i){      /* (diag of A)*x */
427       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
428       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
429       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
430       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
431       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
432       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
433       v += 36; jmin++;
434     }
435     for (j=jmin; j<n; j++) {
436       /* (strict lower triangular part of A)*x  */
437       cval       = ib[j]*6;
438       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
439       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
440       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
441       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
442       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
443       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
444       /* (strict upper triangular part of A)*x  */
445       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
446       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
447       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
448       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
449       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
450       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
451       v  += 36;
452     }
453     xb +=6; ai++;
454   }
455 
456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
458   PetscLogFlops(72*(a->s_nz*2 - A->m) - A->m);
459   PetscFunctionReturn(0);
460 }
461 #undef __FUNCT__
462 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
463 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
464 {
465   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
466   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
467   MatScalar       *v;
468   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
469 
470   PetscFunctionBegin;
471   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
472   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
473   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
474 
475   v     = a->a;
476   xb = x;
477 
478   for (i=0; i<mbs; i++) {
479     n  = ai[1] - ai[0]; /* length of i_th block row of A */
480     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
481     ib = aj + *ai;
482     jmin = 0;
483     if (*ib == i){      /* (diag of A)*x */
484       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
485       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
486       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
487       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
488       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
489       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
490       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
491       v += 49; jmin++;
492     }
493     for (j=jmin; j<n; j++) {
494       /* (strict lower triangular part of A)*x  */
495       cval       = ib[j]*7;
496       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
497       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
498       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
499       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
500       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
501       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
502       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
503       /* (strict upper triangular part of A)*x  */
504       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
505       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
506       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
507       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
508       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
509       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
510       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
511       v  += 49;
512     }
513     xb +=7; ai++;
514   }
515   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
516   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
517   PetscLogFlops(98*(a->s_nz*2 - A->m) - A->m);
518   PetscFunctionReturn(0);
519 }
520 
521 /*
522     This will not work with MatScalar == float because it calls the BLAS
523 */
524 #undef __FUNCT__
525 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
526 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
527 {
528   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
529   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
530   MatScalar       *v;
531   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
532   int             ncols,k;
533 
534   PetscFunctionBegin;
535   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
536   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
537   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
538 
539   aj   = a->j;
540   v    = a->a;
541   ii   = a->i;
542 
543   if (!a->mult_work) {
544     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
545   }
546   work = a->mult_work;
547 
548   for (i=0; i<mbs; i++) {
549     n     = ii[1] - ii[0]; ncols = n*bs;
550     workt = work; idx=aj+ii[0];
551 
552     /* upper triangular part */
553     for (j=0; j<n; j++) {
554       xb = x_ptr + bs*(*idx++);
555       for (k=0; k<bs; k++) workt[k] = xb[k];
556       workt += bs;
557     }
558     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
559     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
560 
561     /* strict lower triangular part */
562     idx = aj+ii[0];
563     if (*idx == i){
564       ncols -= bs; v += bs2; idx++; n--;
565     }
566 
567     if (ncols > 0){
568       workt = work;
569       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
570       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
571       for (j=0; j<n; j++) {
572         zb = z_ptr + bs*(*idx++);
573         for (k=0; k<bs; k++) zb[k] += workt[k] ;
574         workt += bs;
575       }
576     }
577     x += bs; v += n*bs2; z += bs; ii++;
578   }
579 
580   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
581   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
582   PetscLogFlops(2*(a->s_nz*2 - A->m)*bs2 - A->m);
583   PetscFunctionReturn(0);
584 }
585 
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
588 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
589 {
590   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
591   PetscScalar     *x,*y,*z,*xb,x1;
592   MatScalar       *v;
593   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
594 
595   PetscFunctionBegin;
596   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
597   if (yy != xx) {
598     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
599   } else {
600     y = x;
601   }
602   if (zz != yy) {
603     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
606   } else {
607     z = y;
608   }
609 
610   v  = a->a;
611   xb = x;
612 
613   for (i=0; i<mbs; i++) {
614     n  = ai[1] - ai[0];  /* length of i_th row of A */
615     x1 = xb[0];
616     ib = aj + *ai;
617     jmin = 0;
618     if (*ib == i) {            /* (diag of A)*x */
619       z[i] += *v++ * x[*ib++]; jmin++;
620     }
621     for (j=jmin; j<n; j++) {
622       cval    = *ib;
623       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
624       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
625     }
626     xb++; ai++;
627   }
628 
629   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
630   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
631   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
632 
633   PetscLogFlops(2*(a->s_nz*2 - A->m));
634   PetscFunctionReturn(0);
635 }
636 
637 #undef __FUNCT__
638 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
639 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
640 {
641   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
642   PetscScalar     *x,*y,*z,*xb,x1,x2;
643   MatScalar       *v;
644   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
645 
646   PetscFunctionBegin;
647   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
648   if (yy != xx) {
649     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
650   } else {
651     y = x;
652   }
653   if (zz != yy) {
654     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
655     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
656     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
657   } else {
658     z = y;
659   }
660 
661   v     = a->a;
662   xb = x;
663 
664   for (i=0; i<mbs; i++) {
665     n  = ai[1] - ai[0]; /* length of i_th block row of A */
666     x1 = xb[0]; x2 = xb[1];
667     ib = aj + *ai;
668     jmin = 0;
669     if (*ib == i){      /* (diag of A)*x */
670       z[2*i]   += v[0]*x1 + v[2]*x2;
671       z[2*i+1] += v[2]*x1 + v[3]*x2;
672       v += 4; jmin++;
673     }
674     for (j=jmin; j<n; j++) {
675       /* (strict lower triangular part of A)*x  */
676       cval       = ib[j]*2;
677       z[cval]     += v[0]*x1 + v[1]*x2;
678       z[cval+1]   += v[2]*x1 + v[3]*x2;
679       /* (strict upper triangular part of A)*x  */
680       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
681       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
682       v  += 4;
683     }
684     xb +=2; ai++;
685   }
686 
687   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
688   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
689   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
690 
691   PetscLogFlops(4*(a->s_nz*2 - A->m));
692   PetscFunctionReturn(0);
693 }
694 
695 #undef __FUNCT__
696 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
697 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
698 {
699   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
700   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
701   MatScalar       *v;
702   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
703 
704   PetscFunctionBegin;
705   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
706   if (yy != xx) {
707     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
708   } else {
709     y = x;
710   }
711   if (zz != yy) {
712     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
713     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
714     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
715   } else {
716     z = y;
717   }
718 
719   v     = a->a;
720   xb = x;
721 
722   for (i=0; i<mbs; i++) {
723     n  = ai[1] - ai[0]; /* length of i_th block row of A */
724     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
725     ib = aj + *ai;
726     jmin = 0;
727     if (*ib == i){     /* (diag of A)*x */
728      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
729      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
730      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
731      v += 9; jmin++;
732     }
733     for (j=jmin; j<n; j++) {
734       /* (strict lower triangular part of A)*x  */
735       cval       = ib[j]*3;
736       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
737       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
738       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
739       /* (strict upper triangular part of A)*x  */
740       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
741       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
742       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
743       v  += 9;
744     }
745     xb +=3; ai++;
746   }
747 
748   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
749   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
750   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
751 
752   PetscLogFlops(18*(a->s_nz*2 - A->m));
753   PetscFunctionReturn(0);
754 }
755 
756 #undef __FUNCT__
757 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
758 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
759 {
760   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
761   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
762   MatScalar       *v;
763   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
764 
765   PetscFunctionBegin;
766   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
767   if (yy != xx) {
768     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
769   } else {
770     y = x;
771   }
772   if (zz != yy) {
773     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
774     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
775     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
776   } else {
777     z = y;
778   }
779 
780   v     = a->a;
781   xb = x;
782 
783   for (i=0; i<mbs; i++) {
784     n  = ai[1] - ai[0]; /* length of i_th block row of A */
785     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
786     ib = aj + *ai;
787     jmin = 0;
788     if (*ib == i){      /* (diag of A)*x */
789       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
790       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
791       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
792       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
793       v += 16; jmin++;
794     }
795     for (j=jmin; j<n; j++) {
796       /* (strict lower triangular part of A)*x  */
797       cval       = ib[j]*4;
798       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
799       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
800       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
801       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
802       /* (strict upper triangular part of A)*x  */
803       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
804       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
805       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
806       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
807       v  += 16;
808     }
809     xb +=4; ai++;
810   }
811 
812   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
813   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
814   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
815 
816   PetscLogFlops(32*(a->s_nz*2 - A->m));
817   PetscFunctionReturn(0);
818 }
819 
820 #undef __FUNCT__
821 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
822 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
823 {
824   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
825   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
826   MatScalar       *v;
827   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
828 
829   PetscFunctionBegin;
830   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
831   if (yy != xx) {
832     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
833   } else {
834     y = x;
835   }
836   if (zz != yy) {
837     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
838     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
839     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
840   } else {
841     z = y;
842   }
843 
844   v     = a->a;
845   xb = x;
846 
847   for (i=0; i<mbs; i++) {
848     n  = ai[1] - ai[0]; /* length of i_th block row of A */
849     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
850     ib = aj + *ai;
851     jmin = 0;
852     if (*ib == i){      /* (diag of A)*x */
853       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
854       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
855       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
856       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
857       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
858       v += 25; jmin++;
859     }
860     for (j=jmin; j<n; j++) {
861       /* (strict lower triangular part of A)*x  */
862       cval       = ib[j]*5;
863       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
864       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
865       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
866       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
867       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
868       /* (strict upper triangular part of A)*x  */
869       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
870       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
871       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
872       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
873       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
874       v  += 25;
875     }
876     xb +=5; ai++;
877   }
878 
879   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
880   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
881   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
882 
883   PetscLogFlops(50*(a->s_nz*2 - A->m));
884   PetscFunctionReturn(0);
885 }
886 #undef __FUNCT__
887 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
888 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
889 {
890   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
891   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
892   MatScalar       *v;
893   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
894 
895   PetscFunctionBegin;
896   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
897   if (yy != xx) {
898     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
899   } else {
900     y = x;
901   }
902   if (zz != yy) {
903     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
904     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
905     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
906   } else {
907     z = y;
908   }
909 
910   v     = a->a;
911   xb = x;
912 
913   for (i=0; i<mbs; i++) {
914     n  = ai[1] - ai[0]; /* length of i_th block row of A */
915     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
916     ib = aj + *ai;
917     jmin = 0;
918     if (*ib == i){     /* (diag of A)*x */
919       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
920       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
921       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
922       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
923       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
924       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
925       v += 36; jmin++;
926     }
927     for (j=jmin; j<n; j++) {
928       /* (strict lower triangular part of A)*x  */
929       cval       = ib[j]*6;
930       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
931       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
932       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
933       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
934       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
935       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
936       /* (strict upper triangular part of A)*x  */
937       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
938       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
939       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
940       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
941       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
942       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
943       v  += 36;
944     }
945     xb +=6; ai++;
946   }
947 
948   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
949   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
950   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
951 
952   PetscLogFlops(72*(a->s_nz*2 - A->m));
953   PetscFunctionReturn(0);
954 }
955 
956 #undef __FUNCT__
957 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
958 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
959 {
960   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
961   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
962   MatScalar       *v;
963   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
964 
965   PetscFunctionBegin;
966   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
967   if (yy != xx) {
968     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
969   } else {
970     y = x;
971   }
972   if (zz != yy) {
973     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
974     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
975     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
976   } else {
977     z = y;
978   }
979 
980   v     = a->a;
981   xb = x;
982 
983   for (i=0; i<mbs; i++) {
984     n  = ai[1] - ai[0]; /* length of i_th block row of A */
985     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
986     ib = aj + *ai;
987     jmin = 0;
988     if (*ib == i){     /* (diag of A)*x */
989       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
990       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
991       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
992       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
993       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
994       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
995       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
996       v += 49; jmin++;
997     }
998     for (j=jmin; j<n; j++) {
999       /* (strict lower triangular part of A)*x  */
1000       cval       = ib[j]*7;
1001       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1002       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1003       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1004       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1005       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1006       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1007       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1008       /* (strict upper triangular part of A)*x  */
1009       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1010       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1011       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1012       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1013       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1014       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1015       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1016       v  += 49;
1017     }
1018     xb +=7; ai++;
1019   }
1020 
1021   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1022   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1023   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1024 
1025   PetscLogFlops(98*(a->s_nz*2 - A->m));
1026   PetscFunctionReturn(0);
1027 }
1028 
1029 #undef __FUNCT__
1030 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1031 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1032 {
1033   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1034   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1035   MatScalar       *v;
1036   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1037   int             ncols,k;
1038 
1039   PetscFunctionBegin;
1040   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1041   if (yy != xx) {
1042     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1043   } else {
1044     y = x;
1045   }
1046   if (zz != yy) {
1047     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1048     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1049     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1050   } else {
1051     z = y;
1052   }
1053 
1054   aj   = a->j;
1055   v    = a->a;
1056   ii   = a->i;
1057 
1058   if (!a->mult_work) {
1059     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1060   }
1061   work = a->mult_work;
1062 
1063 
1064   for (i=0; i<mbs; i++) {
1065     n     = ii[1] - ii[0]; ncols = n*bs;
1066     workt = work; idx=aj+ii[0];
1067 
1068     /* upper triangular part */
1069     for (j=0; j<n; j++) {
1070       xb = x_ptr + bs*(*idx++);
1071       for (k=0; k<bs; k++) workt[k] = xb[k];
1072       workt += bs;
1073     }
1074     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1075     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1076 
1077     /* strict lower triangular part */
1078     idx = aj+ii[0];
1079     if (*idx == i){
1080       ncols -= bs; v += bs2; idx++; n--;
1081     }
1082     if (ncols > 0){
1083       workt = work;
1084       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1085       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1086       for (j=0; j<n; j++) {
1087         zb = z_ptr + bs*(*idx++);
1088         /* idx++; */
1089         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1090         workt += bs;
1091       }
1092     }
1093 
1094     x += bs; v += n*bs2; z += bs; ii++;
1095   }
1096 
1097   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1098   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1099   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1100 
1101   PetscLogFlops(2*(a->s_nz*2 - A->m));
1102   PetscFunctionReturn(0);
1103 }
1104 
1105 #undef __FUNCT__
1106 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1107 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1108 {
1109   PetscFunctionBegin;
1110   SETERRQ(1,"Matrix is symmetric. Call MatMult().");
1111   /* PetscFunctionReturn(0); */
1112 }
1113 
1114 #undef __FUNCT__
1115 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1116 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1117 
1118 {
1119   PetscFunctionBegin;
1120   SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
1121   /* PetscFunctionReturn(0); */
1122 }
1123 
1124 #undef __FUNCT__
1125 #define __FUNCT__ "MatScale_SeqSBAIJ"
1126 int MatScale_SeqSBAIJ(PetscScalar *alpha,Mat inA)
1127 {
1128   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1129   int         one = 1,totalnz = a->bs2*a->s_nz;
1130 
1131   PetscFunctionBegin;
1132   BLscal_(&totalnz,alpha,a->a,&one);
1133   PetscLogFlops(totalnz);
1134   PetscFunctionReturn(0);
1135 }
1136 
1137 #undef __FUNCT__
1138 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1139 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1140 {
1141   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1142   MatScalar   *v = a->a;
1143   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1144   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1145   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1146 
1147   PetscFunctionBegin;
1148   if (type == NORM_FROBENIUS) {
1149     for (k=0; k<mbs; k++){
1150       jmin = a->i[k]; jmax = a->i[k+1];
1151       col  = aj + jmin;
1152       if (*col == k){         /* diagonal block */
1153         for (i=0; i<bs2; i++){
1154 #if defined(PETSC_USE_COMPLEX)
1155           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1156 #else
1157           sum_diag += (*v)*(*v); v++;
1158 #endif
1159         }
1160         jmin++;
1161       }
1162       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1163         for (i=0; i<bs2; i++){
1164 #if defined(PETSC_USE_COMPLEX)
1165           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1166 #else
1167           sum_off += (*v)*(*v); v++;
1168 #endif
1169         }
1170       }
1171     }
1172     *norm = sqrt(sum_diag + 2*sum_off);
1173 
1174   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1175     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1176     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1177     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1178     for (i=0; i<mbs; i++) {
1179       jl[i] = mbs; il[0] = 0;
1180     }
1181 
1182     *norm = 0.0;
1183     for (k=0; k<mbs; k++) { /* k_th block row */
1184       for (j=0; j<bs; j++) sum[j]=0.0;
1185 
1186       /*-- col sum --*/
1187       i = jl[k]; /* first |A(i,k)| to be added */
1188       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1189                   at step k */
1190       while (i<mbs){
1191         nexti = jl[i];  /* next block row to be added */
1192         ik    = il[i];  /* block index of A(i,k) in the array a */
1193         for (j=0; j<bs; j++){
1194           v = a->a + ik*bs2 + j*bs;
1195           for (k1=0; k1<bs; k1++) {
1196             sum[j] += PetscAbsScalar(*v); v++;
1197           }
1198         }
1199         /* update il, jl */
1200         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1201         jmax = a->i[i+1];
1202         if (jmin < jmax){
1203           il[i] = jmin;
1204           j   = a->j[jmin];
1205           jl[i] = jl[j]; jl[j]=i;
1206         }
1207         i = nexti;
1208       }
1209 
1210       /*-- row sum --*/
1211       jmin = a->i[k]; jmax = a->i[k+1];
1212       for (i=jmin; i<jmax; i++) {
1213         for (j=0; j<bs; j++){
1214           v = a->a + i*bs2 + j;
1215           for (k1=0; k1<bs; k1++){
1216             sum[j] += PetscAbsScalar(*v);
1217             v   += bs;
1218           }
1219         }
1220       }
1221       /* add k_th block row to il, jl */
1222       col = aj+jmin;
1223       if (*col == k) jmin++;
1224       if (jmin < jmax){
1225         il[k] = jmin;
1226         j   = a->j[jmin];
1227         jl[k] = jl[j]; jl[j] = k;
1228       }
1229       for (j=0; j<bs; j++){
1230         if (sum[j] > *norm) *norm = sum[j];
1231       }
1232     }
1233     ierr = PetscFree(il);CHKERRQ(ierr);
1234     ierr = PetscFree(jl);CHKERRQ(ierr);
1235     ierr = PetscFree(sum);CHKERRQ(ierr);
1236   } else {
1237     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1238   }
1239   PetscFunctionReturn(0);
1240 }
1241 
1242 #undef __FUNCT__
1243 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1244 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1245 {
1246   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1247   int          ierr;
1248   PetscTruth   flag;
1249 
1250   PetscFunctionBegin;
1251   ierr = PetscTypeCompare((PetscObject)A,MATSEQBAIJ,&flag);CHKERRQ(ierr);
1252   if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1253 
1254   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1255   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1256     *flg = PETSC_FALSE;
1257     PetscFunctionReturn(0);
1258   }
1259 
1260   /* if the a->i are the same */
1261   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1262   if (*flg == PETSC_FALSE) {
1263     PetscFunctionReturn(0);
1264   }
1265 
1266   /* if a->j are the same */
1267   ierr = PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);CHKERRQ(ierr);
1268   if (*flg == PETSC_FALSE) {
1269     PetscFunctionReturn(0);
1270   }
1271   /* if a->a are the same */
1272   ierr = PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1273   PetscFunctionReturn(0);
1274 
1275 }
1276 
1277 #undef __FUNCT__
1278 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1279 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1280 {
1281   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1282   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1283   PetscScalar  *x,zero = 0.0;
1284   MatScalar    *aa,*aa_j;
1285 
1286   PetscFunctionBegin;
1287   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1288   bs   = a->bs;
1289   aa   = a->a;
1290   ai   = a->i;
1291   aj   = a->j;
1292   ambs = a->mbs;
1293   bs2  = a->bs2;
1294 
1295   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1296   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1297   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1298   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1299   for (i=0; i<ambs; i++) {
1300     j=ai[i];
1301     if (aj[j] == i) {             /* if this is a diagonal element */
1302       row  = i*bs;
1303       aa_j = aa + j*bs2;
1304       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1305     }
1306   }
1307   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1308   PetscFunctionReturn(0);
1309 }
1310 
1311 #undef __FUNCT__
1312 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1313 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1314 {
1315   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1316   PetscScalar  *l,*r,x,*li,*ri;
1317   MatScalar    *aa,*v;
1318   int          ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1319 
1320   PetscFunctionBegin;
1321   ai  = a->i;
1322   aj  = a->j;
1323   aa  = a->a;
1324   m   = A->m;
1325   bs  = a->bs;
1326   mbs = a->mbs;
1327   bs2 = a->bs2;
1328 
1329   if (ll != rr) {
1330     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1331   }
1332   if (ll) {
1333     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1334     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1335     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1336     for (i=0; i<mbs; i++) { /* for each block row */
1337       M  = ai[i+1] - ai[i];
1338       li = l + i*bs;
1339       v  = aa + bs2*ai[i];
1340       for (j=0; j<M; j++) { /* for each block */
1341         for (k=0; k<bs2; k++) {
1342           (*v++) *= li[k%bs];
1343         }
1344 #ifdef CONT
1345         /* will be used to replace the above loop */
1346         ri = l + bs*aj[ai[i]+j];
1347         for (k=0; k<bs; k++) { /* column value */
1348           x = ri[k];
1349           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1350         }
1351 #endif
1352 
1353       }
1354     }
1355     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1356     PetscLogFlops(2*a->s_nz);
1357   }
1358   /* will be deleted */
1359   if (rr) {
1360     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1361     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1362     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1363     for (i=0; i<mbs; i++) { /* for each block row */
1364       M  = ai[i+1] - ai[i];
1365       v  = aa + bs2*ai[i];
1366       for (j=0; j<M; j++) { /* for each block */
1367         ri = r + bs*aj[ai[i]+j];
1368         for (k=0; k<bs; k++) {
1369           x = ri[k];
1370           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1371         }
1372       }
1373     }
1374     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1375     PetscLogFlops(a->s_nz);
1376   }
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 #undef __FUNCT__
1381 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1382 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1383 {
1384   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1385 
1386   PetscFunctionBegin;
1387   info->rows_global    = (double)A->m;
1388   info->columns_global = (double)A->m;
1389   info->rows_local     = (double)A->m;
1390   info->columns_local  = (double)A->m;
1391   info->block_size     = a->bs2;
1392   info->nz_allocated   = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1393   info->nz_used        = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1394   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1395   info->assemblies   = A->num_ass;
1396   info->mallocs      = a->reallocs;
1397   info->memory       = A->mem;
1398   if (A->factor) {
1399     info->fill_ratio_given  = A->info.fill_ratio_given;
1400     info->fill_ratio_needed = A->info.fill_ratio_needed;
1401     info->factor_mallocs    = A->info.factor_mallocs;
1402   } else {
1403     info->fill_ratio_given  = 0;
1404     info->fill_ratio_needed = 0;
1405     info->factor_mallocs    = 0;
1406   }
1407   PetscFunctionReturn(0);
1408 }
1409 
1410 
1411 #undef __FUNCT__
1412 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1413 int MatZeroEntries_SeqSBAIJ(Mat A)
1414 {
1415   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1416   int         ierr;
1417 
1418   PetscFunctionBegin;
1419   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1420   PetscFunctionReturn(0);
1421 }
1422 
1423 #undef __FUNCT__
1424 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1425 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1426 {
1427   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1428   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1429   PetscReal    atmp;
1430   MatScalar    *aa;
1431   PetscScalar  zero = 0.0,*x;
1432   int          ncols,brow,bcol,krow,kcol;
1433 
1434   PetscFunctionBegin;
1435   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1436   bs   = a->bs;
1437   aa   = a->a;
1438   ai   = a->i;
1439   aj   = a->j;
1440   mbs = a->mbs;
1441 
1442   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1443   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1444   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1445   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1446   for (i=0; i<mbs; i++) {
1447     ncols = ai[1] - ai[0]; ai++;
1448     brow  = bs*i;
1449     for (j=0; j<ncols; j++){
1450       bcol = bs*(*aj);
1451       for (kcol=0; kcol<bs; kcol++){
1452         col = bcol + kcol;      /* col index */
1453         for (krow=0; krow<bs; krow++){
1454           atmp = PetscAbsScalar(*aa); aa++;
1455           row = brow + krow;    /* row index */
1456           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1457           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1458           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1459         }
1460       }
1461       aj++;
1462     }
1463   }
1464   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1465   PetscFunctionReturn(0);
1466 }
1467