xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 7d6bfa3b9d7db0ccd4cc481237114ca8dbb0dbff)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/inline/spops.h"
5 #include "../src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "../src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
16   const PetscInt *idx;
17   PetscBT        table,table0;
18 
19   PetscFunctionBegin;
20   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
21   mbs = a->mbs;
22   ai  = a->i;
23   aj  = a->j;
24   bs  = A->rmap->bs;
25   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
26   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
27   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
28   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) { /* for each is */
31     isz  = 0;
32     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
33 
34     /* Extract the indices, assume there can be duplicate entries */
35     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
36     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
37 
38     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
39     bcol_max = 0;
40     for (j=0; j<n ; ++j){
41       brow = idx[j]/bs; /* convert the indices into block indices */
42       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if(!PetscBTLookupSet(table,brow)) {
44         nidx[isz++] = brow;
45         if (bcol_max < brow) bcol_max = brow;
46       }
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++){ /* for each overlap */
53       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
54       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
55       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
56 
57       n = isz;  /* length of the updated is[i] */
58       for (brow=0; brow<mbs; brow++){
59         start = ai[brow]; end   = ai[brow+1];
60         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
61           for (l = start; l<end ; l++){
62             bcol = aj[l];
63             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l<end ; l++){
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table0,bcol)){
72               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79 
80     /* expand the Index Set */
81     for (j=0; j<isz; j++) {
82       for (k=0; k<bs; k++)
83         nidx2[j*bs+k] = nidx[j]*bs+k;
84     }
85     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
86   }
87   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
88   ierr = PetscFree(nidx);CHKERRQ(ierr);
89   ierr = PetscFree(nidx2);CHKERRQ(ierr);
90   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
91   PetscFunctionReturn(0);
92 }
93 
94 #undef __FUNCT__
95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
97 {
98   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
99   PetscErrorCode ierr;
100   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
101   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
102   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
103   const PetscInt *irow,*aj = a->j,*ai = a->i;
104   MatScalar      *mat_a;
105   Mat            C;
106   PetscTruth     flag,sorted;
107 
108   PetscFunctionBegin;
109   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
110   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
111   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
117   ssmap = smap;
118   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
119   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
120   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
121   /* determine lens of each row */
122   for (i=0; i<nrows; i++) {
123     kstart  = ai[irow[i]];
124     kend    = kstart + a->ilen[irow[i]];
125     lens[i] = 0;
126       for (k=kstart; k<kend; k++) {
127         if (ssmap[aj[k]]) {
128           lens[i]++;
129         }
130       }
131     }
132   /* Create and fill new matrix */
133   if (scall == MAT_REUSE_MATRIX) {
134     c = (Mat_SeqSBAIJ *)((*B)->data);
135 
136     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
137     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
138     if (!flag) {
139       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
140     }
141     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
142     C = *B;
143   } else {
144     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
145     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
146     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
147     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
148   }
149   c = (Mat_SeqSBAIJ *)(C->data);
150   for (i=0; i<nrows; i++) {
151     row    = irow[i];
152     kstart = ai[row];
153     kend   = kstart + a->ilen[row];
154     mat_i  = c->i[i];
155     mat_j  = c->j + mat_i;
156     mat_a  = c->a + mat_i*bs2;
157     mat_ilen = c->ilen + i;
158     for (k=kstart; k<kend; k++) {
159       if ((tcol=ssmap[a->j[k]])) {
160         *mat_j++ = tcol - 1;
161         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
162         mat_a   += bs2;
163         (*mat_ilen)++;
164       }
165     }
166   }
167 
168   /* Free work space */
169   ierr = PetscFree(smap);CHKERRQ(ierr);
170   ierr = PetscFree(lens);CHKERRQ(ierr);
171   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
173 
174   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
175   *B = C;
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
182 {
183   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
184   IS             is1;
185   PetscErrorCode ierr;
186   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
187   const PetscInt *irow;
188 
189   PetscFunctionBegin;
190   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
191 
192   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
193   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
194 
195   /* Verify if the indices corespond to each element in a block
196    and form the IS with compressed IS */
197   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr);
198   iary = vary + a->mbs;
199   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
200   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
201 
202   count = 0;
203   for (i=0; i<a->mbs; i++) {
204     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
205     if (vary[i]==bs) iary[count++] = i;
206   }
207   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
208 
209   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
210   ierr = PetscFree(vary);CHKERRQ(ierr);
211 
212   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
213   ISDestroy(is1);
214   PetscFunctionReturn(0);
215 }
216 
217 #undef __FUNCT__
218 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
219 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
220 {
221   PetscErrorCode ierr;
222   PetscInt       i;
223 
224   PetscFunctionBegin;
225   if (scall == MAT_INITIAL_MATRIX) {
226     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
227   }
228 
229   for (i=0; i<n; i++) {
230     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
231   }
232   PetscFunctionReturn(0);
233 }
234 
235 /* -------------------------------------------------------*/
236 /* Should check that shapes of vectors and matrices match */
237 /* -------------------------------------------------------*/
238 #include "petscblaslapack.h"
239 
240 #undef __FUNCT__
241 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
242 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
243 {
244   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
245   PetscScalar    *x,*z,*xb,x1,zero=0.0;
246   MatScalar      *v;
247   PetscErrorCode ierr;
248   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
249   PetscInt       nonzerorow=0;
250 
251   PetscFunctionBegin;
252   ierr = VecSet(zz,zero);CHKERRQ(ierr);
253   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
254   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
255 
256   v  = a->a;
257   xb = x;
258 
259   for (i=0; i<mbs; i++) {
260     n    = ai[1] - ai[0];  /* length of i_th row of A */
261     x1   = *xb++;
262     ib   = aj + *ai++;
263     jmin = 0;
264     nonzerorow += (n>0);
265     /* if we ALWAYS required a diagonal entry then could remove this if test */
266     /* should we use a tmp to hold the accumulated z[i] */
267     if (*ib == i) {      /* (diag of A)*x */
268       z[i] += *v++ * x[*ib++];
269       jmin++;
270     }
271     for (j=jmin; j<n; j++) {
272       cval    = *ib;
273       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
274       z[i]    += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
275     }
276   }
277 
278   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
279   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
280   ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);  /* nz = (nz+m)/2 */
281   PetscFunctionReturn(0);
282 }
283 
284 #undef __FUNCT__
285 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
286 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
287 {
288   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
289   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
290   MatScalar      *v;
291   PetscErrorCode ierr;
292   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
293   PetscInt       nonzerorow=0;
294 
295   PetscFunctionBegin;
296   ierr = VecSet(zz,zero);CHKERRQ(ierr);
297   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
298   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
299 
300   v     = a->a;
301   xb = x;
302 
303   for (i=0; i<mbs; i++) {
304     n  = ai[1] - ai[0]; /* length of i_th block row of A */
305     x1 = xb[0]; x2 = xb[1];
306     ib = aj + *ai;
307     jmin = 0;
308     nonzerorow += (n>0);
309     if (*ib == i){     /* (diag of A)*x */
310       z[2*i]   += v[0]*x1 + v[2]*x2;
311       z[2*i+1] += v[2]*x1 + v[3]*x2;
312       v += 4; jmin++;
313     }
314     for (j=jmin; j<n; j++) {
315       /* (strict lower triangular part of A)*x  */
316       cval       = ib[j]*2;
317       z[cval]     += v[0]*x1 + v[1]*x2;
318       z[cval+1]   += v[2]*x1 + v[3]*x2;
319       /* (strict upper triangular part of A)*x  */
320       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
321       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
322       v  += 4;
323     }
324     xb +=2; ai++;
325   }
326 
327   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
328   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
329   ierr = PetscLogFlops(8*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
330   PetscFunctionReturn(0);
331 }
332 
333 #undef __FUNCT__
334 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
335 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
336 {
337   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
338   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
339   MatScalar      *v;
340   PetscErrorCode ierr;
341   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
342   PetscInt       nonzerorow=0;
343 
344   PetscFunctionBegin;
345   ierr = VecSet(zz,zero);CHKERRQ(ierr);
346   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
347   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
348 
349   v    = a->a;
350   xb   = x;
351 
352   for (i=0; i<mbs; i++) {
353     n  = ai[1] - ai[0]; /* length of i_th block row of A */
354     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
355     ib = aj + *ai;
356     jmin = 0;
357     nonzerorow += (n>0);
358     if (*ib == i){     /* (diag of A)*x */
359       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
360       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
361       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
362       v += 9; jmin++;
363     }
364     for (j=jmin; j<n; j++) {
365       /* (strict lower triangular part of A)*x  */
366       cval       = ib[j]*3;
367       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
368       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
369       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
370       /* (strict upper triangular part of A)*x  */
371       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
372       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
373       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
374       v  += 9;
375     }
376     xb +=3; ai++;
377   }
378 
379   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
380   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
381   ierr = PetscLogFlops(18*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
382   PetscFunctionReturn(0);
383 }
384 
385 #undef __FUNCT__
386 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
387 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
388 {
389   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
390   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
391   MatScalar      *v;
392   PetscErrorCode ierr;
393   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
394   PetscInt       nonzerorow=0;
395 
396   PetscFunctionBegin;
397   ierr = VecSet(zz,zero);CHKERRQ(ierr);
398   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
399   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
400 
401   v     = a->a;
402   xb = x;
403 
404   for (i=0; i<mbs; i++) {
405     n  = ai[1] - ai[0]; /* length of i_th block row of A */
406     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
407     ib = aj + *ai;
408     jmin = 0;
409     nonzerorow += (n>0);
410     if (*ib == i){     /* (diag of A)*x */
411       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
412       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
413       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
414       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
415       v += 16; jmin++;
416     }
417     for (j=jmin; j<n; j++) {
418       /* (strict lower triangular part of A)*x  */
419       cval       = ib[j]*4;
420       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
421       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
422       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
423       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
424       /* (strict upper triangular part of A)*x  */
425       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
426       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
427       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
428       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
429       v  += 16;
430     }
431     xb +=4; ai++;
432   }
433 
434   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
435   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
436   ierr = PetscLogFlops(32*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
437   PetscFunctionReturn(0);
438 }
439 
440 #undef __FUNCT__
441 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
442 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
443 {
444   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
445   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
446   MatScalar      *v;
447   PetscErrorCode ierr;
448   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
449   PetscInt       nonzerorow=0;
450 
451   PetscFunctionBegin;
452   ierr = VecSet(zz,zero);CHKERRQ(ierr);
453   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
454   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
455 
456   v     = a->a;
457   xb = x;
458 
459   for (i=0; i<mbs; i++) {
460     n  = ai[1] - ai[0]; /* length of i_th block row of A */
461     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
462     ib = aj + *ai;
463     jmin = 0;
464     nonzerorow += (n>0);
465     if (*ib == i){      /* (diag of A)*x */
466       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
467       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
468       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
469       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
470       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
471       v += 25; jmin++;
472     }
473     for (j=jmin; j<n; j++) {
474       /* (strict lower triangular part of A)*x  */
475       cval       = ib[j]*5;
476       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
477       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
478       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
479       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
480       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
481       /* (strict upper triangular part of A)*x  */
482       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
483       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
484       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
485       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
486       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
487       v  += 25;
488     }
489     xb +=5; ai++;
490   }
491 
492   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
493   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
494   ierr = PetscLogFlops(50*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 
499 #undef __FUNCT__
500 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
501 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
502 {
503   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
504   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
505   MatScalar      *v;
506   PetscErrorCode ierr;
507   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
508   PetscInt       nonzerorow=0;
509 
510   PetscFunctionBegin;
511   ierr = VecSet(zz,zero);CHKERRQ(ierr);
512   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
513   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
514 
515   v     = a->a;
516   xb = x;
517 
518   for (i=0; i<mbs; i++) {
519     n  = ai[1] - ai[0]; /* length of i_th block row of A */
520     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
521     ib = aj + *ai;
522     jmin = 0;
523     nonzerorow += (n>0);
524     if (*ib == i){      /* (diag of A)*x */
525       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
526       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
527       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
528       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
529       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
530       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
531       v += 36; jmin++;
532     }
533     for (j=jmin; j<n; j++) {
534       /* (strict lower triangular part of A)*x  */
535       cval       = ib[j]*6;
536       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
537       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
538       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
539       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
540       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
541       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
542       /* (strict upper triangular part of A)*x  */
543       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
544       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
545       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
546       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
547       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
548       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
549       v  += 36;
550     }
551     xb +=6; ai++;
552   }
553 
554   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
555   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
556   ierr = PetscLogFlops(72*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
557   PetscFunctionReturn(0);
558 }
559 #undef __FUNCT__
560 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
561 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
562 {
563   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
564   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
565   MatScalar      *v;
566   PetscErrorCode ierr;
567   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
568   PetscInt       nonzerorow=0;
569 
570   PetscFunctionBegin;
571   ierr = VecSet(zz,zero);CHKERRQ(ierr);
572   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
573   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
574 
575   v     = a->a;
576   xb = x;
577 
578   for (i=0; i<mbs; i++) {
579     n  = ai[1] - ai[0]; /* length of i_th block row of A */
580     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
581     ib = aj + *ai;
582     jmin = 0;
583     nonzerorow += (n>0);
584     if (*ib == i){      /* (diag of A)*x */
585       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
586       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
587       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
588       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
589       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
590       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
591       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
592       v += 49; jmin++;
593     }
594     for (j=jmin; j<n; j++) {
595       /* (strict lower triangular part of A)*x  */
596       cval       = ib[j]*7;
597       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
598       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
599       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
600       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
601       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
602       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
603       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
604       /* (strict upper triangular part of A)*x  */
605       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
606       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
607       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
608       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
609       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
610       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
611       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
612       v  += 49;
613     }
614     xb +=7; ai++;
615   }
616   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
617   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
618   ierr = PetscLogFlops(98*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
619   PetscFunctionReturn(0);
620 }
621 
622 /*
623     This will not work with MatScalar == float because it calls the BLAS
624 */
625 #undef __FUNCT__
626 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
627 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
628 {
629   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
630   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
631   MatScalar      *v;
632   PetscErrorCode ierr;
633   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
634   PetscInt       nonzerorow=0;
635 
636   PetscFunctionBegin;
637   ierr = VecSet(zz,zero);CHKERRQ(ierr);
638   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
639   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
640 
641   aj   = a->j;
642   v    = a->a;
643   ii   = a->i;
644 
645   if (!a->mult_work) {
646     ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
647   }
648   work = a->mult_work;
649 
650   for (i=0; i<mbs; i++) {
651     n     = ii[1] - ii[0]; ncols = n*bs;
652     workt = work; idx=aj+ii[0];
653     nonzerorow += (n>0);
654 
655     /* upper triangular part */
656     for (j=0; j<n; j++) {
657       xb = x_ptr + bs*(*idx++);
658       for (k=0; k<bs; k++) workt[k] = xb[k];
659       workt += bs;
660     }
661     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
662     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
663 
664     /* strict lower triangular part */
665     idx = aj+ii[0];
666     if (*idx == i){
667       ncols -= bs; v += bs2; idx++; n--;
668     }
669 
670     if (ncols > 0){
671       workt = work;
672       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
673       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
674       for (j=0; j<n; j++) {
675         zb = z_ptr + bs*(*idx++);
676         for (k=0; k<bs; k++) zb[k] += workt[k] ;
677         workt += bs;
678       }
679     }
680     x += bs; v += n*bs2; z += bs; ii++;
681   }
682 
683   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
684   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
685   ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
686   PetscFunctionReturn(0);
687 }
688 
689 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
690 #undef __FUNCT__
691 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
692 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
693 {
694   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
695   PetscScalar    *x,*z,*xb,x1;
696   MatScalar      *v;
697   PetscErrorCode ierr;
698   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
699   PetscInt       nonzerorow=0;
700 
701   PetscFunctionBegin;
702   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
703   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
704   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
705   v  = a->a;
706   xb = x;
707 
708   for (i=0; i<mbs; i++) {
709     n  = ai[1] - ai[0];  /* length of i_th row of A */
710     x1 = xb[0];
711     ib = aj + *ai;
712     jmin = 0;
713     nonzerorow += (n>0);
714     if (*ib == i) {            /* (diag of A)*x */
715       z[i] += *v++ * x[*ib++]; jmin++;
716     }
717     for (j=jmin; j<n; j++) {
718       cval    = *ib;
719       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
720       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
721     }
722     xb++; ai++;
723   }
724 
725   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
726   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
727 
728   ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
729   PetscFunctionReturn(0);
730 }
731 
732 #undef __FUNCT__
733 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
734 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
735 {
736   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
737   PetscScalar    *x,*z,*xb,x1,x2;
738   MatScalar      *v;
739   PetscErrorCode ierr;
740   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
741   PetscInt       nonzerorow=0;
742 
743   PetscFunctionBegin;
744   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
745   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
746   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
747 
748   v  = a->a;
749   xb = x;
750 
751   for (i=0; i<mbs; i++) {
752     n  = ai[1] - ai[0]; /* length of i_th block row of A */
753     x1 = xb[0]; x2 = xb[1];
754     ib = aj + *ai;
755     jmin = 0;
756     nonzerorow += (n>0);
757     if (*ib == i){      /* (diag of A)*x */
758       z[2*i]   += v[0]*x1 + v[2]*x2;
759       z[2*i+1] += v[2]*x1 + v[3]*x2;
760       v += 4; jmin++;
761     }
762     for (j=jmin; j<n; j++) {
763       /* (strict lower triangular part of A)*x  */
764       cval       = ib[j]*2;
765       z[cval]     += v[0]*x1 + v[1]*x2;
766       z[cval+1]   += v[2]*x1 + v[3]*x2;
767       /* (strict upper triangular part of A)*x  */
768       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
769       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
770       v  += 4;
771     }
772     xb +=2; ai++;
773   }
774   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
775   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
776 
777   ierr = PetscLogFlops(4*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
778   PetscFunctionReturn(0);
779 }
780 
781 #undef __FUNCT__
782 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
783 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
784 {
785   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
786   PetscScalar    *x,*z,*xb,x1,x2,x3;
787   MatScalar      *v;
788   PetscErrorCode ierr;
789   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
790   PetscInt       nonzerorow=0;
791 
792   PetscFunctionBegin;
793   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
794   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
795   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
796 
797   v     = a->a;
798   xb = x;
799 
800   for (i=0; i<mbs; i++) {
801     n  = ai[1] - ai[0]; /* length of i_th block row of A */
802     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
803     ib = aj + *ai;
804     jmin = 0;
805     nonzerorow += (n>0);
806     if (*ib == i){     /* (diag of A)*x */
807      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
808      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
809      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
810      v += 9; jmin++;
811     }
812     for (j=jmin; j<n; j++) {
813       /* (strict lower triangular part of A)*x  */
814       cval       = ib[j]*3;
815       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
816       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
817       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
818       /* (strict upper triangular part of A)*x  */
819       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
820       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
821       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
822       v  += 9;
823     }
824     xb +=3; ai++;
825   }
826 
827   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
828   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
829 
830   ierr = PetscLogFlops(18*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
831   PetscFunctionReturn(0);
832 }
833 
834 #undef __FUNCT__
835 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
836 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
837 {
838   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
839   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
840   MatScalar      *v;
841   PetscErrorCode ierr;
842   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
843   PetscInt       nonzerorow=0;
844 
845   PetscFunctionBegin;
846   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
847   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
848   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
849 
850   v     = a->a;
851   xb = x;
852 
853   for (i=0; i<mbs; i++) {
854     n  = ai[1] - ai[0]; /* length of i_th block row of A */
855     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
856     ib = aj + *ai;
857     jmin = 0;
858     nonzerorow += (n>0);
859     if (*ib == i){      /* (diag of A)*x */
860       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
861       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
862       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
863       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
864       v += 16; jmin++;
865     }
866     for (j=jmin; j<n; j++) {
867       /* (strict lower triangular part of A)*x  */
868       cval       = ib[j]*4;
869       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
870       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
871       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
872       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
873       /* (strict upper triangular part of A)*x  */
874       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
875       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
876       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
877       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
878       v  += 16;
879     }
880     xb +=4; ai++;
881   }
882 
883   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
884   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
885 
886   ierr = PetscLogFlops(32*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
887   PetscFunctionReturn(0);
888 }
889 
890 #undef __FUNCT__
891 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
892 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
893 {
894   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
895   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
896   MatScalar      *v;
897   PetscErrorCode ierr;
898   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
899   PetscInt       nonzerorow=0;
900 
901   PetscFunctionBegin;
902   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
903   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
904   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
905 
906   v     = a->a;
907   xb = x;
908 
909   for (i=0; i<mbs; i++) {
910     n  = ai[1] - ai[0]; /* length of i_th block row of A */
911     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
912     ib = aj + *ai;
913     jmin = 0;
914     nonzerorow += (n>0);
915     if (*ib == i){      /* (diag of A)*x */
916       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
917       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
918       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
919       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
920       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
921       v += 25; jmin++;
922     }
923     for (j=jmin; j<n; j++) {
924       /* (strict lower triangular part of A)*x  */
925       cval       = ib[j]*5;
926       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
927       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
928       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
929       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
930       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
931       /* (strict upper triangular part of A)*x  */
932       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
933       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
934       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
935       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
936       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
937       v  += 25;
938     }
939     xb +=5; ai++;
940   }
941 
942   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
943   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
944 
945   ierr = PetscLogFlops(50*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
946   PetscFunctionReturn(0);
947 }
948 #undef __FUNCT__
949 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
950 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
951 {
952   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
953   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
954   MatScalar      *v;
955   PetscErrorCode ierr;
956   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
957   PetscInt       nonzerorow=0;
958 
959   PetscFunctionBegin;
960   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
961   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
962   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
963 
964   v     = a->a;
965   xb = x;
966 
967   for (i=0; i<mbs; i++) {
968     n  = ai[1] - ai[0]; /* length of i_th block row of A */
969     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
970     ib = aj + *ai;
971     jmin = 0;
972     nonzerorow += (n>0);
973     if (*ib == i){     /* (diag of A)*x */
974       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
975       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
976       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
977       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
978       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
979       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
980       v += 36; jmin++;
981     }
982     for (j=jmin; j<n; j++) {
983       /* (strict lower triangular part of A)*x  */
984       cval       = ib[j]*6;
985       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
986       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
987       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
988       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
989       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
990       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
991       /* (strict upper triangular part of A)*x  */
992       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
993       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
994       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
995       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
996       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
997       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
998       v  += 36;
999     }
1000     xb +=6; ai++;
1001   }
1002 
1003   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1004   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1005 
1006   ierr = PetscLogFlops(72*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
1007   PetscFunctionReturn(0);
1008 }
1009 
1010 #undef __FUNCT__
1011 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1012 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1013 {
1014   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1015   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1016   MatScalar      *v;
1017   PetscErrorCode ierr;
1018   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
1019   PetscInt       nonzerorow=0;
1020 
1021   PetscFunctionBegin;
1022   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1023   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1024   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1025 
1026   v     = a->a;
1027   xb = x;
1028 
1029   for (i=0; i<mbs; i++) {
1030     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1031     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1032     ib = aj + *ai;
1033     jmin = 0;
1034     nonzerorow += (n>0);
1035     if (*ib == i){     /* (diag of A)*x */
1036       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1037       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1038       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1039       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1040       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1041       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1042       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1043       v += 49; jmin++;
1044     }
1045     for (j=jmin; j<n; j++) {
1046       /* (strict lower triangular part of A)*x  */
1047       cval       = ib[j]*7;
1048       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1049       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1050       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1051       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1052       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1053       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1054       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1055       /* (strict upper triangular part of A)*x  */
1056       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1057       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1058       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1059       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1060       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1061       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1062       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1063       v  += 49;
1064     }
1065     xb +=7; ai++;
1066   }
1067 
1068   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1069   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1070 
1071   ierr = PetscLogFlops(98*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
1072   PetscFunctionReturn(0);
1073 }
1074 
1075 #undef __FUNCT__
1076 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1077 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1078 {
1079   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1080   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1081   MatScalar      *v;
1082   PetscErrorCode ierr;
1083   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1084   PetscInt       nonzerorow=0;
1085 
1086   PetscFunctionBegin;
1087   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1088   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1089   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1090 
1091   aj   = a->j;
1092   v    = a->a;
1093   ii   = a->i;
1094 
1095   if (!a->mult_work) {
1096     ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1097   }
1098   work = a->mult_work;
1099 
1100 
1101   for (i=0; i<mbs; i++) {
1102     n     = ii[1] - ii[0]; ncols = n*bs;
1103     workt = work; idx=aj+ii[0];
1104     nonzerorow += (n>0);
1105 
1106     /* upper triangular part */
1107     for (j=0; j<n; j++) {
1108       xb = x_ptr + bs*(*idx++);
1109       for (k=0; k<bs; k++) workt[k] = xb[k];
1110       workt += bs;
1111     }
1112     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1113     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1114 
1115     /* strict lower triangular part */
1116     idx = aj+ii[0];
1117     if (*idx == i){
1118       ncols -= bs; v += bs2; idx++; n--;
1119     }
1120     if (ncols > 0){
1121       workt = work;
1122       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1123       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1124       for (j=0; j<n; j++) {
1125         zb = z_ptr + bs*(*idx++);
1126         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1127         workt += bs;
1128       }
1129     }
1130 
1131     x += bs; v += n*bs2; z += bs; ii++;
1132   }
1133 
1134   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1135   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1136 
1137   ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
1138   PetscFunctionReturn(0);
1139 }
1140 
1141 #undef __FUNCT__
1142 #define __FUNCT__ "MatScale_SeqSBAIJ"
1143 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1144 {
1145   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1146   PetscScalar    oalpha = alpha;
1147   PetscErrorCode ierr;
1148   PetscBLASInt   one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);
1149 
1150   PetscFunctionBegin;
1151   BLASscal_(&totalnz,&oalpha,a->a,&one);
1152   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1153   PetscFunctionReturn(0);
1154 }
1155 
1156 #undef __FUNCT__
1157 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1158 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1159 {
1160   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1161   MatScalar      *v = a->a;
1162   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1163   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1164   PetscErrorCode ierr;
1165   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1166 
1167   PetscFunctionBegin;
1168   if (type == NORM_FROBENIUS) {
1169     for (k=0; k<mbs; k++){
1170       jmin = a->i[k]; jmax = a->i[k+1];
1171       col  = aj + jmin;
1172       if (*col == k){         /* diagonal block */
1173         for (i=0; i<bs2; i++){
1174 #if defined(PETSC_USE_COMPLEX)
1175           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1176 #else
1177           sum_diag += (*v)*(*v); v++;
1178 #endif
1179         }
1180         jmin++;
1181       }
1182       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1183         for (i=0; i<bs2; i++){
1184 #if defined(PETSC_USE_COMPLEX)
1185           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1186 #else
1187           sum_off += (*v)*(*v); v++;
1188 #endif
1189         }
1190       }
1191     }
1192     *norm = sqrt(sum_diag + 2*sum_off);
1193   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1194     ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr);
1195     jl   = il + mbs;
1196     sum  = (PetscReal*)(jl + mbs);
1197     for (i=0; i<mbs; i++) jl[i] = mbs;
1198     il[0] = 0;
1199 
1200     *norm = 0.0;
1201     for (k=0; k<mbs; k++) { /* k_th block row */
1202       for (j=0; j<bs; j++) sum[j]=0.0;
1203       /*-- col sum --*/
1204       i = jl[k]; /* first |A(i,k)| to be added */
1205       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1206                   at step k */
1207       while (i<mbs){
1208         nexti = jl[i];  /* next block row to be added */
1209         ik    = il[i];  /* block index of A(i,k) in the array a */
1210         for (j=0; j<bs; j++){
1211           v = a->a + ik*bs2 + j*bs;
1212           for (k1=0; k1<bs; k1++) {
1213             sum[j] += PetscAbsScalar(*v); v++;
1214           }
1215         }
1216         /* update il, jl */
1217         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1218         jmax = a->i[i+1];
1219         if (jmin < jmax){
1220           il[i] = jmin;
1221           j   = a->j[jmin];
1222           jl[i] = jl[j]; jl[j]=i;
1223         }
1224         i = nexti;
1225       }
1226       /*-- row sum --*/
1227       jmin = a->i[k]; jmax = a->i[k+1];
1228       for (i=jmin; i<jmax; i++) {
1229         for (j=0; j<bs; j++){
1230           v = a->a + i*bs2 + j;
1231           for (k1=0; k1<bs; k1++){
1232             sum[j] += PetscAbsScalar(*v); v += bs;
1233           }
1234         }
1235       }
1236       /* add k_th block row to il, jl */
1237       col = aj+jmin;
1238       if (*col == k) jmin++;
1239       if (jmin < jmax){
1240         il[k] = jmin;
1241         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1242       }
1243       for (j=0; j<bs; j++){
1244         if (sum[j] > *norm) *norm = sum[j];
1245       }
1246     }
1247     ierr = PetscFree(il);CHKERRQ(ierr);
1248   } else {
1249     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1250   }
1251   PetscFunctionReturn(0);
1252 }
1253 
1254 #undef __FUNCT__
1255 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1256 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1257 {
1258   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1259   PetscErrorCode ierr;
1260 
1261   PetscFunctionBegin;
1262 
1263   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1264   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1265     *flg = PETSC_FALSE;
1266     PetscFunctionReturn(0);
1267   }
1268 
1269   /* if the a->i are the same */
1270   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1271   if (!*flg) {
1272     PetscFunctionReturn(0);
1273   }
1274 
1275   /* if a->j are the same */
1276   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1277   if (!*flg) {
1278     PetscFunctionReturn(0);
1279   }
1280   /* if a->a are the same */
1281   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1282   PetscFunctionReturn(0);
1283 }
1284 
1285 #undef __FUNCT__
1286 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1287 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1288 {
1289   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1290   PetscErrorCode ierr;
1291   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1292   PetscScalar    *x,zero = 0.0;
1293   MatScalar      *aa,*aa_j;
1294 
1295   PetscFunctionBegin;
1296   bs   = A->rmap->bs;
1297   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1298 
1299   aa   = a->a;
1300   ai   = a->i;
1301   aj   = a->j;
1302   ambs = a->mbs;
1303   bs2  = a->bs2;
1304 
1305   ierr = VecSet(v,zero);CHKERRQ(ierr);
1306   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1307   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1308   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1309   for (i=0; i<ambs; i++) {
1310     j=ai[i];
1311     if (aj[j] == i) {             /* if this is a diagonal element */
1312       row  = i*bs;
1313       aa_j = aa + j*bs2;
1314       if (A->factor && bs==1){
1315         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1316       } else {
1317         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1318       }
1319     }
1320   }
1321 
1322   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1323   PetscFunctionReturn(0);
1324 }
1325 
1326 #undef __FUNCT__
1327 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1328 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1329 {
1330   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1331   PetscScalar    *l,x,*li,*ri;
1332   MatScalar      *aa,*v;
1333   PetscErrorCode ierr;
1334   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1335   PetscTruth     flg;
1336 
1337   PetscFunctionBegin;
1338   if (ll != rr){
1339     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1340     if (!flg)
1341       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1342   }
1343   if (!ll) PetscFunctionReturn(0);
1344   ai  = a->i;
1345   aj  = a->j;
1346   aa  = a->a;
1347   m   = A->rmap->N;
1348   bs  = A->rmap->bs;
1349   mbs = a->mbs;
1350   bs2 = a->bs2;
1351 
1352   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1353   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1354   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1355   for (i=0; i<mbs; i++) { /* for each block row */
1356     M  = ai[i+1] - ai[i];
1357     li = l + i*bs;
1358     v  = aa + bs2*ai[i];
1359     for (j=0; j<M; j++) { /* for each block */
1360       ri = l + bs*aj[ai[i]+j];
1361       for (k=0; k<bs; k++) {
1362         x = ri[k];
1363         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1364       }
1365     }
1366   }
1367   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1368   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
1369   PetscFunctionReturn(0);
1370 }
1371 
1372 #undef __FUNCT__
1373 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1374 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1375 {
1376   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1377 
1378   PetscFunctionBegin;
1379   info->block_size     = a->bs2;
1380   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1381   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1382   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1383   info->assemblies   = A->num_ass;
1384   info->mallocs      = a->reallocs;
1385   info->memory       = ((PetscObject)A)->mem;
1386   if (A->factor) {
1387     info->fill_ratio_given  = A->info.fill_ratio_given;
1388     info->fill_ratio_needed = A->info.fill_ratio_needed;
1389     info->factor_mallocs    = A->info.factor_mallocs;
1390   } else {
1391     info->fill_ratio_given  = 0;
1392     info->fill_ratio_needed = 0;
1393     info->factor_mallocs    = 0;
1394   }
1395   PetscFunctionReturn(0);
1396 }
1397 
1398 
1399 #undef __FUNCT__
1400 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1401 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1402 {
1403   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1404   PetscErrorCode ierr;
1405 
1406   PetscFunctionBegin;
1407   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1408   PetscFunctionReturn(0);
1409 }
1410 
1411 #undef __FUNCT__
1412 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1413 /*
1414    This code does not work since it only checks the upper triangular part of
1415   the matrix. Hence it is not listed in the function table.
1416 */
1417 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1418 {
1419   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1420   PetscErrorCode ierr;
1421   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1422   PetscReal      atmp;
1423   MatScalar      *aa;
1424   PetscScalar    *x;
1425   PetscInt       ncols,brow,bcol,krow,kcol;
1426 
1427   PetscFunctionBegin;
1428   if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1429   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1430   bs   = A->rmap->bs;
1431   aa   = a->a;
1432   ai   = a->i;
1433   aj   = a->j;
1434   mbs = a->mbs;
1435 
1436   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1437   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1438   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1439   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1440   for (i=0; i<mbs; i++) {
1441     ncols = ai[1] - ai[0]; ai++;
1442     brow  = bs*i;
1443     for (j=0; j<ncols; j++){
1444       bcol = bs*(*aj);
1445       for (kcol=0; kcol<bs; kcol++){
1446         col = bcol + kcol;      /* col index */
1447         for (krow=0; krow<bs; krow++){
1448           atmp = PetscAbsScalar(*aa); aa++;
1449           row = brow + krow;    /* row index */
1450           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1451           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1452         }
1453       }
1454       aj++;
1455     }
1456   }
1457   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1458   PetscFunctionReturn(0);
1459 }
1460