xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision ce0a2cd1da0658c2b28aad1be2e2c8e41567bece)
1 #define PETSCMAT_DLL
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/inline/spops.h"
5 #include "src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table,table0;
17 
18   PetscFunctionBegin;
19   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = A->rmap.bs;
24   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
25   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc((A->rmap.N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n ; ++j){
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if(!PetscBTLookupSet(table,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++){ /* for each overlap */
52       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++){
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end ; l++){
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
63           }
64           k++;
65           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
67           for (l = start; l<end ; l++){
68             bcol = aj[l];
69             if (bcol > bcol_max) break;
70             if (PetscBTLookup(table0,bcol)){
71               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
72               break; /* for l = start; l<end ; l++) */
73             }
74           }
75         }
76       }
77     } /* for each overlap */
78 
79     /* expand the Index Set */
80     for (j=0; j<isz; j++) {
81       for (k=0; k<bs; k++)
82         nidx2[j*bs+k] = nidx[j]*bs+k;
83     }
84     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
85   }
86   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
87   ierr = PetscFree(nidx);CHKERRQ(ierr);
88   ierr = PetscFree(nidx2);CHKERRQ(ierr);
89   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 #undef __FUNCT__
94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
96 {
97   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
98   PetscErrorCode ierr;
99   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
100   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
101   PetscInt       *irow,nrows,*ssmap,bs=A->rmap.bs,bs2=a->bs2;
102   PetscInt       *aj = a->j,*ai = a->i;
103   MatScalar      *mat_a;
104   Mat            C;
105   PetscTruth     flag;
106 
107   PetscFunctionBegin;
108   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
109   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
110   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
111 
112   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
113   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
114 
115   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
116   ssmap = smap;
117   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
118   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
119   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
120   /* determine lens of each row */
121   for (i=0; i<nrows; i++) {
122     kstart  = ai[irow[i]];
123     kend    = kstart + a->ilen[irow[i]];
124     lens[i] = 0;
125       for (k=kstart; k<kend; k++) {
126         if (ssmap[aj[k]]) {
127           lens[i]++;
128         }
129       }
130     }
131   /* Create and fill new matrix */
132   if (scall == MAT_REUSE_MATRIX) {
133     c = (Mat_SeqSBAIJ *)((*B)->data);
134 
135     if (c->mbs!=nrows || (*B)->rmap.bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137     if (!flag) {
138       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
139     }
140     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
141     C = *B;
142   } else {
143     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
144     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
145     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
146     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
147   }
148   c = (Mat_SeqSBAIJ *)(C->data);
149   for (i=0; i<nrows; i++) {
150     row    = irow[i];
151     kstart = ai[row];
152     kend   = kstart + a->ilen[row];
153     mat_i  = c->i[i];
154     mat_j  = c->j + mat_i;
155     mat_a  = c->a + mat_i*bs2;
156     mat_ilen = c->ilen + i;
157     for (k=kstart; k<kend; k++) {
158       if ((tcol=ssmap[a->j[k]])) {
159         *mat_j++ = tcol - 1;
160         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
161         mat_a   += bs2;
162         (*mat_ilen)++;
163       }
164     }
165   }
166 
167   /* Free work space */
168   ierr = PetscFree(smap);CHKERRQ(ierr);
169   ierr = PetscFree(lens);CHKERRQ(ierr);
170   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
171   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172 
173   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
174   *B = C;
175   PetscFunctionReturn(0);
176 }
177 
178 #undef __FUNCT__
179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
181 {
182   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
183   IS             is1;
184   PetscErrorCode ierr;
185   PetscInt       *vary,*iary,*irow,nrows,i,bs=A->rmap.bs,count;
186 
187   PetscFunctionBegin;
188   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
189 
190   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
191   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
192 
193   /* Verify if the indices corespond to each element in a block
194    and form the IS with compressed IS */
195   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr);
196   iary = vary + a->mbs;
197   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
198   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
199 
200   count = 0;
201   for (i=0; i<a->mbs; i++) {
202     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
203     if (vary[i]==bs) iary[count++] = i;
204   }
205   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
206 
207   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
208   ierr = PetscFree(vary);CHKERRQ(ierr);
209 
210   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
211   ISDestroy(is1);
212   PetscFunctionReturn(0);
213 }
214 
215 #undef __FUNCT__
216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
218 {
219   PetscErrorCode ierr;
220   PetscInt       i;
221 
222   PetscFunctionBegin;
223   if (scall == MAT_INITIAL_MATRIX) {
224     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
225   }
226 
227   for (i=0; i<n; i++) {
228     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
229   }
230   PetscFunctionReturn(0);
231 }
232 
233 /* -------------------------------------------------------*/
234 /* Should check that shapes of vectors and matrices match */
235 /* -------------------------------------------------------*/
236 #include "petscblaslapack.h"
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
240 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
241 {
242   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
243   PetscScalar    *x,*z,*xb,x1,zero=0.0;
244   MatScalar      *v;
245   PetscErrorCode ierr;
246   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
247   PetscInt       nonzerorow=0;
248 
249   PetscFunctionBegin;
250   ierr = VecSet(zz,zero);CHKERRQ(ierr);
251   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
252   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
253 
254   v  = a->a;
255   xb = x;
256 
257   for (i=0; i<mbs; i++) {
258     n    = ai[1] - ai[0];  /* length of i_th row of A */
259     x1   = *xb++;
260     ib   = aj + *ai++;
261     jmin = 0;
262     nonzerorow += (n>0);
263     /* if we ALWAYS required a diagonal entry then could remove this if test */
264     /* should we use a tmp to hold the accumulated z[i] */
265     if (*ib == i) {      /* (diag of A)*x */
266       z[i] += *v++ * x[*ib++];
267       jmin++;
268     }
269     for (j=jmin; j<n; j++) {
270       cval    = *ib;
271       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
272       z[i]    += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
273     }
274   }
275 
276   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
277   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
278   ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);  /* nz = (nz+m)/2 */
279   PetscFunctionReturn(0);
280 }
281 
282 #undef __FUNCT__
283 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
284 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
285 {
286   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
287   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
288   MatScalar      *v;
289   PetscErrorCode ierr;
290   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
291   PetscInt       nonzerorow=0;
292 
293   PetscFunctionBegin;
294   ierr = VecSet(zz,zero);CHKERRQ(ierr);
295   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
296   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
297 
298   v     = a->a;
299   xb = x;
300 
301   for (i=0; i<mbs; i++) {
302     n  = ai[1] - ai[0]; /* length of i_th block row of A */
303     x1 = xb[0]; x2 = xb[1];
304     ib = aj + *ai;
305     jmin = 0;
306     nonzerorow += (n>0);
307     if (*ib == i){     /* (diag of A)*x */
308       z[2*i]   += v[0]*x1 + v[2]*x2;
309       z[2*i+1] += v[2]*x1 + v[3]*x2;
310       v += 4; jmin++;
311     }
312     for (j=jmin; j<n; j++) {
313       /* (strict lower triangular part of A)*x  */
314       cval       = ib[j]*2;
315       z[cval]     += v[0]*x1 + v[1]*x2;
316       z[cval+1]   += v[2]*x1 + v[3]*x2;
317       /* (strict upper triangular part of A)*x  */
318       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
319       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
320       v  += 4;
321     }
322     xb +=2; ai++;
323   }
324 
325   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
326   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
327   ierr = PetscLogFlops(8*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
328   PetscFunctionReturn(0);
329 }
330 
331 #undef __FUNCT__
332 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
333 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
334 {
335   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
336   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
337   MatScalar      *v;
338   PetscErrorCode ierr;
339   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
340   PetscInt       nonzerorow=0;
341 
342   PetscFunctionBegin;
343   ierr = VecSet(zz,zero);CHKERRQ(ierr);
344   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
345   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
346 
347   v    = a->a;
348   xb   = x;
349 
350   for (i=0; i<mbs; i++) {
351     n  = ai[1] - ai[0]; /* length of i_th block row of A */
352     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
353     ib = aj + *ai;
354     jmin = 0;
355     nonzerorow += (n>0);
356     if (*ib == i){     /* (diag of A)*x */
357       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
358       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
359       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
360       v += 9; jmin++;
361     }
362     for (j=jmin; j<n; j++) {
363       /* (strict lower triangular part of A)*x  */
364       cval       = ib[j]*3;
365       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
366       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
367       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
368       /* (strict upper triangular part of A)*x  */
369       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
370       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
371       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
372       v  += 9;
373     }
374     xb +=3; ai++;
375   }
376 
377   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
378   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
379   ierr = PetscLogFlops(18*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
380   PetscFunctionReturn(0);
381 }
382 
383 #undef __FUNCT__
384 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
385 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
386 {
387   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
388   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
389   MatScalar      *v;
390   PetscErrorCode ierr;
391   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
392   PetscInt       nonzerorow=0;
393 
394   PetscFunctionBegin;
395   ierr = VecSet(zz,zero);CHKERRQ(ierr);
396   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
397   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
398 
399   v     = a->a;
400   xb = x;
401 
402   for (i=0; i<mbs; i++) {
403     n  = ai[1] - ai[0]; /* length of i_th block row of A */
404     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
405     ib = aj + *ai;
406     jmin = 0;
407     nonzerorow += (n>0);
408     if (*ib == i){     /* (diag of A)*x */
409       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
410       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
411       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
412       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
413       v += 16; jmin++;
414     }
415     for (j=jmin; j<n; j++) {
416       /* (strict lower triangular part of A)*x  */
417       cval       = ib[j]*4;
418       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
419       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
420       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
421       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
422       /* (strict upper triangular part of A)*x  */
423       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
424       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
425       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
426       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
427       v  += 16;
428     }
429     xb +=4; ai++;
430   }
431 
432   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
433   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
434   ierr = PetscLogFlops(32*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
435   PetscFunctionReturn(0);
436 }
437 
438 #undef __FUNCT__
439 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
440 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
441 {
442   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
443   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
444   MatScalar      *v;
445   PetscErrorCode ierr;
446   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
447   PetscInt       nonzerorow=0;
448 
449   PetscFunctionBegin;
450   ierr = VecSet(zz,zero);CHKERRQ(ierr);
451   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
452   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
453 
454   v     = a->a;
455   xb = x;
456 
457   for (i=0; i<mbs; i++) {
458     n  = ai[1] - ai[0]; /* length of i_th block row of A */
459     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
460     ib = aj + *ai;
461     jmin = 0;
462     nonzerorow += (n>0);
463     if (*ib == i){      /* (diag of A)*x */
464       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
465       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
466       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
467       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
468       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
469       v += 25; jmin++;
470     }
471     for (j=jmin; j<n; j++) {
472       /* (strict lower triangular part of A)*x  */
473       cval       = ib[j]*5;
474       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
475       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
476       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
477       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
478       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
479       /* (strict upper triangular part of A)*x  */
480       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
481       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
482       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
483       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
484       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
485       v  += 25;
486     }
487     xb +=5; ai++;
488   }
489 
490   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
491   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
492   ierr = PetscLogFlops(50*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
493   PetscFunctionReturn(0);
494 }
495 
496 
497 #undef __FUNCT__
498 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
499 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
500 {
501   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
502   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
503   MatScalar      *v;
504   PetscErrorCode ierr;
505   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
506   PetscInt       nonzerorow=0;
507 
508   PetscFunctionBegin;
509   ierr = VecSet(zz,zero);CHKERRQ(ierr);
510   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
511   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
512 
513   v     = a->a;
514   xb = x;
515 
516   for (i=0; i<mbs; i++) {
517     n  = ai[1] - ai[0]; /* length of i_th block row of A */
518     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
519     ib = aj + *ai;
520     jmin = 0;
521     nonzerorow += (n>0);
522     if (*ib == i){      /* (diag of A)*x */
523       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
524       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
525       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
526       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
527       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
528       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
529       v += 36; jmin++;
530     }
531     for (j=jmin; j<n; j++) {
532       /* (strict lower triangular part of A)*x  */
533       cval       = ib[j]*6;
534       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
535       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
536       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
537       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
538       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
539       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
540       /* (strict upper triangular part of A)*x  */
541       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
542       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
543       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
544       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
545       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
546       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
547       v  += 36;
548     }
549     xb +=6; ai++;
550   }
551 
552   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
553   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
554   ierr = PetscLogFlops(72*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
555   PetscFunctionReturn(0);
556 }
557 #undef __FUNCT__
558 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
559 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
560 {
561   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
562   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
563   MatScalar      *v;
564   PetscErrorCode ierr;
565   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
566   PetscInt       nonzerorow=0;
567 
568   PetscFunctionBegin;
569   ierr = VecSet(zz,zero);CHKERRQ(ierr);
570   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
571   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
572 
573   v     = a->a;
574   xb = x;
575 
576   for (i=0; i<mbs; i++) {
577     n  = ai[1] - ai[0]; /* length of i_th block row of A */
578     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
579     ib = aj + *ai;
580     jmin = 0;
581     nonzerorow += (n>0);
582     if (*ib == i){      /* (diag of A)*x */
583       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
584       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
585       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
586       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
587       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
588       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
589       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
590       v += 49; jmin++;
591     }
592     for (j=jmin; j<n; j++) {
593       /* (strict lower triangular part of A)*x  */
594       cval       = ib[j]*7;
595       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
596       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
597       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
598       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
599       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
600       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
601       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
602       /* (strict upper triangular part of A)*x  */
603       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
604       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
605       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
606       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
607       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
608       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
609       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
610       v  += 49;
611     }
612     xb +=7; ai++;
613   }
614   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
615   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
616   ierr = PetscLogFlops(98*(a->nz*2 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
617   PetscFunctionReturn(0);
618 }
619 
620 /*
621     This will not work with MatScalar == float because it calls the BLAS
622 */
623 #undef __FUNCT__
624 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
625 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
626 {
627   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
628   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
629   MatScalar      *v;
630   PetscErrorCode ierr;
631   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2,ncols,k;
632   PetscInt       nonzerorow=0;
633 
634   PetscFunctionBegin;
635   ierr = VecSet(zz,zero);CHKERRQ(ierr);
636   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
637   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
638 
639   aj   = a->j;
640   v    = a->a;
641   ii   = a->i;
642 
643   if (!a->mult_work) {
644     ierr = PetscMalloc((A->rmap.N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
645   }
646   work = a->mult_work;
647 
648   for (i=0; i<mbs; i++) {
649     n     = ii[1] - ii[0]; ncols = n*bs;
650     workt = work; idx=aj+ii[0];
651     nonzerorow += (n>0);
652 
653     /* upper triangular part */
654     for (j=0; j<n; j++) {
655       xb = x_ptr + bs*(*idx++);
656       for (k=0; k<bs; k++) workt[k] = xb[k];
657       workt += bs;
658     }
659     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
660     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
661 
662     /* strict lower triangular part */
663     idx = aj+ii[0];
664     if (*idx == i){
665       ncols -= bs; v += bs2; idx++; n--;
666     }
667 
668     if (ncols > 0){
669       workt = work;
670       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
671       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
672       for (j=0; j<n; j++) {
673         zb = z_ptr + bs*(*idx++);
674         for (k=0; k<bs; k++) zb[k] += workt[k] ;
675         workt += bs;
676       }
677     }
678     x += bs; v += n*bs2; z += bs; ii++;
679   }
680 
681   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
682   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
683   ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
684   PetscFunctionReturn(0);
685 }
686 
687 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
688 #undef __FUNCT__
689 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
690 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
691 {
692   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
693   PetscScalar    *x,*z,*xb,x1;
694   MatScalar      *v;
695   PetscErrorCode ierr;
696   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
697   PetscInt       nonzerorow=0;
698 
699   PetscFunctionBegin;
700   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
701   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
702   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
703   v  = a->a;
704   xb = x;
705 
706   for (i=0; i<mbs; i++) {
707     n  = ai[1] - ai[0];  /* length of i_th row of A */
708     x1 = xb[0];
709     ib = aj + *ai;
710     jmin = 0;
711     nonzerorow += (n>0);
712     if (*ib == i) {            /* (diag of A)*x */
713       z[i] += *v++ * x[*ib++]; jmin++;
714     }
715     for (j=jmin; j<n; j++) {
716       cval    = *ib;
717       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
718       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
719     }
720     xb++; ai++;
721   }
722 
723   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
724   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
725 
726   ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
727   PetscFunctionReturn(0);
728 }
729 
730 #undef __FUNCT__
731 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
732 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
733 {
734   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
735   PetscScalar    *x,*z,*xb,x1,x2;
736   MatScalar      *v;
737   PetscErrorCode ierr;
738   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
739   PetscInt       nonzerorow=0;
740 
741   PetscFunctionBegin;
742   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
743   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
744   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
745 
746   v  = a->a;
747   xb = x;
748 
749   for (i=0; i<mbs; i++) {
750     n  = ai[1] - ai[0]; /* length of i_th block row of A */
751     x1 = xb[0]; x2 = xb[1];
752     ib = aj + *ai;
753     jmin = 0;
754     nonzerorow += (n>0);
755     if (*ib == i){      /* (diag of A)*x */
756       z[2*i]   += v[0]*x1 + v[2]*x2;
757       z[2*i+1] += v[2]*x1 + v[3]*x2;
758       v += 4; jmin++;
759     }
760     for (j=jmin; j<n; j++) {
761       /* (strict lower triangular part of A)*x  */
762       cval       = ib[j]*2;
763       z[cval]     += v[0]*x1 + v[1]*x2;
764       z[cval+1]   += v[2]*x1 + v[3]*x2;
765       /* (strict upper triangular part of A)*x  */
766       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
767       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
768       v  += 4;
769     }
770     xb +=2; ai++;
771   }
772   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
773   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
774 
775   ierr = PetscLogFlops(4*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
776   PetscFunctionReturn(0);
777 }
778 
779 #undef __FUNCT__
780 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
781 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
782 {
783   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
784   PetscScalar    *x,*z,*xb,x1,x2,x3;
785   MatScalar      *v;
786   PetscErrorCode ierr;
787   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
788   PetscInt       nonzerorow=0;
789 
790   PetscFunctionBegin;
791   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
792   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
793   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
794 
795   v     = a->a;
796   xb = x;
797 
798   for (i=0; i<mbs; i++) {
799     n  = ai[1] - ai[0]; /* length of i_th block row of A */
800     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
801     ib = aj + *ai;
802     jmin = 0;
803     nonzerorow += (n>0);
804     if (*ib == i){     /* (diag of A)*x */
805      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
806      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
807      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
808      v += 9; jmin++;
809     }
810     for (j=jmin; j<n; j++) {
811       /* (strict lower triangular part of A)*x  */
812       cval       = ib[j]*3;
813       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
814       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
815       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
816       /* (strict upper triangular part of A)*x  */
817       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
818       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
819       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
820       v  += 9;
821     }
822     xb +=3; ai++;
823   }
824 
825   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
826   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
827 
828   ierr = PetscLogFlops(18*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
829   PetscFunctionReturn(0);
830 }
831 
832 #undef __FUNCT__
833 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
834 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
835 {
836   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
837   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
838   MatScalar      *v;
839   PetscErrorCode ierr;
840   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
841   PetscInt       nonzerorow=0;
842 
843   PetscFunctionBegin;
844   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
845   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
846   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
847 
848   v     = a->a;
849   xb = x;
850 
851   for (i=0; i<mbs; i++) {
852     n  = ai[1] - ai[0]; /* length of i_th block row of A */
853     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
854     ib = aj + *ai;
855     jmin = 0;
856     nonzerorow += (n>0);
857     if (*ib == i){      /* (diag of A)*x */
858       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
859       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
860       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
861       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
862       v += 16; jmin++;
863     }
864     for (j=jmin; j<n; j++) {
865       /* (strict lower triangular part of A)*x  */
866       cval       = ib[j]*4;
867       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
868       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
869       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
870       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
871       /* (strict upper triangular part of A)*x  */
872       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
873       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
874       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
875       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
876       v  += 16;
877     }
878     xb +=4; ai++;
879   }
880 
881   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
882   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
883 
884   ierr = PetscLogFlops(32*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
885   PetscFunctionReturn(0);
886 }
887 
888 #undef __FUNCT__
889 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
890 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
891 {
892   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
893   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
894   MatScalar      *v;
895   PetscErrorCode ierr;
896   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
897   PetscInt       nonzerorow=0;
898 
899   PetscFunctionBegin;
900   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
901   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
902   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
903 
904   v     = a->a;
905   xb = x;
906 
907   for (i=0; i<mbs; i++) {
908     n  = ai[1] - ai[0]; /* length of i_th block row of A */
909     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
910     ib = aj + *ai;
911     jmin = 0;
912     nonzerorow += (n>0);
913     if (*ib == i){      /* (diag of A)*x */
914       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
915       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
916       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
917       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
918       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
919       v += 25; jmin++;
920     }
921     for (j=jmin; j<n; j++) {
922       /* (strict lower triangular part of A)*x  */
923       cval       = ib[j]*5;
924       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
925       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
926       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
927       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
928       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
929       /* (strict upper triangular part of A)*x  */
930       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
931       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
932       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
933       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
934       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
935       v  += 25;
936     }
937     xb +=5; ai++;
938   }
939 
940   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
941   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
942 
943   ierr = PetscLogFlops(50*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
944   PetscFunctionReturn(0);
945 }
946 #undef __FUNCT__
947 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
948 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
949 {
950   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
951   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
952   MatScalar      *v;
953   PetscErrorCode ierr;
954   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
955   PetscInt       nonzerorow=0;
956 
957   PetscFunctionBegin;
958   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
959   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
960   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
961 
962   v     = a->a;
963   xb = x;
964 
965   for (i=0; i<mbs; i++) {
966     n  = ai[1] - ai[0]; /* length of i_th block row of A */
967     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
968     ib = aj + *ai;
969     jmin = 0;
970     nonzerorow += (n>0);
971     if (*ib == i){     /* (diag of A)*x */
972       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
973       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
974       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
975       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
976       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
977       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
978       v += 36; jmin++;
979     }
980     for (j=jmin; j<n; j++) {
981       /* (strict lower triangular part of A)*x  */
982       cval       = ib[j]*6;
983       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
984       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
985       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
986       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
987       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
988       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
989       /* (strict upper triangular part of A)*x  */
990       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
991       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
992       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
993       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
994       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
995       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
996       v  += 36;
997     }
998     xb +=6; ai++;
999   }
1000 
1001   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1002   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1003 
1004   ierr = PetscLogFlops(72*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
1005   PetscFunctionReturn(0);
1006 }
1007 
1008 #undef __FUNCT__
1009 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1010 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1011 {
1012   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1013   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1014   MatScalar      *v;
1015   PetscErrorCode ierr;
1016   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
1017   PetscInt       nonzerorow=0;
1018 
1019   PetscFunctionBegin;
1020   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1021   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1023 
1024   v     = a->a;
1025   xb = x;
1026 
1027   for (i=0; i<mbs; i++) {
1028     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1029     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1030     ib = aj + *ai;
1031     jmin = 0;
1032     nonzerorow += (n>0);
1033     if (*ib == i){     /* (diag of A)*x */
1034       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1035       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1036       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1037       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1038       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1039       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1040       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1041       v += 49; jmin++;
1042     }
1043     for (j=jmin; j<n; j++) {
1044       /* (strict lower triangular part of A)*x  */
1045       cval       = ib[j]*7;
1046       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1047       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1048       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1049       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1050       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1051       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1052       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1053       /* (strict upper triangular part of A)*x  */
1054       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1055       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1056       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1057       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1058       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1059       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1060       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1061       v  += 49;
1062     }
1063     xb +=7; ai++;
1064   }
1065 
1066   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1067   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1068 
1069   ierr = PetscLogFlops(98*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
1070   PetscFunctionReturn(0);
1071 }
1072 
1073 #undef __FUNCT__
1074 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1075 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1076 {
1077   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1078   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1079   MatScalar      *v;
1080   PetscErrorCode ierr;
1081   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2,ncols,k;
1082   PetscInt       nonzerorow=0;
1083 
1084   PetscFunctionBegin;
1085   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1086   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1087   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1088 
1089   aj   = a->j;
1090   v    = a->a;
1091   ii   = a->i;
1092 
1093   if (!a->mult_work) {
1094     ierr = PetscMalloc((A->rmap.n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1095   }
1096   work = a->mult_work;
1097 
1098 
1099   for (i=0; i<mbs; i++) {
1100     n     = ii[1] - ii[0]; ncols = n*bs;
1101     workt = work; idx=aj+ii[0];
1102     nonzerorow += (n>0);
1103 
1104     /* upper triangular part */
1105     for (j=0; j<n; j++) {
1106       xb = x_ptr + bs*(*idx++);
1107       for (k=0; k<bs; k++) workt[k] = xb[k];
1108       workt += bs;
1109     }
1110     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1111     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1112 
1113     /* strict lower triangular part */
1114     idx = aj+ii[0];
1115     if (*idx == i){
1116       ncols -= bs; v += bs2; idx++; n--;
1117     }
1118     if (ncols > 0){
1119       workt = work;
1120       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1121       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1122       for (j=0; j<n; j++) {
1123         zb = z_ptr + bs*(*idx++);
1124         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1125         workt += bs;
1126       }
1127     }
1128 
1129     x += bs; v += n*bs2; z += bs; ii++;
1130   }
1131 
1132   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1133   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1134 
1135   ierr = PetscLogFlops(2*(a->nz*2 - nonzerorow));CHKERRQ(ierr);
1136   PetscFunctionReturn(0);
1137 }
1138 
1139 #undef __FUNCT__
1140 #define __FUNCT__ "MatScale_SeqSBAIJ"
1141 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1142 {
1143   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1144   PetscScalar    oalpha = alpha;
1145   PetscErrorCode ierr;
1146   PetscBLASInt   one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);
1147 
1148   PetscFunctionBegin;
1149   BLASscal_(&totalnz,&oalpha,a->a,&one);
1150   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1151   PetscFunctionReturn(0);
1152 }
1153 
1154 #undef __FUNCT__
1155 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1156 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1157 {
1158   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1159   MatScalar      *v = a->a;
1160   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1161   PetscInt       i,j,k,bs = A->rmap.bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1162   PetscErrorCode ierr;
1163   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1164 
1165   PetscFunctionBegin;
1166   if (type == NORM_FROBENIUS) {
1167     for (k=0; k<mbs; k++){
1168       jmin = a->i[k]; jmax = a->i[k+1];
1169       col  = aj + jmin;
1170       if (*col == k){         /* diagonal block */
1171         for (i=0; i<bs2; i++){
1172 #if defined(PETSC_USE_COMPLEX)
1173           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1174 #else
1175           sum_diag += (*v)*(*v); v++;
1176 #endif
1177         }
1178         jmin++;
1179       }
1180       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1181         for (i=0; i<bs2; i++){
1182 #if defined(PETSC_USE_COMPLEX)
1183           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1184 #else
1185           sum_off += (*v)*(*v); v++;
1186 #endif
1187         }
1188       }
1189     }
1190     *norm = sqrt(sum_diag + 2*sum_off);
1191   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1192     ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr);
1193     jl   = il + mbs;
1194     sum  = (PetscReal*)(jl + mbs);
1195     for (i=0; i<mbs; i++) jl[i] = mbs;
1196     il[0] = 0;
1197 
1198     *norm = 0.0;
1199     for (k=0; k<mbs; k++) { /* k_th block row */
1200       for (j=0; j<bs; j++) sum[j]=0.0;
1201       /*-- col sum --*/
1202       i = jl[k]; /* first |A(i,k)| to be added */
1203       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1204                   at step k */
1205       while (i<mbs){
1206         nexti = jl[i];  /* next block row to be added */
1207         ik    = il[i];  /* block index of A(i,k) in the array a */
1208         for (j=0; j<bs; j++){
1209           v = a->a + ik*bs2 + j*bs;
1210           for (k1=0; k1<bs; k1++) {
1211             sum[j] += PetscAbsScalar(*v); v++;
1212           }
1213         }
1214         /* update il, jl */
1215         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1216         jmax = a->i[i+1];
1217         if (jmin < jmax){
1218           il[i] = jmin;
1219           j   = a->j[jmin];
1220           jl[i] = jl[j]; jl[j]=i;
1221         }
1222         i = nexti;
1223       }
1224       /*-- row sum --*/
1225       jmin = a->i[k]; jmax = a->i[k+1];
1226       for (i=jmin; i<jmax; i++) {
1227         for (j=0; j<bs; j++){
1228           v = a->a + i*bs2 + j;
1229           for (k1=0; k1<bs; k1++){
1230             sum[j] += PetscAbsScalar(*v); v += bs;
1231           }
1232         }
1233       }
1234       /* add k_th block row to il, jl */
1235       col = aj+jmin;
1236       if (*col == k) jmin++;
1237       if (jmin < jmax){
1238         il[k] = jmin;
1239         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1240       }
1241       for (j=0; j<bs; j++){
1242         if (sum[j] > *norm) *norm = sum[j];
1243       }
1244     }
1245     ierr = PetscFree(il);CHKERRQ(ierr);
1246   } else {
1247     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1248   }
1249   PetscFunctionReturn(0);
1250 }
1251 
1252 #undef __FUNCT__
1253 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1254 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1255 {
1256   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1257   PetscErrorCode ierr;
1258 
1259   PetscFunctionBegin;
1260 
1261   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1262   if ((A->rmap.N != B->rmap.N) || (A->cmap.n != B->cmap.n) || (A->rmap.bs != B->rmap.bs)|| (a->nz != b->nz)) {
1263     *flg = PETSC_FALSE;
1264     PetscFunctionReturn(0);
1265   }
1266 
1267   /* if the a->i are the same */
1268   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1269   if (!*flg) {
1270     PetscFunctionReturn(0);
1271   }
1272 
1273   /* if a->j are the same */
1274   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1275   if (!*flg) {
1276     PetscFunctionReturn(0);
1277   }
1278   /* if a->a are the same */
1279   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap.bs)*(A->rmap.bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1280   PetscFunctionReturn(0);
1281 }
1282 
1283 #undef __FUNCT__
1284 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1285 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1286 {
1287   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1288   PetscErrorCode ierr;
1289   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1290   PetscScalar    *x,zero = 0.0;
1291   MatScalar      *aa,*aa_j;
1292 
1293   PetscFunctionBegin;
1294   bs   = A->rmap.bs;
1295   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1296 
1297   aa   = a->a;
1298   ai   = a->i;
1299   aj   = a->j;
1300   ambs = a->mbs;
1301   bs2  = a->bs2;
1302 
1303   ierr = VecSet(v,zero);CHKERRQ(ierr);
1304   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1305   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1306   if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1307   for (i=0; i<ambs; i++) {
1308     j=ai[i];
1309     if (aj[j] == i) {             /* if this is a diagonal element */
1310       row  = i*bs;
1311       aa_j = aa + j*bs2;
1312       if (A->factor && bs==1){
1313         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1314       } else {
1315         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1316       }
1317     }
1318   }
1319 
1320   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1321   PetscFunctionReturn(0);
1322 }
1323 
1324 #undef __FUNCT__
1325 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1326 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1327 {
1328   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1329   PetscScalar    *l,x,*li,*ri;
1330   MatScalar      *aa,*v;
1331   PetscErrorCode ierr;
1332   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1333   PetscTruth     flg;
1334 
1335   PetscFunctionBegin;
1336   if (ll != rr){
1337     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1338     if (!flg)
1339       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1340   }
1341   if (!ll) PetscFunctionReturn(0);
1342   ai  = a->i;
1343   aj  = a->j;
1344   aa  = a->a;
1345   m   = A->rmap.N;
1346   bs  = A->rmap.bs;
1347   mbs = a->mbs;
1348   bs2 = a->bs2;
1349 
1350   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1351   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1352   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1353   for (i=0; i<mbs; i++) { /* for each block row */
1354     M  = ai[i+1] - ai[i];
1355     li = l + i*bs;
1356     v  = aa + bs2*ai[i];
1357     for (j=0; j<M; j++) { /* for each block */
1358       ri = l + bs*aj[ai[i]+j];
1359       for (k=0; k<bs; k++) {
1360         x = ri[k];
1361         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1362       }
1363     }
1364   }
1365   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1366   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
1367   PetscFunctionReturn(0);
1368 }
1369 
1370 #undef __FUNCT__
1371 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1372 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1373 {
1374   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1375 
1376   PetscFunctionBegin;
1377   info->rows_global    = (double)A->rmap.N;
1378   info->columns_global = (double)A->rmap.N;
1379   info->rows_local     = (double)A->rmap.N;
1380   info->columns_local  = (double)A->rmap.N;
1381   info->block_size     = a->bs2;
1382   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1383   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1384   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1385   info->assemblies   = A->num_ass;
1386   info->mallocs      = a->reallocs;
1387   info->memory       = ((PetscObject)A)->mem;
1388   if (A->factor) {
1389     info->fill_ratio_given  = A->info.fill_ratio_given;
1390     info->fill_ratio_needed = A->info.fill_ratio_needed;
1391     info->factor_mallocs    = A->info.factor_mallocs;
1392   } else {
1393     info->fill_ratio_given  = 0;
1394     info->fill_ratio_needed = 0;
1395     info->factor_mallocs    = 0;
1396   }
1397   PetscFunctionReturn(0);
1398 }
1399 
1400 
1401 #undef __FUNCT__
1402 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1403 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1404 {
1405   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1406   PetscErrorCode ierr;
1407 
1408   PetscFunctionBegin;
1409   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1410   PetscFunctionReturn(0);
1411 }
1412 
1413 #undef __FUNCT__
1414 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1415 /*
1416    This code does not work since it only checks the upper triangular part of
1417   the matrix. Hence it is not listed in the function table.
1418 */
1419 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1420 {
1421   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1422   PetscErrorCode ierr;
1423   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1424   PetscReal      atmp;
1425   MatScalar      *aa;
1426   PetscScalar    *x;
1427   PetscInt       ncols,brow,bcol,krow,kcol;
1428 
1429   PetscFunctionBegin;
1430   if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1431   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1432   bs   = A->rmap.bs;
1433   aa   = a->a;
1434   ai   = a->i;
1435   aj   = a->j;
1436   mbs = a->mbs;
1437 
1438   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1439   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1440   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1441   if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1442   for (i=0; i<mbs; i++) {
1443     ncols = ai[1] - ai[0]; ai++;
1444     brow  = bs*i;
1445     for (j=0; j<ncols; j++){
1446       bcol = bs*(*aj);
1447       for (kcol=0; kcol<bs; kcol++){
1448         col = bcol + kcol;      /* col index */
1449         for (krow=0; krow<bs; krow++){
1450           atmp = PetscAbsScalar(*aa); aa++;
1451           row = brow + krow;    /* row index */
1452           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1453           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1454         }
1455       }
1456       aj++;
1457     }
1458   }
1459   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1460   PetscFunctionReturn(0);
1461 }
1462