xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 69bb7ac94e38f481a514b28cb10ff7ece56113f9)
1 #define PETSCMAT_DLL
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/inline/spops.h"
5 #include "src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table,table0;
17 
18   PetscFunctionBegin;
19   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = A->rmap.bs;
24   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
25   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc((A->rmap.N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n ; ++j){
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if(!PetscBTLookupSet(table,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++){ /* for each overlap */
52       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++){
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end ; l++){
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
63           }
64           k++;
65           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
67           for (l = start; l<end ; l++){
68             bcol = aj[l];
69             if (bcol > bcol_max) break;
70             if (PetscBTLookup(table0,bcol)){
71               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
72               break; /* for l = start; l<end ; l++) */
73             }
74           }
75         }
76       }
77     } /* for each overlap */
78 
79     /* expand the Index Set */
80     for (j=0; j<isz; j++) {
81       for (k=0; k<bs; k++)
82         nidx2[j*bs+k] = nidx[j]*bs+k;
83     }
84     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
85   }
86   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
87   ierr = PetscFree(nidx);CHKERRQ(ierr);
88   ierr = PetscFree(nidx2);CHKERRQ(ierr);
89   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 #undef __FUNCT__
94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
96 {
97   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
98   PetscErrorCode ierr;
99   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
100   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
101   PetscInt       *irow,nrows,*ssmap,bs=A->rmap.bs,bs2=a->bs2;
102   PetscInt       *aj = a->j,*ai = a->i;
103   MatScalar      *mat_a;
104   Mat            C;
105   PetscTruth     flag;
106 
107   PetscFunctionBegin;
108   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
109   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
110   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
111 
112   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
113   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
114 
115   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
116   ssmap = smap;
117   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
118   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
119   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
120   /* determine lens of each row */
121   for (i=0; i<nrows; i++) {
122     kstart  = ai[irow[i]];
123     kend    = kstart + a->ilen[irow[i]];
124     lens[i] = 0;
125       for (k=kstart; k<kend; k++) {
126         if (ssmap[aj[k]]) {
127           lens[i]++;
128         }
129       }
130     }
131   /* Create and fill new matrix */
132   if (scall == MAT_REUSE_MATRIX) {
133     c = (Mat_SeqSBAIJ *)((*B)->data);
134 
135     if (c->mbs!=nrows || (*B)->rmap.bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137     if (!flag) {
138       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
139     }
140     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
141     C = *B;
142   } else {
143     ierr = MatCreate(A->comm,&C);CHKERRQ(ierr);
144     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
145     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
146     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
147   }
148   c = (Mat_SeqSBAIJ *)(C->data);
149   for (i=0; i<nrows; i++) {
150     row    = irow[i];
151     kstart = ai[row];
152     kend   = kstart + a->ilen[row];
153     mat_i  = c->i[i];
154     mat_j  = c->j + mat_i;
155     mat_a  = c->a + mat_i*bs2;
156     mat_ilen = c->ilen + i;
157     for (k=kstart; k<kend; k++) {
158       if ((tcol=ssmap[a->j[k]])) {
159         *mat_j++ = tcol - 1;
160         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
161         mat_a   += bs2;
162         (*mat_ilen)++;
163       }
164     }
165   }
166 
167   /* Free work space */
168   ierr = PetscFree(smap);CHKERRQ(ierr);
169   ierr = PetscFree(lens);CHKERRQ(ierr);
170   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
171   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172 
173   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
174   *B = C;
175   PetscFunctionReturn(0);
176 }
177 
178 #undef __FUNCT__
179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
181 {
182   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
183   IS             is1;
184   PetscErrorCode ierr;
185   PetscInt       *vary,*iary,*irow,nrows,i,bs=A->rmap.bs,count;
186 
187   PetscFunctionBegin;
188   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
189 
190   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
191   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
192 
193   /* Verify if the indices corespond to each element in a block
194    and form the IS with compressed IS */
195   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr);
196   iary = vary + a->mbs;
197   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
198   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
199 
200   count = 0;
201   for (i=0; i<a->mbs; i++) {
202     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
203     if (vary[i]==bs) iary[count++] = i;
204   }
205   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
206 
207   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
208   ierr = PetscFree(vary);CHKERRQ(ierr);
209 
210   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
211   ISDestroy(is1);
212   PetscFunctionReturn(0);
213 }
214 
215 #undef __FUNCT__
216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
218 {
219   PetscErrorCode ierr;
220   PetscInt       i;
221 
222   PetscFunctionBegin;
223   if (scall == MAT_INITIAL_MATRIX) {
224     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
225   }
226 
227   for (i=0; i<n; i++) {
228     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
229   }
230   PetscFunctionReturn(0);
231 }
232 
233 /* -------------------------------------------------------*/
234 /* Should check that shapes of vectors and matrices match */
235 /* -------------------------------------------------------*/
236 #include "petscblaslapack.h"
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
240 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
241 {
242   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
243   PetscScalar    *x,*z,*xb,x1,zero=0.0;
244   MatScalar      *v;
245   PetscErrorCode ierr;
246   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
247 
248   PetscFunctionBegin;
249   ierr = VecSet(zz,zero);CHKERRQ(ierr);
250   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
251   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
252 
253   v  = a->a;
254   xb = x;
255 
256   for (i=0; i<mbs; i++) {
257     n    = ai[1] - ai[0];  /* length of i_th row of A */
258     x1   = *xb++;
259     ib   = aj + *ai++;
260     jmin = 0;
261     /* if we ALWAYS required a diagonal entry then could remove this if test */
262     /* should we use a tmp to hold the accumulated z[i] */
263     if (*ib == i) {      /* (diag of A)*x */
264       z[i] += *v++ * x[*ib++];
265       jmin++;
266     }
267     for (j=jmin; j<n; j++) {
268       cval    = *ib;
269       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
270       z[i]    += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
271     }
272   }
273 
274   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
275   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
276   ierr = PetscLogFlops(2*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr);  /* nz = (nz+m)/2 */
277   PetscFunctionReturn(0);
278 }
279 
280 #undef __FUNCT__
281 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
282 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
283 {
284   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
285   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
286   MatScalar      *v;
287   PetscErrorCode ierr;
288   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
289 
290   PetscFunctionBegin;
291   ierr = VecSet(zz,zero);CHKERRQ(ierr);
292   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
293   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
294 
295   v     = a->a;
296   xb = x;
297 
298   for (i=0; i<mbs; i++) {
299     n  = ai[1] - ai[0]; /* length of i_th block row of A */
300     x1 = xb[0]; x2 = xb[1];
301     ib = aj + *ai;
302     jmin = 0;
303     if (*ib == i){     /* (diag of A)*x */
304       z[2*i]   += v[0]*x1 + v[2]*x2;
305       z[2*i+1] += v[2]*x1 + v[3]*x2;
306       v += 4; jmin++;
307     }
308     for (j=jmin; j<n; j++) {
309       /* (strict lower triangular part of A)*x  */
310       cval       = ib[j]*2;
311       z[cval]     += v[0]*x1 + v[1]*x2;
312       z[cval+1]   += v[2]*x1 + v[3]*x2;
313       /* (strict upper triangular part of A)*x  */
314       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
315       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
316       v  += 4;
317     }
318     xb +=2; ai++;
319   }
320 
321   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
322   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
323   ierr = PetscLogFlops(8*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 #undef __FUNCT__
328 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
329 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
330 {
331   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
332   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
333   MatScalar      *v;
334   PetscErrorCode ierr;
335   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
336 
337   PetscFunctionBegin;
338   ierr = VecSet(zz,zero);CHKERRQ(ierr);
339   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
340   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
341 
342   v    = a->a;
343   xb   = x;
344 
345   for (i=0; i<mbs; i++) {
346     n  = ai[1] - ai[0]; /* length of i_th block row of A */
347     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
348     ib = aj + *ai;
349     jmin = 0;
350     if (*ib == i){     /* (diag of A)*x */
351       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
352       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
353       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
354       v += 9; jmin++;
355     }
356     for (j=jmin; j<n; j++) {
357       /* (strict lower triangular part of A)*x  */
358       cval       = ib[j]*3;
359       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
360       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
361       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
362       /* (strict upper triangular part of A)*x  */
363       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
364       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
365       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
366       v  += 9;
367     }
368     xb +=3; ai++;
369   }
370 
371   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
372   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
373   ierr = PetscLogFlops(18*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr);
374   PetscFunctionReturn(0);
375 }
376 
377 #undef __FUNCT__
378 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
379 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
380 {
381   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
382   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
383   MatScalar      *v;
384   PetscErrorCode ierr;
385   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
386 
387   PetscFunctionBegin;
388   ierr = VecSet(zz,zero);CHKERRQ(ierr);
389   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
390   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
391 
392   v     = a->a;
393   xb = x;
394 
395   for (i=0; i<mbs; i++) {
396     n  = ai[1] - ai[0]; /* length of i_th block row of A */
397     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
398     ib = aj + *ai;
399     jmin = 0;
400     if (*ib == i){     /* (diag of A)*x */
401       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
402       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
403       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
404       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
405       v += 16; jmin++;
406     }
407     for (j=jmin; j<n; j++) {
408       /* (strict lower triangular part of A)*x  */
409       cval       = ib[j]*4;
410       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
411       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
412       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
413       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
414       /* (strict upper triangular part of A)*x  */
415       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
416       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
417       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
418       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
419       v  += 16;
420     }
421     xb +=4; ai++;
422   }
423 
424   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
425   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
426   ierr = PetscLogFlops(32*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr);
427   PetscFunctionReturn(0);
428 }
429 
430 #undef __FUNCT__
431 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
432 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
433 {
434   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
435   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
436   MatScalar      *v;
437   PetscErrorCode ierr;
438   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
439 
440   PetscFunctionBegin;
441   ierr = VecSet(zz,zero);CHKERRQ(ierr);
442   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
443   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
444 
445   v     = a->a;
446   xb = x;
447 
448   for (i=0; i<mbs; i++) {
449     n  = ai[1] - ai[0]; /* length of i_th block row of A */
450     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
451     ib = aj + *ai;
452     jmin = 0;
453     if (*ib == i){      /* (diag of A)*x */
454       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
455       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
456       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
457       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
458       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
459       v += 25; jmin++;
460     }
461     for (j=jmin; j<n; j++) {
462       /* (strict lower triangular part of A)*x  */
463       cval       = ib[j]*5;
464       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
465       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
466       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
467       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
468       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
469       /* (strict upper triangular part of A)*x  */
470       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
471       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
472       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
473       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
474       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
475       v  += 25;
476     }
477     xb +=5; ai++;
478   }
479 
480   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
481   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
482   ierr = PetscLogFlops(50*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr);
483   PetscFunctionReturn(0);
484 }
485 
486 
487 #undef __FUNCT__
488 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
489 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
490 {
491   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
492   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
493   MatScalar      *v;
494   PetscErrorCode ierr;
495   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
496 
497   PetscFunctionBegin;
498   ierr = VecSet(zz,zero);CHKERRQ(ierr);
499   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
500   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
501 
502   v     = a->a;
503   xb = x;
504 
505   for (i=0; i<mbs; i++) {
506     n  = ai[1] - ai[0]; /* length of i_th block row of A */
507     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
508     ib = aj + *ai;
509     jmin = 0;
510     if (*ib == i){      /* (diag of A)*x */
511       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
512       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
513       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
514       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
515       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
516       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
517       v += 36; jmin++;
518     }
519     for (j=jmin; j<n; j++) {
520       /* (strict lower triangular part of A)*x  */
521       cval       = ib[j]*6;
522       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
523       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
524       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
525       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
526       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
527       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
528       /* (strict upper triangular part of A)*x  */
529       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
530       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
531       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
532       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
533       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
534       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
535       v  += 36;
536     }
537     xb +=6; ai++;
538   }
539 
540   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
541   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
542   ierr = PetscLogFlops(72*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr);
543   PetscFunctionReturn(0);
544 }
545 #undef __FUNCT__
546 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
547 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
548 {
549   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
550   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
551   MatScalar      *v;
552   PetscErrorCode ierr;
553   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
554 
555   PetscFunctionBegin;
556   ierr = VecSet(zz,zero);CHKERRQ(ierr);
557   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
558   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
559 
560   v     = a->a;
561   xb = x;
562 
563   for (i=0; i<mbs; i++) {
564     n  = ai[1] - ai[0]; /* length of i_th block row of A */
565     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
566     ib = aj + *ai;
567     jmin = 0;
568     if (*ib == i){      /* (diag of A)*x */
569       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
570       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
571       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
572       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
573       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
574       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
575       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
576       v += 49; jmin++;
577     }
578     for (j=jmin; j<n; j++) {
579       /* (strict lower triangular part of A)*x  */
580       cval       = ib[j]*7;
581       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
582       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
583       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
584       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
585       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
586       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
587       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
588       /* (strict upper triangular part of A)*x  */
589       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
590       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
591       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
592       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
593       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
594       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
595       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
596       v  += 49;
597     }
598     xb +=7; ai++;
599   }
600   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
601   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
602   ierr = PetscLogFlops(98*(a->nz*2 - A->rmap.N) - A->rmap.N);CHKERRQ(ierr);
603   PetscFunctionReturn(0);
604 }
605 
606 /*
607     This will not work with MatScalar == float because it calls the BLAS
608 */
609 #undef __FUNCT__
610 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
611 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
612 {
613   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
614   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
615   MatScalar      *v;
616   PetscErrorCode ierr;
617   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2,ncols,k;
618 
619   PetscFunctionBegin;
620   ierr = VecSet(zz,zero);CHKERRQ(ierr);
621   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
622   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
623 
624   aj   = a->j;
625   v    = a->a;
626   ii   = a->i;
627 
628   if (!a->mult_work) {
629     ierr = PetscMalloc((A->rmap.N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
630   }
631   work = a->mult_work;
632 
633   for (i=0; i<mbs; i++) {
634     n     = ii[1] - ii[0]; ncols = n*bs;
635     workt = work; idx=aj+ii[0];
636 
637     /* upper triangular part */
638     for (j=0; j<n; j++) {
639       xb = x_ptr + bs*(*idx++);
640       for (k=0; k<bs; k++) workt[k] = xb[k];
641       workt += bs;
642     }
643     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
644     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
645 
646     /* strict lower triangular part */
647     idx = aj+ii[0];
648     if (*idx == i){
649       ncols -= bs; v += bs2; idx++; n--;
650     }
651 
652     if (ncols > 0){
653       workt = work;
654       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
655       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
656       for (j=0; j<n; j++) {
657         zb = z_ptr + bs*(*idx++);
658         for (k=0; k<bs; k++) zb[k] += workt[k] ;
659         workt += bs;
660       }
661     }
662     x += bs; v += n*bs2; z += bs; ii++;
663   }
664 
665   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
666   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
667   ierr = PetscLogFlops(2*(a->nz*2 - A->rmap.N)*bs2 - A->rmap.N);CHKERRQ(ierr);
668   PetscFunctionReturn(0);
669 }
670 
671 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
672 #undef __FUNCT__
673 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
674 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
675 {
676   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
677   PetscScalar    *x,*z,*xb,x1;
678   MatScalar      *v;
679   PetscErrorCode ierr;
680   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
681 
682   PetscFunctionBegin;
683   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
684   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
685   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
686   v  = a->a;
687   xb = x;
688 
689   for (i=0; i<mbs; i++) {
690     n  = ai[1] - ai[0];  /* length of i_th row of A */
691     x1 = xb[0];
692     ib = aj + *ai;
693     jmin = 0;
694     if (*ib == i) {            /* (diag of A)*x */
695       z[i] += *v++ * x[*ib++]; jmin++;
696     }
697     for (j=jmin; j<n; j++) {
698       cval    = *ib;
699       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
700       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
701     }
702     xb++; ai++;
703   }
704 
705   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
706   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
707 
708   ierr = PetscLogFlops(2*(a->nz*2 - A->rmap.n));CHKERRQ(ierr);
709   PetscFunctionReturn(0);
710 }
711 
712 #undef __FUNCT__
713 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
714 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
715 {
716   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
717   PetscScalar    *x,*z,*xb,x1,x2;
718   MatScalar      *v;
719   PetscErrorCode ierr;
720   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
721 
722   PetscFunctionBegin;
723   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
724   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
725   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
726 
727   v  = a->a;
728   xb = x;
729 
730   for (i=0; i<mbs; i++) {
731     n  = ai[1] - ai[0]; /* length of i_th block row of A */
732     x1 = xb[0]; x2 = xb[1];
733     ib = aj + *ai;
734     jmin = 0;
735     if (*ib == i){      /* (diag of A)*x */
736       z[2*i]   += v[0]*x1 + v[2]*x2;
737       z[2*i+1] += v[2]*x1 + v[3]*x2;
738       v += 4; jmin++;
739     }
740     for (j=jmin; j<n; j++) {
741       /* (strict lower triangular part of A)*x  */
742       cval       = ib[j]*2;
743       z[cval]     += v[0]*x1 + v[1]*x2;
744       z[cval+1]   += v[2]*x1 + v[3]*x2;
745       /* (strict upper triangular part of A)*x  */
746       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
747       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
748       v  += 4;
749     }
750     xb +=2; ai++;
751   }
752   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
753   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
754 
755   ierr = PetscLogFlops(4*(a->nz*2 - A->rmap.n));CHKERRQ(ierr);
756   PetscFunctionReturn(0);
757 }
758 
759 #undef __FUNCT__
760 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
761 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
762 {
763   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
764   PetscScalar    *x,*z,*xb,x1,x2,x3;
765   MatScalar      *v;
766   PetscErrorCode ierr;
767   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
768 
769   PetscFunctionBegin;
770   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
771   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
772   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
773 
774   v     = a->a;
775   xb = x;
776 
777   for (i=0; i<mbs; i++) {
778     n  = ai[1] - ai[0]; /* length of i_th block row of A */
779     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
780     ib = aj + *ai;
781     jmin = 0;
782     if (*ib == i){     /* (diag of A)*x */
783      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
784      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
785      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
786      v += 9; jmin++;
787     }
788     for (j=jmin; j<n; j++) {
789       /* (strict lower triangular part of A)*x  */
790       cval       = ib[j]*3;
791       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
792       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
793       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
794       /* (strict upper triangular part of A)*x  */
795       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
796       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
797       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
798       v  += 9;
799     }
800     xb +=3; ai++;
801   }
802 
803   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
804   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
805 
806   ierr = PetscLogFlops(18*(a->nz*2 - A->rmap.n));CHKERRQ(ierr);
807   PetscFunctionReturn(0);
808 }
809 
810 #undef __FUNCT__
811 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
812 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
813 {
814   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
815   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
816   MatScalar      *v;
817   PetscErrorCode ierr;
818   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
819 
820   PetscFunctionBegin;
821   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
822   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
823   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
824 
825   v     = a->a;
826   xb = x;
827 
828   for (i=0; i<mbs; i++) {
829     n  = ai[1] - ai[0]; /* length of i_th block row of A */
830     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
831     ib = aj + *ai;
832     jmin = 0;
833     if (*ib == i){      /* (diag of A)*x */
834       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
835       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
836       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
837       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
838       v += 16; jmin++;
839     }
840     for (j=jmin; j<n; j++) {
841       /* (strict lower triangular part of A)*x  */
842       cval       = ib[j]*4;
843       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
844       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
845       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
846       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
847       /* (strict upper triangular part of A)*x  */
848       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
849       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
850       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
851       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
852       v  += 16;
853     }
854     xb +=4; ai++;
855   }
856 
857   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
858   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
859 
860   ierr = PetscLogFlops(32*(a->nz*2 - A->rmap.n));CHKERRQ(ierr);
861   PetscFunctionReturn(0);
862 }
863 
864 #undef __FUNCT__
865 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
866 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
867 {
868   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
869   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
870   MatScalar      *v;
871   PetscErrorCode ierr;
872   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
873 
874   PetscFunctionBegin;
875   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
876   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
877   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
878 
879   v     = a->a;
880   xb = x;
881 
882   for (i=0; i<mbs; i++) {
883     n  = ai[1] - ai[0]; /* length of i_th block row of A */
884     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
885     ib = aj + *ai;
886     jmin = 0;
887     if (*ib == i){      /* (diag of A)*x */
888       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
889       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
890       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
891       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
892       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
893       v += 25; jmin++;
894     }
895     for (j=jmin; j<n; j++) {
896       /* (strict lower triangular part of A)*x  */
897       cval       = ib[j]*5;
898       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
899       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
900       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
901       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
902       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
903       /* (strict upper triangular part of A)*x  */
904       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
905       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
906       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
907       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
908       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
909       v  += 25;
910     }
911     xb +=5; ai++;
912   }
913 
914   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
915   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
916 
917   ierr = PetscLogFlops(50*(a->nz*2 - A->rmap.n));CHKERRQ(ierr);
918   PetscFunctionReturn(0);
919 }
920 #undef __FUNCT__
921 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
922 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
923 {
924   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
925   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
926   MatScalar      *v;
927   PetscErrorCode ierr;
928   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
929 
930   PetscFunctionBegin;
931   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
932   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
933   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
934 
935   v     = a->a;
936   xb = x;
937 
938   for (i=0; i<mbs; i++) {
939     n  = ai[1] - ai[0]; /* length of i_th block row of A */
940     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
941     ib = aj + *ai;
942     jmin = 0;
943     if (*ib == i){     /* (diag of A)*x */
944       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
945       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
946       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
947       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
948       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
949       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
950       v += 36; jmin++;
951     }
952     for (j=jmin; j<n; j++) {
953       /* (strict lower triangular part of A)*x  */
954       cval       = ib[j]*6;
955       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
956       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
957       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
958       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
959       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
960       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
961       /* (strict upper triangular part of A)*x  */
962       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
963       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
964       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
965       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
966       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
967       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
968       v  += 36;
969     }
970     xb +=6; ai++;
971   }
972 
973   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
974   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
975 
976   ierr = PetscLogFlops(72*(a->nz*2 - A->rmap.n));CHKERRQ(ierr);
977   PetscFunctionReturn(0);
978 }
979 
980 #undef __FUNCT__
981 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
982 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
983 {
984   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
985   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
986   MatScalar      *v;
987   PetscErrorCode ierr;
988   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
989 
990   PetscFunctionBegin;
991   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
992   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
993   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
994 
995   v     = a->a;
996   xb = x;
997 
998   for (i=0; i<mbs; i++) {
999     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1000     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1001     ib = aj + *ai;
1002     jmin = 0;
1003     if (*ib == i){     /* (diag of A)*x */
1004       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1005       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1006       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1007       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1008       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1009       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1010       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1011       v += 49; jmin++;
1012     }
1013     for (j=jmin; j<n; j++) {
1014       /* (strict lower triangular part of A)*x  */
1015       cval       = ib[j]*7;
1016       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1017       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1018       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1019       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1020       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1021       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1022       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1023       /* (strict upper triangular part of A)*x  */
1024       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1025       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1026       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1027       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1028       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1029       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1030       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1031       v  += 49;
1032     }
1033     xb +=7; ai++;
1034   }
1035 
1036   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1037   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1038 
1039   ierr = PetscLogFlops(98*(a->nz*2 - A->rmap.n));CHKERRQ(ierr);
1040   PetscFunctionReturn(0);
1041 }
1042 
1043 #undef __FUNCT__
1044 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1045 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1046 {
1047   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1048   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1049   MatScalar      *v;
1050   PetscErrorCode ierr;
1051   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2,ncols,k;
1052 
1053   PetscFunctionBegin;
1054   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1055   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1056   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1057 
1058   aj   = a->j;
1059   v    = a->a;
1060   ii   = a->i;
1061 
1062   if (!a->mult_work) {
1063     ierr = PetscMalloc((A->rmap.n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1064   }
1065   work = a->mult_work;
1066 
1067 
1068   for (i=0; i<mbs; i++) {
1069     n     = ii[1] - ii[0]; ncols = n*bs;
1070     workt = work; idx=aj+ii[0];
1071 
1072     /* upper triangular part */
1073     for (j=0; j<n; j++) {
1074       xb = x_ptr + bs*(*idx++);
1075       for (k=0; k<bs; k++) workt[k] = xb[k];
1076       workt += bs;
1077     }
1078     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1079     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1080 
1081     /* strict lower triangular part */
1082     idx = aj+ii[0];
1083     if (*idx == i){
1084       ncols -= bs; v += bs2; idx++; n--;
1085     }
1086     if (ncols > 0){
1087       workt = work;
1088       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1089       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1090       for (j=0; j<n; j++) {
1091         zb = z_ptr + bs*(*idx++);
1092         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1093         workt += bs;
1094       }
1095     }
1096 
1097     x += bs; v += n*bs2; z += bs; ii++;
1098   }
1099 
1100   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1101   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1102 
1103   ierr = PetscLogFlops(2*(a->nz*2 - A->rmap.n));CHKERRQ(ierr);
1104   PetscFunctionReturn(0);
1105 }
1106 
1107 #undef __FUNCT__
1108 #define __FUNCT__ "MatScale_SeqSBAIJ"
1109 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1110 {
1111   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1112   PetscScalar oalpha = alpha;
1113   PetscBLASInt one = 1,totalnz = (PetscBLASInt)a->bs2*a->nz;
1114   PetscErrorCode ierr;
1115 
1116   PetscFunctionBegin;
1117   BLASscal_(&totalnz,&oalpha,a->a,&one);
1118   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1119   PetscFunctionReturn(0);
1120 }
1121 
1122 #undef __FUNCT__
1123 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1124 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1125 {
1126   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1127   MatScalar      *v = a->a;
1128   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1129   PetscInt       i,j,k,bs = A->rmap.bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1130   PetscErrorCode ierr;
1131   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1132 
1133   PetscFunctionBegin;
1134   if (type == NORM_FROBENIUS) {
1135     for (k=0; k<mbs; k++){
1136       jmin = a->i[k]; jmax = a->i[k+1];
1137       col  = aj + jmin;
1138       if (*col == k){         /* diagonal block */
1139         for (i=0; i<bs2; i++){
1140 #if defined(PETSC_USE_COMPLEX)
1141           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1142 #else
1143           sum_diag += (*v)*(*v); v++;
1144 #endif
1145         }
1146         jmin++;
1147       }
1148       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1149         for (i=0; i<bs2; i++){
1150 #if defined(PETSC_USE_COMPLEX)
1151           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1152 #else
1153           sum_off += (*v)*(*v); v++;
1154 #endif
1155         }
1156       }
1157     }
1158     *norm = sqrt(sum_diag + 2*sum_off);
1159   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1160     ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr);
1161     jl   = il + mbs;
1162     sum  = (PetscReal*)(jl + mbs);
1163     for (i=0; i<mbs; i++) jl[i] = mbs;
1164     il[0] = 0;
1165 
1166     *norm = 0.0;
1167     for (k=0; k<mbs; k++) { /* k_th block row */
1168       for (j=0; j<bs; j++) sum[j]=0.0;
1169       /*-- col sum --*/
1170       i = jl[k]; /* first |A(i,k)| to be added */
1171       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1172                   at step k */
1173       while (i<mbs){
1174         nexti = jl[i];  /* next block row to be added */
1175         ik    = il[i];  /* block index of A(i,k) in the array a */
1176         for (j=0; j<bs; j++){
1177           v = a->a + ik*bs2 + j*bs;
1178           for (k1=0; k1<bs; k1++) {
1179             sum[j] += PetscAbsScalar(*v); v++;
1180           }
1181         }
1182         /* update il, jl */
1183         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1184         jmax = a->i[i+1];
1185         if (jmin < jmax){
1186           il[i] = jmin;
1187           j   = a->j[jmin];
1188           jl[i] = jl[j]; jl[j]=i;
1189         }
1190         i = nexti;
1191       }
1192       /*-- row sum --*/
1193       jmin = a->i[k]; jmax = a->i[k+1];
1194       for (i=jmin; i<jmax; i++) {
1195         for (j=0; j<bs; j++){
1196           v = a->a + i*bs2 + j;
1197           for (k1=0; k1<bs; k1++){
1198             sum[j] += PetscAbsScalar(*v); v += bs;
1199           }
1200         }
1201       }
1202       /* add k_th block row to il, jl */
1203       col = aj+jmin;
1204       if (*col == k) jmin++;
1205       if (jmin < jmax){
1206         il[k] = jmin;
1207         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1208       }
1209       for (j=0; j<bs; j++){
1210         if (sum[j] > *norm) *norm = sum[j];
1211       }
1212     }
1213     ierr = PetscFree(il);CHKERRQ(ierr);
1214   } else {
1215     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1216   }
1217   PetscFunctionReturn(0);
1218 }
1219 
1220 #undef __FUNCT__
1221 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1222 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1223 {
1224   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1225   PetscErrorCode ierr;
1226 
1227   PetscFunctionBegin;
1228 
1229   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1230   if ((A->rmap.N != B->rmap.N) || (A->cmap.n != B->cmap.n) || (A->rmap.bs != B->rmap.bs)|| (a->nz != b->nz)) {
1231     *flg = PETSC_FALSE;
1232     PetscFunctionReturn(0);
1233   }
1234 
1235   /* if the a->i are the same */
1236   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1237   if (!*flg) {
1238     PetscFunctionReturn(0);
1239   }
1240 
1241   /* if a->j are the same */
1242   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1243   if (!*flg) {
1244     PetscFunctionReturn(0);
1245   }
1246   /* if a->a are the same */
1247   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap.bs)*(A->rmap.bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1248   PetscFunctionReturn(0);
1249 }
1250 
1251 #undef __FUNCT__
1252 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1253 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1254 {
1255   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1256   PetscErrorCode ierr;
1257   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1258   PetscScalar    *x,zero = 0.0;
1259   MatScalar      *aa,*aa_j;
1260 
1261   PetscFunctionBegin;
1262   bs   = A->rmap.bs;
1263   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1264 
1265   aa   = a->a;
1266   ai   = a->i;
1267   aj   = a->j;
1268   ambs = a->mbs;
1269   bs2  = a->bs2;
1270 
1271   ierr = VecSet(v,zero);CHKERRQ(ierr);
1272   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1273   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1274   if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1275   for (i=0; i<ambs; i++) {
1276     j=ai[i];
1277     if (aj[j] == i) {             /* if this is a diagonal element */
1278       row  = i*bs;
1279       aa_j = aa + j*bs2;
1280       if (A->factor && bs==1){
1281         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1282       } else {
1283         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1284       }
1285     }
1286   }
1287 
1288   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1289   PetscFunctionReturn(0);
1290 }
1291 
1292 #undef __FUNCT__
1293 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1294 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1295 {
1296   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1297   PetscScalar    *l,x,*li,*ri;
1298   MatScalar      *aa,*v;
1299   PetscErrorCode ierr;
1300   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1301   PetscTruth     flg;
1302 
1303   PetscFunctionBegin;
1304   if (ll != rr){
1305     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1306     if (!flg)
1307       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1308   }
1309   if (!ll) PetscFunctionReturn(0);
1310   ai  = a->i;
1311   aj  = a->j;
1312   aa  = a->a;
1313   m   = A->rmap.N;
1314   bs  = A->rmap.bs;
1315   mbs = a->mbs;
1316   bs2 = a->bs2;
1317 
1318   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1319   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1320   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1321   for (i=0; i<mbs; i++) { /* for each block row */
1322     M  = ai[i+1] - ai[i];
1323     li = l + i*bs;
1324     v  = aa + bs2*ai[i];
1325     for (j=0; j<M; j++) { /* for each block */
1326       ri = l + bs*aj[ai[i]+j];
1327       for (k=0; k<bs; k++) {
1328         x = ri[k];
1329         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1330       }
1331     }
1332   }
1333   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1334   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
1335   PetscFunctionReturn(0);
1336 }
1337 
1338 #undef __FUNCT__
1339 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1340 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1341 {
1342   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1343 
1344   PetscFunctionBegin;
1345   info->rows_global    = (double)A->rmap.N;
1346   info->columns_global = (double)A->rmap.N;
1347   info->rows_local     = (double)A->rmap.N;
1348   info->columns_local  = (double)A->rmap.N;
1349   info->block_size     = a->bs2;
1350   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1351   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1352   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1353   info->assemblies   = A->num_ass;
1354   info->mallocs      = a->reallocs;
1355   info->memory       = A->mem;
1356   if (A->factor) {
1357     info->fill_ratio_given  = A->info.fill_ratio_given;
1358     info->fill_ratio_needed = A->info.fill_ratio_needed;
1359     info->factor_mallocs    = A->info.factor_mallocs;
1360   } else {
1361     info->fill_ratio_given  = 0;
1362     info->fill_ratio_needed = 0;
1363     info->factor_mallocs    = 0;
1364   }
1365   PetscFunctionReturn(0);
1366 }
1367 
1368 
1369 #undef __FUNCT__
1370 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1371 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1372 {
1373   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1374   PetscErrorCode ierr;
1375 
1376   PetscFunctionBegin;
1377   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1378   PetscFunctionReturn(0);
1379 }
1380 
1381 #undef __FUNCT__
1382 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1383 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1384 {
1385   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1386   PetscErrorCode ierr;
1387   PetscInt i,j,n,row,col,bs,*ai,*aj,mbs;
1388   PetscReal    atmp;
1389   MatScalar    *aa;
1390   PetscScalar  zero = 0.0,*x;
1391   PetscInt          ncols,brow,bcol,krow,kcol;
1392 
1393   PetscFunctionBegin;
1394   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1395   bs   = A->rmap.bs;
1396   aa   = a->a;
1397   ai   = a->i;
1398   aj   = a->j;
1399   mbs = a->mbs;
1400 
1401   ierr = VecSet(v,zero);CHKERRQ(ierr);
1402   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1403   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1404   if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1405   for (i=0; i<mbs; i++) {
1406     ncols = ai[1] - ai[0]; ai++;
1407     brow  = bs*i;
1408     for (j=0; j<ncols; j++){
1409       bcol = bs*(*aj);
1410       for (kcol=0; kcol<bs; kcol++){
1411         col = bcol + kcol;      /* col index */
1412         for (krow=0; krow<bs; krow++){
1413           atmp = PetscAbsScalar(*aa); aa++;
1414           row = brow + krow;    /* row index */
1415           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1416           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1417         }
1418       }
1419       aj++;
1420     }
1421   }
1422   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1423   PetscFunctionReturn(0);
1424 }
1425