xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 6849ba73f22fecb8f92ef896a42e4e8bd4cd6965)
1 
2 #include "src/mat/impls/baij/seq/baij.h"
3 #include "src/inline/spops.h"
4 #include "src/inline/ilu.h"
5 #include "petscbt.h"
6 #include "src/mat/impls/sbaij/seq/sbaij.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
11 {
12   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   int          brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,
15                start,end,*ai,*aj,bs,*nidx2;
16   PetscBT      table;
17   PetscBT      table0;
18 
19   PetscFunctionBegin;
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = a->bs;
24 
25   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26 
27   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
28   ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
29   ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr);
30   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
31 
32   for (i=0; i<is_max; i++) { /* for each is */
33     isz  = 0;
34     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
38     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
39 
40     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
41     bcol_max = 0;
42     for (j=0; j<n ; ++j){
43       brow = idx[j]/bs; /* convert the indices into block indices */
44       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
45       if(!PetscBTLookupSet(table,brow)) {
46         nidx[isz++] = brow;
47         if (bcol_max < brow) bcol_max = brow;
48       }
49     }
50     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
51     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
52 
53     k = 0;
54     for (j=0; j<ov; j++){ /* for each overlap */
55       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
56       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
57       for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]);
58 
59       n = isz;  /* length of the updated is[i] */
60       for (brow=0; brow<mbs; brow++){
61         start = ai[brow]; end   = ai[brow+1];
62         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
63           for (l = start; l<end ; l++){
64             bcol = aj[l];
65             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
66           }
67           k++;
68           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70           for (l = start; l<end ; l++){
71             bcol = aj[l];
72             if (bcol > bcol_max) break;
73             if (PetscBTLookup(table0,bcol)){
74               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
75               break; /* for l = start; l<end ; l++) */
76             }
77           }
78         }
79       }
80     } /* for each overlap */
81 
82     /* expand the Index Set */
83     for (j=0; j<isz; j++) {
84       for (k=0; k<bs; k++)
85         nidx2[j*bs+k] = nidx[j]*bs+k;
86     }
87     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
88   }
89   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
90   ierr = PetscFree(nidx);CHKERRQ(ierr);
91   ierr = PetscFree(nidx2);CHKERRQ(ierr);
92   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
93   PetscFunctionReturn(0);
94 }
95 
96 #undef __FUNCT__
97 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
98 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
99 {
100   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
101   PetscErrorCode ierr;
102   int          *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
103   int          row,mat_i,*mat_j,tcol,*mat_ilen;
104   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
105   int          *aj = a->j,*ai = a->i;
106   MatScalar    *mat_a;
107   Mat          C;
108   PetscTruth   flag;
109 
110   PetscFunctionBegin;
111 
112   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
113   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
114   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
115 
116   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
117   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
118 
119   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
120   ssmap = smap;
121   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
122   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
123   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
124   /* determine lens of each row */
125   for (i=0; i<nrows; i++) {
126     kstart  = ai[irow[i]];
127     kend    = kstart + a->ilen[irow[i]];
128     lens[i] = 0;
129       for (k=kstart; k<kend; k++) {
130         if (ssmap[aj[k]]) {
131           lens[i]++;
132         }
133       }
134     }
135   /* Create and fill new matrix */
136   if (scall == MAT_REUSE_MATRIX) {
137     c = (Mat_SeqSBAIJ *)((*B)->data);
138 
139     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
140     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
141     if (flag == PETSC_FALSE) {
142       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
143     }
144     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
145     C = *B;
146   } else {
147     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
148     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
149     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
150   }
151   c = (Mat_SeqSBAIJ *)(C->data);
152   for (i=0; i<nrows; i++) {
153     row    = irow[i];
154     kstart = ai[row];
155     kend   = kstart + a->ilen[row];
156     mat_i  = c->i[i];
157     mat_j  = c->j + mat_i;
158     mat_a  = c->a + mat_i*bs2;
159     mat_ilen = c->ilen + i;
160     for (k=kstart; k<kend; k++) {
161       if ((tcol=ssmap[a->j[k]])) {
162         *mat_j++ = tcol - 1;
163         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
164         mat_a   += bs2;
165         (*mat_ilen)++;
166       }
167     }
168   }
169 
170   /* Free work space */
171   ierr = PetscFree(smap);CHKERRQ(ierr);
172   ierr = PetscFree(lens);CHKERRQ(ierr);
173   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
174   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
175 
176   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
177   *B = C;
178   PetscFunctionReturn(0);
179 }
180 
181 #undef __FUNCT__
182 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
183 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
184 {
185   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
186   IS          is1;
187   PetscErrorCode ierr;
188   int         *vary,*iary,*irow,nrows,i,bs=a->bs,count;
189 
190   PetscFunctionBegin;
191   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
192 
193   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
194   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
195 
196   /* Verify if the indices corespond to each element in a block
197    and form the IS with compressed IS */
198   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
199   iary = vary + a->mbs;
200   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
201   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
202 
203   count = 0;
204   for (i=0; i<a->mbs; i++) {
205     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
206     if (vary[i]==bs) iary[count++] = i;
207   }
208   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
209 
210   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
211   ierr = PetscFree(vary);CHKERRQ(ierr);
212 
213   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
214   ISDestroy(is1);
215   PetscFunctionReturn(0);
216 }
217 
218 #undef __FUNCT__
219 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
220 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
221 {
222   PetscErrorCode ierr;
223   int i;
224 
225   PetscFunctionBegin;
226   if (scall == MAT_INITIAL_MATRIX) {
227     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
228   }
229 
230   for (i=0; i<n; i++) {
231     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
232   }
233   PetscFunctionReturn(0);
234 }
235 
236 /* -------------------------------------------------------*/
237 /* Should check that shapes of vectors and matrices match */
238 /* -------------------------------------------------------*/
239 #include "petscblaslapack.h"
240 
241 #undef __FUNCT__
242 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
243 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
244 {
245   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
246   PetscScalar     *x,*z,*xb,x1,zero=0.0;
247   MatScalar       *v;
248   PetscErrorCode ierr;
249   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
250 
251   PetscFunctionBegin;
252   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
253   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
254   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
255 
256   v  = a->a;
257   xb = x;
258 
259   for (i=0; i<mbs; i++) {
260     n  = ai[1] - ai[0];  /* length of i_th row of A */
261     x1 = xb[0];
262     ib = aj + *ai;
263     jmin = 0;
264     if (*ib == i) {      /* (diag of A)*x */
265       z[i] += *v++ * x[*ib++];
266       jmin++;
267     }
268     for (j=jmin; j<n; j++) {
269       cval    = *ib;
270       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
271       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
272     }
273     xb++; ai++;
274   }
275 
276   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
277   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
278   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
279   PetscFunctionReturn(0);
280 }
281 
282 #undef __FUNCT__
283 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
284 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
285 {
286   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
287   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
288   MatScalar        *v;
289   PetscErrorCode ierr;
290   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
291 
292 
293   PetscFunctionBegin;
294   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
295   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
296   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
297 
298   v     = a->a;
299   xb = x;
300 
301   for (i=0; i<mbs; i++) {
302     n  = ai[1] - ai[0]; /* length of i_th block row of A */
303     x1 = xb[0]; x2 = xb[1];
304     ib = aj + *ai;
305     jmin = 0;
306     if (*ib == i){     /* (diag of A)*x */
307       z[2*i]   += v[0]*x1 + v[2]*x2;
308       z[2*i+1] += v[2]*x1 + v[3]*x2;
309       v += 4; jmin++;
310     }
311     for (j=jmin; j<n; j++) {
312       /* (strict lower triangular part of A)*x  */
313       cval       = ib[j]*2;
314       z[cval]     += v[0]*x1 + v[1]*x2;
315       z[cval+1]   += v[2]*x1 + v[3]*x2;
316       /* (strict upper triangular part of A)*x  */
317       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
318       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
319       v  += 4;
320     }
321     xb +=2; ai++;
322   }
323 
324   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
325   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
326   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
327   PetscFunctionReturn(0);
328 }
329 
330 #undef __FUNCT__
331 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
332 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
333 {
334   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
335   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
336   MatScalar     *v;
337   PetscErrorCode ierr;
338   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
339 
340 
341   PetscFunctionBegin;
342   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
343   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
344   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
345 
346   v     = a->a;
347   xb = x;
348 
349   for (i=0; i<mbs; i++) {
350     n  = ai[1] - ai[0]; /* length of i_th block row of A */
351     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
352     ib = aj + *ai;
353     jmin = 0;
354     if (*ib == i){     /* (diag of A)*x */
355       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
356       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
357       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
358       v += 9; jmin++;
359     }
360     for (j=jmin; j<n; j++) {
361       /* (strict lower triangular part of A)*x  */
362       cval       = ib[j]*3;
363       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
364       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
365       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
366       /* (strict upper triangular part of A)*x  */
367       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
368       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
369       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
370       v  += 9;
371     }
372     xb +=3; ai++;
373   }
374 
375   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
376   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
377   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
378   PetscFunctionReturn(0);
379 }
380 
381 #undef __FUNCT__
382 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
383 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
384 {
385   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
386   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
387   MatScalar        *v;
388   PetscErrorCode ierr;
389   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
390 
391   PetscFunctionBegin;
392   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
393   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
394   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
395 
396   v     = a->a;
397   xb = x;
398 
399   for (i=0; i<mbs; i++) {
400     n  = ai[1] - ai[0]; /* length of i_th block row of A */
401     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
402     ib = aj + *ai;
403     jmin = 0;
404     if (*ib == i){     /* (diag of A)*x */
405       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
406       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
407       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
408       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
409       v += 16; jmin++;
410     }
411     for (j=jmin; j<n; j++) {
412       /* (strict lower triangular part of A)*x  */
413       cval       = ib[j]*4;
414       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
415       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
416       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
417       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
418       /* (strict upper triangular part of A)*x  */
419       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
420       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
421       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
422       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
423       v  += 16;
424     }
425     xb +=4; ai++;
426   }
427 
428   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
429   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
430   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
431   PetscFunctionReturn(0);
432 }
433 
434 #undef __FUNCT__
435 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
436 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
437 {
438   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
439   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
440   MatScalar       *v;
441   PetscErrorCode ierr;
442   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
443 
444   PetscFunctionBegin;
445   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
446   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
447   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
448 
449   v     = a->a;
450   xb = x;
451 
452   for (i=0; i<mbs; i++) {
453     n  = ai[1] - ai[0]; /* length of i_th block row of A */
454     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
455     ib = aj + *ai;
456     jmin = 0;
457     if (*ib == i){      /* (diag of A)*x */
458       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
459       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
460       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
461       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
462       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
463       v += 25; jmin++;
464     }
465     for (j=jmin; j<n; j++) {
466       /* (strict lower triangular part of A)*x  */
467       cval       = ib[j]*5;
468       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
469       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
470       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
471       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
472       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
473       /* (strict upper triangular part of A)*x  */
474       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
475       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
476       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
477       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
478       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
479       v  += 25;
480     }
481     xb +=5; ai++;
482   }
483 
484   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
485   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
486   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
487   PetscFunctionReturn(0);
488 }
489 
490 
491 #undef __FUNCT__
492 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
493 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
494 {
495   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
496   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
497   MatScalar       *v;
498   PetscErrorCode ierr;
499   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
500 
501   PetscFunctionBegin;
502   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
503   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
504   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
505 
506   v     = a->a;
507   xb = x;
508 
509   for (i=0; i<mbs; i++) {
510     n  = ai[1] - ai[0]; /* length of i_th block row of A */
511     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
512     ib = aj + *ai;
513     jmin = 0;
514     if (*ib == i){      /* (diag of A)*x */
515       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
516       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
517       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
518       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
519       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
520       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
521       v += 36; jmin++;
522     }
523     for (j=jmin; j<n; j++) {
524       /* (strict lower triangular part of A)*x  */
525       cval       = ib[j]*6;
526       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
527       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
528       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
529       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
530       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
531       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
532       /* (strict upper triangular part of A)*x  */
533       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
534       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
535       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
536       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
537       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
538       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
539       v  += 36;
540     }
541     xb +=6; ai++;
542   }
543 
544   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
545   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
546   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
547   PetscFunctionReturn(0);
548 }
549 #undef __FUNCT__
550 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
551 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
552 {
553   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
554   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
555   MatScalar       *v;
556   PetscErrorCode ierr;
557   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
558 
559   PetscFunctionBegin;
560   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
561   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
562   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
563 
564   v     = a->a;
565   xb = x;
566 
567   for (i=0; i<mbs; i++) {
568     n  = ai[1] - ai[0]; /* length of i_th block row of A */
569     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
570     ib = aj + *ai;
571     jmin = 0;
572     if (*ib == i){      /* (diag of A)*x */
573       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
574       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
575       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
576       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
577       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
578       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
579       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
580       v += 49; jmin++;
581     }
582     for (j=jmin; j<n; j++) {
583       /* (strict lower triangular part of A)*x  */
584       cval       = ib[j]*7;
585       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
586       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
587       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
588       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
589       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
590       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
591       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
592       /* (strict upper triangular part of A)*x  */
593       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
594       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
595       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
596       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
597       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
598       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
599       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
600       v  += 49;
601     }
602     xb +=7; ai++;
603   }
604   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
605   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
606   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
607   PetscFunctionReturn(0);
608 }
609 
610 /*
611     This will not work with MatScalar == float because it calls the BLAS
612 */
613 #undef __FUNCT__
614 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
615 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
616 {
617   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
618   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
619   MatScalar       *v;
620   PetscErrorCode ierr;
621   int mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
622   int             ncols,k;
623 
624   PetscFunctionBegin;
625   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
626   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
627   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
628 
629   aj   = a->j;
630   v    = a->a;
631   ii   = a->i;
632 
633   if (!a->mult_work) {
634     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
635   }
636   work = a->mult_work;
637 
638   for (i=0; i<mbs; i++) {
639     n     = ii[1] - ii[0]; ncols = n*bs;
640     workt = work; idx=aj+ii[0];
641 
642     /* upper triangular part */
643     for (j=0; j<n; j++) {
644       xb = x_ptr + bs*(*idx++);
645       for (k=0; k<bs; k++) workt[k] = xb[k];
646       workt += bs;
647     }
648     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
649     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
650 
651     /* strict lower triangular part */
652     idx = aj+ii[0];
653     if (*idx == i){
654       ncols -= bs; v += bs2; idx++; n--;
655     }
656 
657     if (ncols > 0){
658       workt = work;
659       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
660       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
661       for (j=0; j<n; j++) {
662         zb = z_ptr + bs*(*idx++);
663         for (k=0; k<bs; k++) zb[k] += workt[k] ;
664         workt += bs;
665       }
666     }
667     x += bs; v += n*bs2; z += bs; ii++;
668   }
669 
670   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
671   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
672   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
673   PetscFunctionReturn(0);
674 }
675 
676 #undef __FUNCT__
677 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
678 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
679 {
680   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
681   PetscScalar     *x,*y,*z,*xb,x1;
682   MatScalar       *v;
683   PetscErrorCode ierr;
684   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
685 
686   PetscFunctionBegin;
687   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
688   if (yy != xx) {
689     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
690   } else {
691     y = x;
692   }
693   if (zz != yy) {
694     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
695     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
696     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
697   } else {
698     z = y;
699   }
700 
701   v  = a->a;
702   xb = x;
703 
704   for (i=0; i<mbs; i++) {
705     n  = ai[1] - ai[0];  /* length of i_th row of A */
706     x1 = xb[0];
707     ib = aj + *ai;
708     jmin = 0;
709     if (*ib == i) {            /* (diag of A)*x */
710       z[i] += *v++ * x[*ib++]; jmin++;
711     }
712     for (j=jmin; j<n; j++) {
713       cval    = *ib;
714       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
715       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
716     }
717     xb++; ai++;
718   }
719 
720   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
721   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
722   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
723 
724   PetscLogFlops(2*(a->nz*2 - A->m));
725   PetscFunctionReturn(0);
726 }
727 
728 #undef __FUNCT__
729 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
730 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
731 {
732   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
733   PetscScalar     *x,*y,*z,*xb,x1,x2;
734   MatScalar       *v;
735   PetscErrorCode ierr;
736   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
737 
738   PetscFunctionBegin;
739   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
740   if (yy != xx) {
741     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
742   } else {
743     y = x;
744   }
745   if (zz != yy) {
746     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
747     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
748     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
749   } else {
750     z = y;
751   }
752 
753   v     = a->a;
754   xb = x;
755 
756   for (i=0; i<mbs; i++) {
757     n  = ai[1] - ai[0]; /* length of i_th block row of A */
758     x1 = xb[0]; x2 = xb[1];
759     ib = aj + *ai;
760     jmin = 0;
761     if (*ib == i){      /* (diag of A)*x */
762       z[2*i]   += v[0]*x1 + v[2]*x2;
763       z[2*i+1] += v[2]*x1 + v[3]*x2;
764       v += 4; jmin++;
765     }
766     for (j=jmin; j<n; j++) {
767       /* (strict lower triangular part of A)*x  */
768       cval       = ib[j]*2;
769       z[cval]     += v[0]*x1 + v[1]*x2;
770       z[cval+1]   += v[2]*x1 + v[3]*x2;
771       /* (strict upper triangular part of A)*x  */
772       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
773       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
774       v  += 4;
775     }
776     xb +=2; ai++;
777   }
778 
779   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
780   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
781   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
782 
783   PetscLogFlops(4*(a->nz*2 - A->m));
784   PetscFunctionReturn(0);
785 }
786 
787 #undef __FUNCT__
788 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
789 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
790 {
791   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
792   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
793   MatScalar       *v;
794   PetscErrorCode ierr;
795   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
796 
797   PetscFunctionBegin;
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   if (yy != xx) {
800     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
801   } else {
802     y = x;
803   }
804   if (zz != yy) {
805     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
806     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
807     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
808   } else {
809     z = y;
810   }
811 
812   v     = a->a;
813   xb = x;
814 
815   for (i=0; i<mbs; i++) {
816     n  = ai[1] - ai[0]; /* length of i_th block row of A */
817     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
818     ib = aj + *ai;
819     jmin = 0;
820     if (*ib == i){     /* (diag of A)*x */
821      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
822      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
823      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
824      v += 9; jmin++;
825     }
826     for (j=jmin; j<n; j++) {
827       /* (strict lower triangular part of A)*x  */
828       cval       = ib[j]*3;
829       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
830       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
831       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
832       /* (strict upper triangular part of A)*x  */
833       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
834       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
835       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
836       v  += 9;
837     }
838     xb +=3; ai++;
839   }
840 
841   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
842   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
843   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
844 
845   PetscLogFlops(18*(a->nz*2 - A->m));
846   PetscFunctionReturn(0);
847 }
848 
849 #undef __FUNCT__
850 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
851 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
852 {
853   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
854   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
855   MatScalar       *v;
856   PetscErrorCode ierr;
857   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
858 
859   PetscFunctionBegin;
860   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
861   if (yy != xx) {
862     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
863   } else {
864     y = x;
865   }
866   if (zz != yy) {
867     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
868     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
869     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
870   } else {
871     z = y;
872   }
873 
874   v     = a->a;
875   xb = x;
876 
877   for (i=0; i<mbs; i++) {
878     n  = ai[1] - ai[0]; /* length of i_th block row of A */
879     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
880     ib = aj + *ai;
881     jmin = 0;
882     if (*ib == i){      /* (diag of A)*x */
883       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
884       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
885       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
886       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
887       v += 16; jmin++;
888     }
889     for (j=jmin; j<n; j++) {
890       /* (strict lower triangular part of A)*x  */
891       cval       = ib[j]*4;
892       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
893       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
894       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
895       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
896       /* (strict upper triangular part of A)*x  */
897       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
898       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
899       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
900       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
901       v  += 16;
902     }
903     xb +=4; ai++;
904   }
905 
906   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
907   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
908   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
909 
910   PetscLogFlops(32*(a->nz*2 - A->m));
911   PetscFunctionReturn(0);
912 }
913 
914 #undef __FUNCT__
915 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
916 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
917 {
918   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
919   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
920   MatScalar       *v;
921   PetscErrorCode ierr;
922   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
923 
924   PetscFunctionBegin;
925   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
926   if (yy != xx) {
927     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
928   } else {
929     y = x;
930   }
931   if (zz != yy) {
932     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
933     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
934     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
935   } else {
936     z = y;
937   }
938 
939   v     = a->a;
940   xb = x;
941 
942   for (i=0; i<mbs; i++) {
943     n  = ai[1] - ai[0]; /* length of i_th block row of A */
944     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
945     ib = aj + *ai;
946     jmin = 0;
947     if (*ib == i){      /* (diag of A)*x */
948       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
949       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
950       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
951       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
952       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
953       v += 25; jmin++;
954     }
955     for (j=jmin; j<n; j++) {
956       /* (strict lower triangular part of A)*x  */
957       cval       = ib[j]*5;
958       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
959       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
960       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
961       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
962       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
963       /* (strict upper triangular part of A)*x  */
964       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
965       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
966       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
967       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
968       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
969       v  += 25;
970     }
971     xb +=5; ai++;
972   }
973 
974   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
975   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
976   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
977 
978   PetscLogFlops(50*(a->nz*2 - A->m));
979   PetscFunctionReturn(0);
980 }
981 #undef __FUNCT__
982 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
983 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
984 {
985   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
986   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
987   MatScalar       *v;
988   PetscErrorCode ierr;
989   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
990 
991   PetscFunctionBegin;
992   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
993   if (yy != xx) {
994     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
995   } else {
996     y = x;
997   }
998   if (zz != yy) {
999     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1000     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1001     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1002   } else {
1003     z = y;
1004   }
1005 
1006   v     = a->a;
1007   xb = x;
1008 
1009   for (i=0; i<mbs; i++) {
1010     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1011     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1012     ib = aj + *ai;
1013     jmin = 0;
1014     if (*ib == i){     /* (diag of A)*x */
1015       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1016       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1017       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1018       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1019       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1020       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1021       v += 36; jmin++;
1022     }
1023     for (j=jmin; j<n; j++) {
1024       /* (strict lower triangular part of A)*x  */
1025       cval       = ib[j]*6;
1026       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1027       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1028       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1029       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1030       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1031       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1032       /* (strict upper triangular part of A)*x  */
1033       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1034       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1035       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1036       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1037       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1038       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1039       v  += 36;
1040     }
1041     xb +=6; ai++;
1042   }
1043 
1044   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1045   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1046   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1047 
1048   PetscLogFlops(72*(a->nz*2 - A->m));
1049   PetscFunctionReturn(0);
1050 }
1051 
1052 #undef __FUNCT__
1053 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1054 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1055 {
1056   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1057   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1058   MatScalar       *v;
1059   PetscErrorCode ierr;
1060   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
1061 
1062   PetscFunctionBegin;
1063   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1064   if (yy != xx) {
1065     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1066   } else {
1067     y = x;
1068   }
1069   if (zz != yy) {
1070     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1071     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1072     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1073   } else {
1074     z = y;
1075   }
1076 
1077   v     = a->a;
1078   xb = x;
1079 
1080   for (i=0; i<mbs; i++) {
1081     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1082     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1083     ib = aj + *ai;
1084     jmin = 0;
1085     if (*ib == i){     /* (diag of A)*x */
1086       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1087       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1088       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1089       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1090       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1091       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1092       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1093       v += 49; jmin++;
1094     }
1095     for (j=jmin; j<n; j++) {
1096       /* (strict lower triangular part of A)*x  */
1097       cval       = ib[j]*7;
1098       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1099       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1100       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1101       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1102       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1103       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1104       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1105       /* (strict upper triangular part of A)*x  */
1106       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1107       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1108       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1109       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1110       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1111       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1112       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1113       v  += 49;
1114     }
1115     xb +=7; ai++;
1116   }
1117 
1118   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1119   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1120   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1121 
1122   PetscLogFlops(98*(a->nz*2 - A->m));
1123   PetscFunctionReturn(0);
1124 }
1125 
1126 #undef __FUNCT__
1127 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1128 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1129 {
1130   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1131   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1132   MatScalar       *v;
1133   PetscErrorCode ierr;
1134   int mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1135   int             ncols,k;
1136 
1137   PetscFunctionBegin;
1138   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1139   if (yy != xx) {
1140     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1141   } else {
1142     y = x;
1143   }
1144   if (zz != yy) {
1145     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1146     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1147     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1148   } else {
1149     z = y;
1150   }
1151 
1152   aj   = a->j;
1153   v    = a->a;
1154   ii   = a->i;
1155 
1156   if (!a->mult_work) {
1157     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1158   }
1159   work = a->mult_work;
1160 
1161 
1162   for (i=0; i<mbs; i++) {
1163     n     = ii[1] - ii[0]; ncols = n*bs;
1164     workt = work; idx=aj+ii[0];
1165 
1166     /* upper triangular part */
1167     for (j=0; j<n; j++) {
1168       xb = x_ptr + bs*(*idx++);
1169       for (k=0; k<bs; k++) workt[k] = xb[k];
1170       workt += bs;
1171     }
1172     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1173     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1174 
1175     /* strict lower triangular part */
1176     idx = aj+ii[0];
1177     if (*idx == i){
1178       ncols -= bs; v += bs2; idx++; n--;
1179     }
1180     if (ncols > 0){
1181       workt = work;
1182       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1183       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1184       for (j=0; j<n; j++) {
1185         zb = z_ptr + bs*(*idx++);
1186         /* idx++; */
1187         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1188         workt += bs;
1189       }
1190     }
1191 
1192     x += bs; v += n*bs2; z += bs; ii++;
1193   }
1194 
1195   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1196   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1197   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1198 
1199   PetscLogFlops(2*(a->nz*2 - A->m));
1200   PetscFunctionReturn(0);
1201 }
1202 
1203 #undef __FUNCT__
1204 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1205 PetscErrorCode MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1206 {
1207   PetscErrorCode ierr;
1208 
1209   PetscFunctionBegin;
1210   ierr = MatMult(A,xx,zz);CHKERRQ(ierr);
1211   PetscFunctionReturn(0);
1212 }
1213 
1214 #undef __FUNCT__
1215 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1216 PetscErrorCode MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1217 
1218 {
1219   PetscErrorCode ierr;
1220 
1221   PetscFunctionBegin;
1222   ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr);
1223   PetscFunctionReturn(0);
1224 }
1225 
1226 #undef __FUNCT__
1227 #define __FUNCT__ "MatScale_SeqSBAIJ"
1228 PetscErrorCode MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1229 {
1230   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1231   int         one = 1,totalnz = a->bs2*a->nz;
1232 
1233   PetscFunctionBegin;
1234   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1235   PetscLogFlops(totalnz);
1236   PetscFunctionReturn(0);
1237 }
1238 
1239 #undef __FUNCT__
1240 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1241 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1242 {
1243   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1244   MatScalar   *v = a->a;
1245   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1246   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1247   PetscErrorCode ierr;
1248   int         *jl,*il,jmin,jmax,nexti,ik,*col;
1249 
1250   PetscFunctionBegin;
1251   if (type == NORM_FROBENIUS) {
1252     for (k=0; k<mbs; k++){
1253       jmin = a->i[k]; jmax = a->i[k+1];
1254       col  = aj + jmin;
1255       if (*col == k){         /* diagonal block */
1256         for (i=0; i<bs2; i++){
1257 #if defined(PETSC_USE_COMPLEX)
1258           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1259 #else
1260           sum_diag += (*v)*(*v); v++;
1261 #endif
1262         }
1263         jmin++;
1264       }
1265       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1266         for (i=0; i<bs2; i++){
1267 #if defined(PETSC_USE_COMPLEX)
1268           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1269 #else
1270           sum_off += (*v)*(*v); v++;
1271 #endif
1272         }
1273       }
1274     }
1275     *norm = sqrt(sum_diag + 2*sum_off);
1276 
1277   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1278     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1279     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1280     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1281     for (i=0; i<mbs; i++) {
1282       jl[i] = mbs; il[0] = 0;
1283     }
1284 
1285     *norm = 0.0;
1286     for (k=0; k<mbs; k++) { /* k_th block row */
1287       for (j=0; j<bs; j++) sum[j]=0.0;
1288 
1289       /*-- col sum --*/
1290       i = jl[k]; /* first |A(i,k)| to be added */
1291       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1292                   at step k */
1293       while (i<mbs){
1294         nexti = jl[i];  /* next block row to be added */
1295         ik    = il[i];  /* block index of A(i,k) in the array a */
1296         for (j=0; j<bs; j++){
1297           v = a->a + ik*bs2 + j*bs;
1298           for (k1=0; k1<bs; k1++) {
1299             sum[j] += PetscAbsScalar(*v); v++;
1300           }
1301         }
1302         /* update il, jl */
1303         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1304         jmax = a->i[i+1];
1305         if (jmin < jmax){
1306           il[i] = jmin;
1307           j   = a->j[jmin];
1308           jl[i] = jl[j]; jl[j]=i;
1309         }
1310         i = nexti;
1311       }
1312 
1313       /*-- row sum --*/
1314       jmin = a->i[k]; jmax = a->i[k+1];
1315       for (i=jmin; i<jmax; i++) {
1316         for (j=0; j<bs; j++){
1317           v = a->a + i*bs2 + j;
1318           for (k1=0; k1<bs; k1++){
1319             sum[j] += PetscAbsScalar(*v);
1320             v   += bs;
1321           }
1322         }
1323       }
1324       /* add k_th block row to il, jl */
1325       col = aj+jmin;
1326       if (*col == k) jmin++;
1327       if (jmin < jmax){
1328         il[k] = jmin;
1329         j   = a->j[jmin];
1330         jl[k] = jl[j]; jl[j] = k;
1331       }
1332       for (j=0; j<bs; j++){
1333         if (sum[j] > *norm) *norm = sum[j];
1334       }
1335     }
1336     ierr = PetscFree(il);CHKERRQ(ierr);
1337     ierr = PetscFree(jl);CHKERRQ(ierr);
1338     ierr = PetscFree(sum);CHKERRQ(ierr);
1339   } else {
1340     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1341   }
1342   PetscFunctionReturn(0);
1343 }
1344 
1345 #undef __FUNCT__
1346 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1347 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1348 {
1349   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1350   PetscErrorCode ierr;
1351 
1352   PetscFunctionBegin;
1353 
1354   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1355   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1356     *flg = PETSC_FALSE;
1357     PetscFunctionReturn(0);
1358   }
1359 
1360   /* if the a->i are the same */
1361   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1362   if (*flg == PETSC_FALSE) {
1363     PetscFunctionReturn(0);
1364   }
1365 
1366   /* if a->j are the same */
1367   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr);
1368   if (*flg == PETSC_FALSE) {
1369     PetscFunctionReturn(0);
1370   }
1371   /* if a->a are the same */
1372   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1373 
1374   PetscFunctionReturn(0);
1375 }
1376 
1377 #undef __FUNCT__
1378 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1379 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1380 {
1381   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1382   PetscErrorCode ierr;
1383   int i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1384   PetscScalar  *x,zero = 0.0;
1385   MatScalar    *aa,*aa_j;
1386 
1387   PetscFunctionBegin;
1388   bs   = a->bs;
1389   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1390 
1391   aa   = a->a;
1392   ai   = a->i;
1393   aj   = a->j;
1394   ambs = a->mbs;
1395   bs2  = a->bs2;
1396 
1397   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1398   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1399   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1400   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1401   for (i=0; i<ambs; i++) {
1402     j=ai[i];
1403     if (aj[j] == i) {             /* if this is a diagonal element */
1404       row  = i*bs;
1405       aa_j = aa + j*bs2;
1406       if (A->factor && bs==1){
1407         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1408       } else {
1409         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1410       }
1411     }
1412   }
1413 
1414   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1415   PetscFunctionReturn(0);
1416 }
1417 
1418 #undef __FUNCT__
1419 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1420 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1421 {
1422   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1423   PetscScalar  *l,*r,x,*li,*ri;
1424   MatScalar    *aa,*v;
1425   PetscErrorCode ierr;
1426   int i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1427 
1428   PetscFunctionBegin;
1429   ai  = a->i;
1430   aj  = a->j;
1431   aa  = a->a;
1432   m   = A->m;
1433   bs  = a->bs;
1434   mbs = a->mbs;
1435   bs2 = a->bs2;
1436 
1437   if (ll != rr) {
1438     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1439   }
1440   if (ll) {
1441     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1442     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1443     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1444     for (i=0; i<mbs; i++) { /* for each block row */
1445       M  = ai[i+1] - ai[i];
1446       li = l + i*bs;
1447       v  = aa + bs2*ai[i];
1448       for (j=0; j<M; j++) { /* for each block */
1449         for (k=0; k<bs2; k++) {
1450           (*v++) *= li[k%bs];
1451         }
1452 #ifdef CONT
1453         /* will be used to replace the above loop */
1454         ri = l + bs*aj[ai[i]+j];
1455         for (k=0; k<bs; k++) { /* column value */
1456           x = ri[k];
1457           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1458         }
1459 #endif
1460 
1461       }
1462     }
1463     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1464     PetscLogFlops(2*a->nz);
1465   }
1466   /* will be deleted */
1467   if (rr) {
1468     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1469     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1470     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1471     for (i=0; i<mbs; i++) { /* for each block row */
1472       M  = ai[i+1] - ai[i];
1473       v  = aa + bs2*ai[i];
1474       for (j=0; j<M; j++) { /* for each block */
1475         ri = r + bs*aj[ai[i]+j];
1476         for (k=0; k<bs; k++) {
1477           x = ri[k];
1478           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1479         }
1480       }
1481     }
1482     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1483     PetscLogFlops(a->nz);
1484   }
1485   PetscFunctionReturn(0);
1486 }
1487 
1488 #undef __FUNCT__
1489 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1490 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1491 {
1492   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1493 
1494   PetscFunctionBegin;
1495   info->rows_global    = (double)A->m;
1496   info->columns_global = (double)A->m;
1497   info->rows_local     = (double)A->m;
1498   info->columns_local  = (double)A->m;
1499   info->block_size     = a->bs2;
1500   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1501   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1502   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1503   info->assemblies   = A->num_ass;
1504   info->mallocs      = a->reallocs;
1505   info->memory       = A->mem;
1506   if (A->factor) {
1507     info->fill_ratio_given  = A->info.fill_ratio_given;
1508     info->fill_ratio_needed = A->info.fill_ratio_needed;
1509     info->factor_mallocs    = A->info.factor_mallocs;
1510   } else {
1511     info->fill_ratio_given  = 0;
1512     info->fill_ratio_needed = 0;
1513     info->factor_mallocs    = 0;
1514   }
1515   PetscFunctionReturn(0);
1516 }
1517 
1518 
1519 #undef __FUNCT__
1520 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1521 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1522 {
1523   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1524   PetscErrorCode ierr;
1525 
1526   PetscFunctionBegin;
1527   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1528   PetscFunctionReturn(0);
1529 }
1530 
1531 #undef __FUNCT__
1532 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1533 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1534 {
1535   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1536   PetscErrorCode ierr;
1537   int i,j,n,row,col,bs,*ai,*aj,mbs;
1538   PetscReal    atmp;
1539   MatScalar    *aa;
1540   PetscScalar  zero = 0.0,*x;
1541   int          ncols,brow,bcol,krow,kcol;
1542 
1543   PetscFunctionBegin;
1544   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1545   bs   = a->bs;
1546   aa   = a->a;
1547   ai   = a->i;
1548   aj   = a->j;
1549   mbs = a->mbs;
1550 
1551   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1552   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1553   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1554   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1555   for (i=0; i<mbs; i++) {
1556     ncols = ai[1] - ai[0]; ai++;
1557     brow  = bs*i;
1558     for (j=0; j<ncols; j++){
1559       bcol = bs*(*aj);
1560       for (kcol=0; kcol<bs; kcol++){
1561         col = bcol + kcol;      /* col index */
1562         for (krow=0; krow<bs; krow++){
1563           atmp = PetscAbsScalar(*aa); aa++;
1564           row = brow + krow;    /* row index */
1565           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1566           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1567           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1568         }
1569       }
1570       aj++;
1571     }
1572   }
1573   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1574   PetscFunctionReturn(0);
1575 }
1576