xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision e2df7a95c5ea77c899beea10ff9effd6061e7c8f)
1 #define PETSCMAT_DLL
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/inline/spops.h"
5 #include "src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table,table0;
17 
18   PetscFunctionBegin;
19   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = A->bs;
24   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
25   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n ; ++j){
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if(!PetscBTLookupSet(table,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++){ /* for each overlap */
52       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table0,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++){
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end ; l++){
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
63           }
64           k++;
65           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
67           for (l = start; l<end ; l++){
68             bcol = aj[l];
69             if (bcol > bcol_max) break;
70             if (PetscBTLookup(table0,bcol)){
71               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
72               break; /* for l = start; l<end ; l++) */
73             }
74           }
75         }
76       }
77     } /* for each overlap */
78 
79     /* expand the Index Set */
80     for (j=0; j<isz; j++) {
81       for (k=0; k<bs; k++)
82         nidx2[j*bs+k] = nidx[j]*bs+k;
83     }
84     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
85   }
86   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
87   ierr = PetscFree(nidx);CHKERRQ(ierr);
88   ierr = PetscFree(nidx2);CHKERRQ(ierr);
89   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 #undef __FUNCT__
94 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
95 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
96 {
97   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
98   PetscErrorCode ierr;
99   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
100   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
101   PetscInt       *irow,nrows,*ssmap,bs=A->bs,bs2=a->bs2;
102   PetscInt       *aj = a->j,*ai = a->i;
103   MatScalar      *mat_a;
104   Mat            C;
105   PetscTruth     flag;
106 
107   PetscFunctionBegin;
108   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
109   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
110   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
111 
112   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
113   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
114 
115   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
116   ssmap = smap;
117   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
118   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
119   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
120   /* determine lens of each row */
121   for (i=0; i<nrows; i++) {
122     kstart  = ai[irow[i]];
123     kend    = kstart + a->ilen[irow[i]];
124     lens[i] = 0;
125       for (k=kstart; k<kend; k++) {
126         if (ssmap[aj[k]]) {
127           lens[i]++;
128         }
129       }
130     }
131   /* Create and fill new matrix */
132   if (scall == MAT_REUSE_MATRIX) {
133     c = (Mat_SeqSBAIJ *)((*B)->data);
134 
135     if (c->mbs!=nrows || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137     if (!flag) {
138       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
139     }
140     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
141     C = *B;
142   } else {
143     ierr = MatCreate(A->comm,&C);CHKERRQ(ierr);
144     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
145     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
146     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
147   }
148   c = (Mat_SeqSBAIJ *)(C->data);
149   for (i=0; i<nrows; i++) {
150     row    = irow[i];
151     kstart = ai[row];
152     kend   = kstart + a->ilen[row];
153     mat_i  = c->i[i];
154     mat_j  = c->j + mat_i;
155     mat_a  = c->a + mat_i*bs2;
156     mat_ilen = c->ilen + i;
157     for (k=kstart; k<kend; k++) {
158       if ((tcol=ssmap[a->j[k]])) {
159         *mat_j++ = tcol - 1;
160         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
161         mat_a   += bs2;
162         (*mat_ilen)++;
163       }
164     }
165   }
166 
167   /* Free work space */
168   ierr = PetscFree(smap);CHKERRQ(ierr);
169   ierr = PetscFree(lens);CHKERRQ(ierr);
170   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
171   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172 
173   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
174   *B = C;
175   PetscFunctionReturn(0);
176 }
177 
178 #undef __FUNCT__
179 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
180 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
181 {
182   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
183   IS             is1;
184   PetscErrorCode ierr;
185   PetscInt       *vary,*iary,*irow,nrows,i,bs=A->bs,count;
186 
187   PetscFunctionBegin;
188   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
189 
190   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
191   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
192 
193   /* Verify if the indices corespond to each element in a block
194    and form the IS with compressed IS */
195   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr);
196   iary = vary + a->mbs;
197   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
198   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
199 
200   count = 0;
201   for (i=0; i<a->mbs; i++) {
202     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
203     if (vary[i]==bs) iary[count++] = i;
204   }
205   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
206 
207   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
208   ierr = PetscFree(vary);CHKERRQ(ierr);
209 
210   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
211   ISDestroy(is1);
212   PetscFunctionReturn(0);
213 }
214 
215 #undef __FUNCT__
216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
218 {
219   PetscErrorCode ierr;
220   PetscInt       i;
221 
222   PetscFunctionBegin;
223   if (scall == MAT_INITIAL_MATRIX) {
224     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
225   }
226 
227   for (i=0; i<n; i++) {
228     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
229   }
230   PetscFunctionReturn(0);
231 }
232 
233 /* -------------------------------------------------------*/
234 /* Should check that shapes of vectors and matrices match */
235 /* -------------------------------------------------------*/
236 #include "petscblaslapack.h"
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
240 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
241 {
242   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
243   PetscScalar    *x,*z,*xb,x1,zero=0.0;
244   MatScalar      *v;
245   PetscErrorCode ierr;
246   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
247 
248   PetscFunctionBegin;
249   ierr = VecSet(zz,zero);CHKERRQ(ierr);
250   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
251   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
252 
253   v  = a->a;
254   xb = x;
255 
256   for (i=0; i<mbs; i++) {
257     n  = ai[1] - ai[0];  /* length of i_th row of A */
258     x1 = xb[0];
259     ib = aj + *ai;
260     jmin = 0;
261     if (*ib == i) {      /* (diag of A)*x */
262       z[i] += *v++ * x[*ib++];
263       jmin++;
264     }
265     for (j=jmin; j<n; j++) {
266       cval    = *ib;
267       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
268       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
269     }
270     xb++; ai++;
271   }
272 
273   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
274   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
275   ierr = PetscLogFlops(2*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);  /* nz = (nz+m)/2 */
276   PetscFunctionReturn(0);
277 }
278 
279 #undef __FUNCT__
280 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
281 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
282 {
283   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
284   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
285   MatScalar      *v;
286   PetscErrorCode ierr;
287   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
288 
289   PetscFunctionBegin;
290   ierr = VecSet(zz,zero);CHKERRQ(ierr);
291   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
292   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
293 
294   v     = a->a;
295   xb = x;
296 
297   for (i=0; i<mbs; i++) {
298     n  = ai[1] - ai[0]; /* length of i_th block row of A */
299     x1 = xb[0]; x2 = xb[1];
300     ib = aj + *ai;
301     jmin = 0;
302     if (*ib == i){     /* (diag of A)*x */
303       z[2*i]   += v[0]*x1 + v[2]*x2;
304       z[2*i+1] += v[2]*x1 + v[3]*x2;
305       v += 4; jmin++;
306     }
307     for (j=jmin; j<n; j++) {
308       /* (strict lower triangular part of A)*x  */
309       cval       = ib[j]*2;
310       z[cval]     += v[0]*x1 + v[1]*x2;
311       z[cval+1]   += v[2]*x1 + v[3]*x2;
312       /* (strict upper triangular part of A)*x  */
313       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
314       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
315       v  += 4;
316     }
317     xb +=2; ai++;
318   }
319 
320   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
321   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
322   ierr = PetscLogFlops(8*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
323   PetscFunctionReturn(0);
324 }
325 
326 #undef __FUNCT__
327 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
328 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
329 {
330   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
331   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
332   MatScalar      *v;
333   PetscErrorCode ierr;
334   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
335 
336   PetscFunctionBegin;
337   ierr = VecSet(zz,zero);CHKERRQ(ierr);
338   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
339   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
340 
341   v    = a->a;
342   xb   = x;
343 
344   for (i=0; i<mbs; i++) {
345     n  = ai[1] - ai[0]; /* length of i_th block row of A */
346     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
347     ib = aj + *ai;
348     jmin = 0;
349     if (*ib == i){     /* (diag of A)*x */
350       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
351       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
352       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
353       v += 9; jmin++;
354     }
355     for (j=jmin; j<n; j++) {
356       /* (strict lower triangular part of A)*x  */
357       cval       = ib[j]*3;
358       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
359       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
360       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
361       /* (strict upper triangular part of A)*x  */
362       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
363       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
364       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
365       v  += 9;
366     }
367     xb +=3; ai++;
368   }
369 
370   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
371   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
372   ierr = PetscLogFlops(18*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
378 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
379 {
380   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
381   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
382   MatScalar      *v;
383   PetscErrorCode ierr;
384   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
385 
386   PetscFunctionBegin;
387   ierr = VecSet(zz,zero);CHKERRQ(ierr);
388   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
389   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
390 
391   v     = a->a;
392   xb = x;
393 
394   for (i=0; i<mbs; i++) {
395     n  = ai[1] - ai[0]; /* length of i_th block row of A */
396     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
397     ib = aj + *ai;
398     jmin = 0;
399     if (*ib == i){     /* (diag of A)*x */
400       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
401       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
402       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
403       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
404       v += 16; jmin++;
405     }
406     for (j=jmin; j<n; j++) {
407       /* (strict lower triangular part of A)*x  */
408       cval       = ib[j]*4;
409       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
410       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
411       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
412       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
413       /* (strict upper triangular part of A)*x  */
414       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
415       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
416       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
417       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
418       v  += 16;
419     }
420     xb +=4; ai++;
421   }
422 
423   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
424   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
425   ierr = PetscLogFlops(32*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
426   PetscFunctionReturn(0);
427 }
428 
429 #undef __FUNCT__
430 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
431 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
432 {
433   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
434   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
435   MatScalar      *v;
436   PetscErrorCode ierr;
437   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
438 
439   PetscFunctionBegin;
440   ierr = VecSet(zz,zero);CHKERRQ(ierr);
441   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
442   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
443 
444   v     = a->a;
445   xb = x;
446 
447   for (i=0; i<mbs; i++) {
448     n  = ai[1] - ai[0]; /* length of i_th block row of A */
449     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
450     ib = aj + *ai;
451     jmin = 0;
452     if (*ib == i){      /* (diag of A)*x */
453       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
454       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
455       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
456       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
457       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
458       v += 25; jmin++;
459     }
460     for (j=jmin; j<n; j++) {
461       /* (strict lower triangular part of A)*x  */
462       cval       = ib[j]*5;
463       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
464       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
465       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
466       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
467       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
468       /* (strict upper triangular part of A)*x  */
469       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
470       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
471       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
472       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
473       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
474       v  += 25;
475     }
476     xb +=5; ai++;
477   }
478 
479   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
480   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
481   ierr = PetscLogFlops(50*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
482   PetscFunctionReturn(0);
483 }
484 
485 
486 #undef __FUNCT__
487 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
488 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
489 {
490   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
491   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
492   MatScalar      *v;
493   PetscErrorCode ierr;
494   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
495 
496   PetscFunctionBegin;
497   ierr = VecSet(zz,zero);CHKERRQ(ierr);
498   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
499   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
500 
501   v     = a->a;
502   xb = x;
503 
504   for (i=0; i<mbs; i++) {
505     n  = ai[1] - ai[0]; /* length of i_th block row of A */
506     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
507     ib = aj + *ai;
508     jmin = 0;
509     if (*ib == i){      /* (diag of A)*x */
510       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
511       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
512       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
513       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
514       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
515       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
516       v += 36; jmin++;
517     }
518     for (j=jmin; j<n; j++) {
519       /* (strict lower triangular part of A)*x  */
520       cval       = ib[j]*6;
521       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
522       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
523       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
524       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
525       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
526       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
527       /* (strict upper triangular part of A)*x  */
528       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
529       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
530       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
531       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
532       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
533       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
534       v  += 36;
535     }
536     xb +=6; ai++;
537   }
538 
539   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
540   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
541   ierr = PetscLogFlops(72*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
542   PetscFunctionReturn(0);
543 }
544 #undef __FUNCT__
545 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
546 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
547 {
548   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
549   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
550   MatScalar      *v;
551   PetscErrorCode ierr;
552   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
553 
554   PetscFunctionBegin;
555   ierr = VecSet(zz,zero);CHKERRQ(ierr);
556   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
557   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
558 
559   v     = a->a;
560   xb = x;
561 
562   for (i=0; i<mbs; i++) {
563     n  = ai[1] - ai[0]; /* length of i_th block row of A */
564     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
565     ib = aj + *ai;
566     jmin = 0;
567     if (*ib == i){      /* (diag of A)*x */
568       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
569       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
570       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
571       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
572       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
573       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
574       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
575       v += 49; jmin++;
576     }
577     for (j=jmin; j<n; j++) {
578       /* (strict lower triangular part of A)*x  */
579       cval       = ib[j]*7;
580       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
581       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
582       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
583       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
584       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
585       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
586       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
587       /* (strict upper triangular part of A)*x  */
588       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
589       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
590       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
591       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
592       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
593       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
594       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
595       v  += 49;
596     }
597     xb +=7; ai++;
598   }
599   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
600   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
601   ierr = PetscLogFlops(98*(a->nz*2 - A->m) - A->m);CHKERRQ(ierr);
602   PetscFunctionReturn(0);
603 }
604 
605 /*
606     This will not work with MatScalar == float because it calls the BLAS
607 */
608 #undef __FUNCT__
609 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
610 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
611 {
612   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
613   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
614   MatScalar      *v;
615   PetscErrorCode ierr;
616   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k;
617 
618   PetscFunctionBegin;
619   ierr = VecSet(zz,zero);CHKERRQ(ierr);
620   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
621   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
622 
623   aj   = a->j;
624   v    = a->a;
625   ii   = a->i;
626 
627   if (!a->mult_work) {
628     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
629   }
630   work = a->mult_work;
631 
632   for (i=0; i<mbs; i++) {
633     n     = ii[1] - ii[0]; ncols = n*bs;
634     workt = work; idx=aj+ii[0];
635 
636     /* upper triangular part */
637     for (j=0; j<n; j++) {
638       xb = x_ptr + bs*(*idx++);
639       for (k=0; k<bs; k++) workt[k] = xb[k];
640       workt += bs;
641     }
642     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
643     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
644 
645     /* strict lower triangular part */
646     idx = aj+ii[0];
647     if (*idx == i){
648       ncols -= bs; v += bs2; idx++; n--;
649     }
650 
651     if (ncols > 0){
652       workt = work;
653       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
654       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
655       for (j=0; j<n; j++) {
656         zb = z_ptr + bs*(*idx++);
657         for (k=0; k<bs; k++) zb[k] += workt[k] ;
658         workt += bs;
659       }
660     }
661     x += bs; v += n*bs2; z += bs; ii++;
662   }
663 
664   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
665   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
666   ierr = PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);CHKERRQ(ierr);
667   PetscFunctionReturn(0);
668 }
669 
670 #undef __FUNCT__
671 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
672 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
673 {
674   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
675   PetscScalar    *x,*y,*z,*xb,x1;
676   MatScalar      *v;
677   PetscErrorCode ierr;
678   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
679 
680   PetscFunctionBegin;
681   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
682   if (yy != xx) {
683     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
684   } else {
685     y = x;
686   }
687   if (zz != yy) {
688     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
689     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
690     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
691   } else {
692     z = y;
693   }
694 
695   v  = a->a;
696   xb = x;
697 
698   for (i=0; i<mbs; i++) {
699     n  = ai[1] - ai[0];  /* length of i_th row of A */
700     x1 = xb[0];
701     ib = aj + *ai;
702     jmin = 0;
703     if (*ib == i) {            /* (diag of A)*x */
704       z[i] += *v++ * x[*ib++]; jmin++;
705     }
706     for (j=jmin; j<n; j++) {
707       cval    = *ib;
708       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
709       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
710     }
711     xb++; ai++;
712   }
713 
714   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
715   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
716   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
717 
718   ierr = PetscLogFlops(2*(a->nz*2 - A->m));CHKERRQ(ierr);
719   PetscFunctionReturn(0);
720 }
721 
722 #undef __FUNCT__
723 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
724 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
725 {
726   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
727   PetscScalar    *x,*y,*z,*xb,x1,x2;
728   MatScalar      *v;
729   PetscErrorCode ierr;
730   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
731 
732   PetscFunctionBegin;
733   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
734   if (yy != xx) {
735     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
736   } else {
737     y = x;
738   }
739   if (zz != yy) {
740     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
741     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
742     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
743   } else {
744     z = y;
745   }
746 
747   v     = a->a;
748   xb = x;
749 
750   for (i=0; i<mbs; i++) {
751     n  = ai[1] - ai[0]; /* length of i_th block row of A */
752     x1 = xb[0]; x2 = xb[1];
753     ib = aj + *ai;
754     jmin = 0;
755     if (*ib == i){      /* (diag of A)*x */
756       z[2*i]   += v[0]*x1 + v[2]*x2;
757       z[2*i+1] += v[2]*x1 + v[3]*x2;
758       v += 4; jmin++;
759     }
760     for (j=jmin; j<n; j++) {
761       /* (strict lower triangular part of A)*x  */
762       cval       = ib[j]*2;
763       z[cval]     += v[0]*x1 + v[1]*x2;
764       z[cval+1]   += v[2]*x1 + v[3]*x2;
765       /* (strict upper triangular part of A)*x  */
766       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
767       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
768       v  += 4;
769     }
770     xb +=2; ai++;
771   }
772 
773   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
774   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
775   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
776 
777   ierr = PetscLogFlops(4*(a->nz*2 - A->m));CHKERRQ(ierr);
778   PetscFunctionReturn(0);
779 }
780 
781 #undef __FUNCT__
782 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
783 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
784 {
785   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
786   PetscScalar    *x,*y,*z,*xb,x1,x2,x3;
787   MatScalar      *v;
788   PetscErrorCode ierr;
789   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
790 
791   PetscFunctionBegin;
792   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
793   if (yy != xx) {
794     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
795   } else {
796     y = x;
797   }
798   if (zz != yy) {
799     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
800     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
801     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
802   } else {
803     z = y;
804   }
805 
806   v     = a->a;
807   xb = x;
808 
809   for (i=0; i<mbs; i++) {
810     n  = ai[1] - ai[0]; /* length of i_th block row of A */
811     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
812     ib = aj + *ai;
813     jmin = 0;
814     if (*ib == i){     /* (diag of A)*x */
815      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
816      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
817      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
818      v += 9; jmin++;
819     }
820     for (j=jmin; j<n; j++) {
821       /* (strict lower triangular part of A)*x  */
822       cval       = ib[j]*3;
823       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
824       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
825       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
826       /* (strict upper triangular part of A)*x  */
827       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
828       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
829       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
830       v  += 9;
831     }
832     xb +=3; ai++;
833   }
834 
835   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
836   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
837   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
838 
839   ierr = PetscLogFlops(18*(a->nz*2 - A->m));CHKERRQ(ierr);
840   PetscFunctionReturn(0);
841 }
842 
843 #undef __FUNCT__
844 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
845 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
846 {
847   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
848   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4;
849   MatScalar      *v;
850   PetscErrorCode ierr;
851   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
852 
853   PetscFunctionBegin;
854   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
855   if (yy != xx) {
856     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
857   } else {
858     y = x;
859   }
860   if (zz != yy) {
861     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
862     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
863     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
864   } else {
865     z = y;
866   }
867 
868   v     = a->a;
869   xb = x;
870 
871   for (i=0; i<mbs; i++) {
872     n  = ai[1] - ai[0]; /* length of i_th block row of A */
873     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
874     ib = aj + *ai;
875     jmin = 0;
876     if (*ib == i){      /* (diag of A)*x */
877       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
878       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
879       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
880       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
881       v += 16; jmin++;
882     }
883     for (j=jmin; j<n; j++) {
884       /* (strict lower triangular part of A)*x  */
885       cval       = ib[j]*4;
886       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
887       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
888       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
889       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
890       /* (strict upper triangular part of A)*x  */
891       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
892       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
893       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
894       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
895       v  += 16;
896     }
897     xb +=4; ai++;
898   }
899 
900   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
901   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
902   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
903 
904   ierr = PetscLogFlops(32*(a->nz*2 - A->m));CHKERRQ(ierr);
905   PetscFunctionReturn(0);
906 }
907 
908 #undef __FUNCT__
909 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
910 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
911 {
912   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
913   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5;
914   MatScalar      *v;
915   PetscErrorCode ierr;
916   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
917 
918   PetscFunctionBegin;
919   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
920   if (yy != xx) {
921     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
922   } else {
923     y = x;
924   }
925   if (zz != yy) {
926     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
927     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
928     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
929   } else {
930     z = y;
931   }
932 
933   v     = a->a;
934   xb = x;
935 
936   for (i=0; i<mbs; i++) {
937     n  = ai[1] - ai[0]; /* length of i_th block row of A */
938     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
939     ib = aj + *ai;
940     jmin = 0;
941     if (*ib == i){      /* (diag of A)*x */
942       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
943       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
944       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
945       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
946       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
947       v += 25; jmin++;
948     }
949     for (j=jmin; j<n; j++) {
950       /* (strict lower triangular part of A)*x  */
951       cval       = ib[j]*5;
952       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
953       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
954       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
955       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
956       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
957       /* (strict upper triangular part of A)*x  */
958       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
959       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
960       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
961       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
962       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
963       v  += 25;
964     }
965     xb +=5; ai++;
966   }
967 
968   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
969   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
970   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
971 
972   ierr = PetscLogFlops(50*(a->nz*2 - A->m));CHKERRQ(ierr);
973   PetscFunctionReturn(0);
974 }
975 #undef __FUNCT__
976 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
977 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
978 {
979   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
980   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
981   MatScalar      *v;
982   PetscErrorCode ierr;
983   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
984 
985   PetscFunctionBegin;
986   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
987   if (yy != xx) {
988     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
989   } else {
990     y = x;
991   }
992   if (zz != yy) {
993     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
994     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
995   } else {
996     z = y;
997   }
998 
999   v     = a->a;
1000   xb = x;
1001 
1002   for (i=0; i<mbs; i++) {
1003     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1004     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1005     ib = aj + *ai;
1006     jmin = 0;
1007     if (*ib == i){     /* (diag of A)*x */
1008       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1009       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1010       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1011       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1012       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1013       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1014       v += 36; jmin++;
1015     }
1016     for (j=jmin; j<n; j++) {
1017       /* (strict lower triangular part of A)*x  */
1018       cval       = ib[j]*6;
1019       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1020       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1021       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1022       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1023       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1024       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1025       /* (strict upper triangular part of A)*x  */
1026       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1027       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1028       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1029       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1030       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1031       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1032       v  += 36;
1033     }
1034     xb +=6; ai++;
1035   }
1036 
1037   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1038   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1039   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1040 
1041   ierr = PetscLogFlops(72*(a->nz*2 - A->m));CHKERRQ(ierr);
1042   PetscFunctionReturn(0);
1043 }
1044 
1045 #undef __FUNCT__
1046 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1047 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1048 {
1049   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1050   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1051   MatScalar      *v;
1052   PetscErrorCode ierr;
1053   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
1054 
1055   PetscFunctionBegin;
1056   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1057   if (yy != xx) {
1058     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1059   } else {
1060     y = x;
1061   }
1062   if (zz != yy) {
1063     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1064     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1065   } else {
1066     z = y;
1067   }
1068 
1069   v     = a->a;
1070   xb = x;
1071 
1072   for (i=0; i<mbs; i++) {
1073     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1074     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1075     ib = aj + *ai;
1076     jmin = 0;
1077     if (*ib == i){     /* (diag of A)*x */
1078       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1079       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1080       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1081       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1082       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1083       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1084       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1085       v += 49; jmin++;
1086     }
1087     for (j=jmin; j<n; j++) {
1088       /* (strict lower triangular part of A)*x  */
1089       cval       = ib[j]*7;
1090       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1091       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1092       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1093       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1094       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1095       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1096       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1097       /* (strict upper triangular part of A)*x  */
1098       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1099       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1100       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1101       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1102       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1103       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1104       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1105       v  += 49;
1106     }
1107     xb +=7; ai++;
1108   }
1109 
1110   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1111   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1112   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1113 
1114   ierr = PetscLogFlops(98*(a->nz*2 - A->m));CHKERRQ(ierr);
1115   PetscFunctionReturn(0);
1116 }
1117 
1118 #undef __FUNCT__
1119 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1120 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1121 {
1122   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1123   PetscScalar    *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1124   MatScalar      *v;
1125   PetscErrorCode ierr;
1126   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k;
1127 
1128   PetscFunctionBegin;
1129   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1130   if (yy != xx) {
1131     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1132   } else {
1133     y = x;
1134   }
1135   if (zz != yy) {
1136     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1137     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1138   } else {
1139     z = y;
1140   }
1141 
1142   aj   = a->j;
1143   v    = a->a;
1144   ii   = a->i;
1145 
1146   if (!a->mult_work) {
1147     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1148   }
1149   work = a->mult_work;
1150 
1151 
1152   for (i=0; i<mbs; i++) {
1153     n     = ii[1] - ii[0]; ncols = n*bs;
1154     workt = work; idx=aj+ii[0];
1155 
1156     /* upper triangular part */
1157     for (j=0; j<n; j++) {
1158       xb = x_ptr + bs*(*idx++);
1159       for (k=0; k<bs; k++) workt[k] = xb[k];
1160       workt += bs;
1161     }
1162     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1163     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1164 
1165     /* strict lower triangular part */
1166     idx = aj+ii[0];
1167     if (*idx == i){
1168       ncols -= bs; v += bs2; idx++; n--;
1169     }
1170     if (ncols > 0){
1171       workt = work;
1172       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1173       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1174       for (j=0; j<n; j++) {
1175         zb = z_ptr + bs*(*idx++);
1176         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1177         workt += bs;
1178       }
1179     }
1180 
1181     x += bs; v += n*bs2; z += bs; ii++;
1182   }
1183 
1184   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1185   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1186   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1187 
1188   ierr = PetscLogFlops(2*(a->nz*2 - A->m));CHKERRQ(ierr);
1189   PetscFunctionReturn(0);
1190 }
1191 
1192 #undef __FUNCT__
1193 #define __FUNCT__ "MatScale_SeqSBAIJ"
1194 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1195 {
1196   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1197   PetscScalar oalpha = alpha;
1198   PetscBLASInt one = 1,totalnz = (PetscBLASInt)a->bs2*a->nz;
1199   PetscErrorCode ierr;
1200 
1201   PetscFunctionBegin;
1202   BLASscal_(&totalnz,&oalpha,a->a,&one);
1203   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1204   PetscFunctionReturn(0);
1205 }
1206 
1207 #undef __FUNCT__
1208 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1209 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1210 {
1211   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1212   MatScalar      *v = a->a;
1213   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1214   PetscInt       i,j,k,bs = A->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1215   PetscErrorCode ierr;
1216   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1217 
1218   PetscFunctionBegin;
1219   if (type == NORM_FROBENIUS) {
1220     for (k=0; k<mbs; k++){
1221       jmin = a->i[k]; jmax = a->i[k+1];
1222       col  = aj + jmin;
1223       if (*col == k){         /* diagonal block */
1224         for (i=0; i<bs2; i++){
1225 #if defined(PETSC_USE_COMPLEX)
1226           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1227 #else
1228           sum_diag += (*v)*(*v); v++;
1229 #endif
1230         }
1231         jmin++;
1232       }
1233       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1234         for (i=0; i<bs2; i++){
1235 #if defined(PETSC_USE_COMPLEX)
1236           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1237 #else
1238           sum_off += (*v)*(*v); v++;
1239 #endif
1240         }
1241       }
1242     }
1243     *norm = sqrt(sum_diag + 2*sum_off);
1244   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1245     ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);CHKERRQ(ierr);
1246     jl   = il + mbs;
1247     sum  = (PetscReal*)(jl + mbs);
1248     for (i=0; i<mbs; i++) jl[i] = mbs;
1249     il[0] = 0;
1250 
1251     *norm = 0.0;
1252     for (k=0; k<mbs; k++) { /* k_th block row */
1253       for (j=0; j<bs; j++) sum[j]=0.0;
1254       /*-- col sum --*/
1255       i = jl[k]; /* first |A(i,k)| to be added */
1256       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1257                   at step k */
1258       while (i<mbs){
1259         nexti = jl[i];  /* next block row to be added */
1260         ik    = il[i];  /* block index of A(i,k) in the array a */
1261         for (j=0; j<bs; j++){
1262           v = a->a + ik*bs2 + j*bs;
1263           for (k1=0; k1<bs; k1++) {
1264             sum[j] += PetscAbsScalar(*v); v++;
1265           }
1266         }
1267         /* update il, jl */
1268         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1269         jmax = a->i[i+1];
1270         if (jmin < jmax){
1271           il[i] = jmin;
1272           j   = a->j[jmin];
1273           jl[i] = jl[j]; jl[j]=i;
1274         }
1275         i = nexti;
1276       }
1277       /*-- row sum --*/
1278       jmin = a->i[k]; jmax = a->i[k+1];
1279       for (i=jmin; i<jmax; i++) {
1280         for (j=0; j<bs; j++){
1281           v = a->a + i*bs2 + j;
1282           for (k1=0; k1<bs; k1++){
1283             sum[j] += PetscAbsScalar(*v); v += bs;
1284           }
1285         }
1286       }
1287       /* add k_th block row to il, jl */
1288       col = aj+jmin;
1289       if (*col == k) jmin++;
1290       if (jmin < jmax){
1291         il[k] = jmin;
1292         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1293       }
1294       for (j=0; j<bs; j++){
1295         if (sum[j] > *norm) *norm = sum[j];
1296       }
1297     }
1298     ierr = PetscFree(il);CHKERRQ(ierr);
1299   } else {
1300     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1301   }
1302   PetscFunctionReturn(0);
1303 }
1304 
1305 #undef __FUNCT__
1306 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1307 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1308 {
1309   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1310   PetscErrorCode ierr;
1311 
1312   PetscFunctionBegin;
1313 
1314   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1315   if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) {
1316     *flg = PETSC_FALSE;
1317     PetscFunctionReturn(0);
1318   }
1319 
1320   /* if the a->i are the same */
1321   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1322   if (!*flg) {
1323     PetscFunctionReturn(0);
1324   }
1325 
1326   /* if a->j are the same */
1327   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1328   if (!*flg) {
1329     PetscFunctionReturn(0);
1330   }
1331   /* if a->a are the same */
1332   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(A->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1333   PetscFunctionReturn(0);
1334 }
1335 
1336 #undef __FUNCT__
1337 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1338 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1339 {
1340   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1341   PetscErrorCode ierr;
1342   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1343   PetscScalar    *x,zero = 0.0;
1344   MatScalar      *aa,*aa_j;
1345 
1346   PetscFunctionBegin;
1347   bs   = A->bs;
1348   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1349 
1350   aa   = a->a;
1351   ai   = a->i;
1352   aj   = a->j;
1353   ambs = a->mbs;
1354   bs2  = a->bs2;
1355 
1356   ierr = VecSet(v,zero);CHKERRQ(ierr);
1357   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1358   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1359   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1360   for (i=0; i<ambs; i++) {
1361     j=ai[i];
1362     if (aj[j] == i) {             /* if this is a diagonal element */
1363       row  = i*bs;
1364       aa_j = aa + j*bs2;
1365       if (A->factor && bs==1){
1366         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1367       } else {
1368         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1369       }
1370     }
1371   }
1372 
1373   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1374   PetscFunctionReturn(0);
1375 }
1376 
1377 #undef __FUNCT__
1378 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1379 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1380 {
1381   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1382   PetscScalar    *l,x,*li,*ri;
1383   MatScalar      *aa,*v;
1384   PetscErrorCode ierr;
1385   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1386   PetscTruth     flg;
1387 
1388   PetscFunctionBegin;
1389   if (ll != rr){
1390     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1391     if (!flg)
1392       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1393   }
1394   if (!ll) PetscFunctionReturn(0);
1395   ai  = a->i;
1396   aj  = a->j;
1397   aa  = a->a;
1398   m   = A->m;
1399   bs  = A->bs;
1400   mbs = a->mbs;
1401   bs2 = a->bs2;
1402 
1403   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1404   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1405   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1406   for (i=0; i<mbs; i++) { /* for each block row */
1407     M  = ai[i+1] - ai[i];
1408     li = l + i*bs;
1409     v  = aa + bs2*ai[i];
1410     for (j=0; j<M; j++) { /* for each block */
1411       ri = l + bs*aj[ai[i]+j];
1412       for (k=0; k<bs; k++) {
1413         x = ri[k];
1414         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1415       }
1416     }
1417   }
1418   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1419   ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
1420   PetscFunctionReturn(0);
1421 }
1422 
1423 #undef __FUNCT__
1424 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1425 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1426 {
1427   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1428 
1429   PetscFunctionBegin;
1430   info->rows_global    = (double)A->m;
1431   info->columns_global = (double)A->m;
1432   info->rows_local     = (double)A->m;
1433   info->columns_local  = (double)A->m;
1434   info->block_size     = a->bs2;
1435   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1436   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1437   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1438   info->assemblies   = A->num_ass;
1439   info->mallocs      = a->reallocs;
1440   info->memory       = A->mem;
1441   if (A->factor) {
1442     info->fill_ratio_given  = A->info.fill_ratio_given;
1443     info->fill_ratio_needed = A->info.fill_ratio_needed;
1444     info->factor_mallocs    = A->info.factor_mallocs;
1445   } else {
1446     info->fill_ratio_given  = 0;
1447     info->fill_ratio_needed = 0;
1448     info->factor_mallocs    = 0;
1449   }
1450   PetscFunctionReturn(0);
1451 }
1452 
1453 
1454 #undef __FUNCT__
1455 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1456 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1457 {
1458   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1459   PetscErrorCode ierr;
1460 
1461   PetscFunctionBegin;
1462   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1463   PetscFunctionReturn(0);
1464 }
1465 
1466 #undef __FUNCT__
1467 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1468 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1469 {
1470   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1471   PetscErrorCode ierr;
1472   PetscInt i,j,n,row,col,bs,*ai,*aj,mbs;
1473   PetscReal    atmp;
1474   MatScalar    *aa;
1475   PetscScalar  zero = 0.0,*x;
1476   PetscInt          ncols,brow,bcol,krow,kcol;
1477 
1478   PetscFunctionBegin;
1479   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1480   bs   = A->bs;
1481   aa   = a->a;
1482   ai   = a->i;
1483   aj   = a->j;
1484   mbs = a->mbs;
1485 
1486   ierr = VecSet(v,zero);CHKERRQ(ierr);
1487   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1488   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1489   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1490   for (i=0; i<mbs; i++) {
1491     ncols = ai[1] - ai[0]; ai++;
1492     brow  = bs*i;
1493     for (j=0; j<ncols; j++){
1494       bcol = bs*(*aj);
1495       for (kcol=0; kcol<bs; kcol++){
1496         col = bcol + kcol;      /* col index */
1497         for (krow=0; krow<bs; krow++){
1498           atmp = PetscAbsScalar(*aa); aa++;
1499           row = brow + krow;    /* row index */
1500           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1501           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1502         }
1503       }
1504       aj++;
1505     }
1506   }
1507   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1508   PetscFunctionReturn(0);
1509 }
1510