xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision efe48dd8cf8b935accbbb9f4bcb20bc83865fa4d)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 #include "petscblaslapack.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
15   const PetscInt *idx;
16   PetscInt       start,end,*ai,*aj,bs,*nidx2;
17   PetscBT        table;
18 
19   PetscFunctionBegin;
20   m     = a->mbs;
21   ai    = a->i;
22   aj    = a->j;
23   bs    = A->rmap->bs;
24 
25   if (ov < 0)  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26 
27   ierr = PetscBTCreate(m,table);CHKERRQ(ierr);
28   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
29   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
30 
31   for (i=0; i<is_max; i++) {
32     /* Initialise the two local arrays */
33     isz  = 0;
34     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
38     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
39 
40     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
41     for (j=0; j<n ; ++j){
42       ival = idx[j]/bs; /* convert the indices into block indices */
43       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
45     }
46     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
47     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
48 
49     k = 0;
50     for (j=0; j<ov; j++){ /* for each overlap*/
51       n = isz;
52       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
53         row   = nidx[k];
54         start = ai[row];
55         end   = ai[row+1];
56         for (l = start; l<end ; l++){
57           val = aj[l];
58           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
59         }
60       }
61     }
62     /* expand the Index Set */
63     for (j=0; j<isz; j++) {
64       for (k=0; k<bs; k++)
65         nidx2[j*bs+k] = nidx[j]*bs+k;
66     }
67     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
68   }
69   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
70   ierr = PetscFree(nidx);CHKERRQ(ierr);
71   ierr = PetscFree(nidx2);CHKERRQ(ierr);
72   PetscFunctionReturn(0);
73 }
74 
75 #undef __FUNCT__
76 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
77 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
78 {
79   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
80   PetscErrorCode ierr;
81   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
82   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
83   const PetscInt *irow,*icol;
84   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
85   PetscInt       *aj = a->j,*ai = a->i;
86   MatScalar      *mat_a;
87   Mat            C;
88   PetscBool      flag,sorted;
89 
90   PetscFunctionBegin;
91   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
92   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
93 
94   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
95   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
96   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
97   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
98 
99   ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
100   ssmap = smap;
101   ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
102   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
103   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104   /* determine lens of each row */
105   for (i=0; i<nrows; i++) {
106     kstart  = ai[irow[i]];
107     kend    = kstart + a->ilen[irow[i]];
108     lens[i] = 0;
109       for (k=kstart; k<kend; k++) {
110         if (ssmap[aj[k]]) {
111           lens[i]++;
112         }
113       }
114     }
115   /* Create and fill new matrix */
116   if (scall == MAT_REUSE_MATRIX) {
117     c = (Mat_SeqBAIJ *)((*B)->data);
118 
119     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
120     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
121     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
122     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
123     C = *B;
124   } else {
125     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
126     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
127     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
128     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
129   }
130   c = (Mat_SeqBAIJ *)(C->data);
131   for (i=0; i<nrows; i++) {
132     row    = irow[i];
133     kstart = ai[row];
134     kend   = kstart + a->ilen[row];
135     mat_i  = c->i[i];
136     mat_j  = c->j + mat_i;
137     mat_a  = c->a + mat_i*bs2;
138     mat_ilen = c->ilen + i;
139     for (k=kstart; k<kend; k++) {
140       if ((tcol=ssmap[a->j[k]])) {
141         *mat_j++ = tcol - 1;
142         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
143         mat_a   += bs2;
144         (*mat_ilen)++;
145       }
146     }
147   }
148 
149   /* Free work space */
150   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
151   ierr = PetscFree(smap);CHKERRQ(ierr);
152   ierr = PetscFree(lens);CHKERRQ(ierr);
153   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
154   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
155 
156   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
157   *B = C;
158   PetscFunctionReturn(0);
159 }
160 
161 #undef __FUNCT__
162 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
163 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164 {
165   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166   IS             is1,is2;
167   PetscErrorCode ierr;
168   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
169   const PetscInt *irow,*icol;
170 
171   PetscFunctionBegin;
172   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
173   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
174   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
175   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
176 
177   /* Verify if the indices corespond to each element in a block
178    and form the IS with compressed IS */
179   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
180   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
181   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
182   count = 0;
183   for (i=0; i<a->mbs; i++) {
184     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185     if (vary[i]==bs) iary[count++] = i;
186   }
187   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
188 
189   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
190   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
191   count = 0;
192   for (i=0; i<a->mbs; i++) {
193     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
194     if (vary[i]==bs) iary[count++] = i;
195   }
196   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
197   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
198   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
199   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
200 
201   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
202   ISDestroy(is1);
203   ISDestroy(is2);
204   PetscFunctionReturn(0);
205 }
206 
207 #undef __FUNCT__
208 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
209 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
210 {
211   PetscErrorCode ierr;
212   PetscInt       i;
213 
214   PetscFunctionBegin;
215   if (scall == MAT_INITIAL_MATRIX) {
216     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
217   }
218 
219   for (i=0; i<n; i++) {
220     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
221   }
222   PetscFunctionReturn(0);
223 }
224 
225 
226 /* -------------------------------------------------------*/
227 /* Should check that shapes of vectors and matrices match */
228 /* -------------------------------------------------------*/
229 
230 #undef __FUNCT__
231 #define __FUNCT__ "MatMult_SeqBAIJ_1"
232 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
233 {
234   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
235   PetscScalar       *z,sum;
236   const PetscScalar *x;
237   const MatScalar   *v;
238   PetscErrorCode    ierr;
239   PetscInt          mbs,i,n,nonzerorow=0;
240   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
241   PetscBool         usecprow=a->compressedrow.use;
242 
243   PetscFunctionBegin;
244   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
245   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
246 
247   if (usecprow){
248     mbs  = a->compressedrow.nrows;
249     ii   = a->compressedrow.i;
250     ridx = a->compressedrow.rindex;
251     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
252   } else {
253     mbs = a->mbs;
254     ii  = a->i;
255   }
256 
257   for (i=0; i<mbs; i++) {
258     n    = ii[1] - ii[0];
259     v    = a->a + ii[0];
260     idx  = a->j + ii[0];
261     ii++;
262     sum  = 0.0;
263     PetscSparseDensePlusDot(sum,x,v,idx,n);
264     if (usecprow){
265       z[ridx[i]] = sum;
266     } else {
267       nonzerorow += (n>0);
268       z[i] = sum;
269     }
270   }
271   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
272   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
273   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
274   PetscFunctionReturn(0);
275 }
276 
277 #undef __FUNCT__
278 #define __FUNCT__ "MatMult_SeqBAIJ_2"
279 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
280 {
281   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
282   PetscScalar       *z = 0,sum1,sum2,*zarray;
283   const PetscScalar *x,*xb;
284   PetscScalar       x1,x2;
285   const MatScalar   *v;
286   PetscErrorCode    ierr;
287   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
288   PetscBool         usecprow=a->compressedrow.use;
289 
290   PetscFunctionBegin;
291   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
292   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
293 
294   idx = a->j;
295   v   = a->a;
296   if (usecprow){
297     mbs  = a->compressedrow.nrows;
298     ii   = a->compressedrow.i;
299     ridx = a->compressedrow.rindex;
300   } else {
301     mbs = a->mbs;
302     ii  = a->i;
303     z   = zarray;
304   }
305 
306   for (i=0; i<mbs; i++) {
307     n  = ii[1] - ii[0]; ii++;
308     sum1 = 0.0; sum2 = 0.0;
309     nonzerorow += (n>0);
310     for (j=0; j<n; j++) {
311       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
312       sum1 += v[0]*x1 + v[2]*x2;
313       sum2 += v[1]*x1 + v[3]*x2;
314       v += 4;
315     }
316     if (usecprow) z = zarray + 2*ridx[i];
317     z[0] = sum1; z[1] = sum2;
318     if (!usecprow) z += 2;
319   }
320   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
321   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
322   ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr);
323   PetscFunctionReturn(0);
324 }
325 
326 #undef __FUNCT__
327 #define __FUNCT__ "MatMult_SeqBAIJ_3"
328 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
329 {
330   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
331   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
332   const PetscScalar *x,*xb;
333   const MatScalar   *v;
334   PetscErrorCode    ierr;
335   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
336   PetscBool         usecprow=a->compressedrow.use;
337 
338 
339 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
340 #pragma disjoint(*v,*z,*xb)
341 #endif
342 
343   PetscFunctionBegin;
344   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
345   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
346 
347   idx = a->j;
348   v   = a->a;
349   if (usecprow){
350     mbs  = a->compressedrow.nrows;
351     ii   = a->compressedrow.i;
352     ridx = a->compressedrow.rindex;
353   } else {
354     mbs = a->mbs;
355     ii  = a->i;
356     z   = zarray;
357   }
358 
359   for (i=0; i<mbs; i++) {
360     n  = ii[1] - ii[0]; ii++;
361     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
362     nonzerorow += (n>0);
363     for (j=0; j<n; j++) {
364       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
365       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
366       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
367       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
368       v += 9;
369     }
370     if (usecprow) z = zarray + 3*ridx[i];
371     z[0] = sum1; z[1] = sum2; z[2] = sum3;
372     if (!usecprow) z += 3;
373   }
374   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
375   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
376   ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr);
377   PetscFunctionReturn(0);
378 }
379 
380 #undef __FUNCT__
381 #define __FUNCT__ "MatMult_SeqBAIJ_4"
382 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
383 {
384   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
385   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
386   const PetscScalar *x,*xb;
387   const MatScalar   *v;
388   PetscErrorCode    ierr;
389   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
390   PetscBool         usecprow=a->compressedrow.use;
391 
392   PetscFunctionBegin;
393   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
394   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
395 
396   idx = a->j;
397   v   = a->a;
398   if (usecprow){
399     mbs  = a->compressedrow.nrows;
400     ii   = a->compressedrow.i;
401     ridx = a->compressedrow.rindex;
402   } else {
403     mbs = a->mbs;
404     ii  = a->i;
405     z   = zarray;
406   }
407 
408   for (i=0; i<mbs; i++) {
409     n  = ii[1] - ii[0]; ii++;
410     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
411     nonzerorow += (n>0);
412     for (j=0; j<n; j++) {
413       xb = x + 4*(*idx++);
414       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
415       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
416       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
417       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
418       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
419       v += 16;
420     }
421     if (usecprow) z = zarray + 4*ridx[i];
422     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
423     if (!usecprow) z += 4;
424   }
425   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
426   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
427   ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 #undef __FUNCT__
432 #define __FUNCT__ "MatMult_SeqBAIJ_5"
433 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
434 {
435   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
436   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
437   const PetscScalar *xb,*x;
438   const MatScalar   *v;
439   PetscErrorCode    ierr;
440   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
441   PetscInt          mbs,i,j,n,nonzerorow=0;
442   PetscBool         usecprow=a->compressedrow.use;
443 
444   PetscFunctionBegin;
445   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
446   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
447 
448   idx = a->j;
449   v   = a->a;
450   if (usecprow){
451     mbs  = a->compressedrow.nrows;
452     ii   = a->compressedrow.i;
453     ridx = a->compressedrow.rindex;
454   } else {
455     mbs = a->mbs;
456     ii  = a->i;
457     z   = zarray;
458   }
459 
460   for (i=0; i<mbs; i++) {
461     n  = ii[1] - ii[0]; ii++;
462     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
463     nonzerorow += (n>0);
464     for (j=0; j<n; j++) {
465       xb = x + 5*(*idx++);
466       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
467       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
468       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
469       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
470       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
471       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
472       v += 25;
473     }
474     if (usecprow) z = zarray + 5*ridx[i];
475     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
476     if (!usecprow) z += 5;
477   }
478   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
479   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
480   ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr);
481   PetscFunctionReturn(0);
482 }
483 
484 
485 #undef __FUNCT__
486 #define __FUNCT__ "MatMult_SeqBAIJ_6"
487 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
488 {
489   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
490   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
491   const PetscScalar *x,*xb;
492   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
493   const MatScalar   *v;
494   PetscErrorCode    ierr;
495   PetscInt          mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
496   PetscBool         usecprow=a->compressedrow.use;
497 
498   PetscFunctionBegin;
499   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
500   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
501 
502   idx = a->j;
503   v   = a->a;
504   if (usecprow){
505     mbs  = a->compressedrow.nrows;
506     ii   = a->compressedrow.i;
507     ridx = a->compressedrow.rindex;
508   } else {
509     mbs = a->mbs;
510     ii  = a->i;
511     z   = zarray;
512   }
513 
514   for (i=0; i<mbs; i++) {
515     n  = ii[1] - ii[0]; ii++;
516     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
517     nonzerorow += (n>0);
518     for (j=0; j<n; j++) {
519       xb = x + 6*(*idx++);
520       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
521       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
522       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
523       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
524       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
525       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
526       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
527       v += 36;
528     }
529     if (usecprow) z = zarray + 6*ridx[i];
530     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
531     if (!usecprow) z += 6;
532   }
533 
534   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
535   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
536   ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr);
537   PetscFunctionReturn(0);
538 }
539 
540 #undef __FUNCT__
541 #define __FUNCT__ "MatMult_SeqBAIJ_7"
542 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
543 {
544   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
545   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
546   const PetscScalar *x,*xb;
547   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
548   const MatScalar   *v;
549   PetscErrorCode    ierr;
550   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
551   PetscBool         usecprow=a->compressedrow.use;
552 
553   PetscFunctionBegin;
554   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
555   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
556 
557   idx = a->j;
558   v   = a->a;
559   if (usecprow){
560     mbs    = a->compressedrow.nrows;
561     ii     = a->compressedrow.i;
562     ridx = a->compressedrow.rindex;
563   } else {
564     mbs = a->mbs;
565     ii  = a->i;
566     z   = zarray;
567   }
568 
569   for (i=0; i<mbs; i++) {
570     n  = ii[1] - ii[0]; ii++;
571     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
572     nonzerorow += (n>0);
573     for (j=0; j<n; j++) {
574       xb = x + 7*(*idx++);
575       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
576       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
577       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
578       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
579       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
580       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
581       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
582       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
583       v += 49;
584     }
585     if (usecprow) z = zarray + 7*ridx[i];
586     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
587     if (!usecprow) z += 7;
588   }
589 
590   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
591   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
592   ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr);
593   PetscFunctionReturn(0);
594 }
595 
596 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
597 /* Default MatMult for block size 15 */
598 
599 #undef __FUNCT__
600 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
601 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
602 {
603   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
604   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
605   const PetscScalar *x,*xb;
606   PetscScalar       *zarray,xv;
607   const MatScalar   *v;
608   PetscErrorCode    ierr;
609   const PetscInt    *ii,*ij=a->j,*idx;
610   PetscInt          mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0;
611   PetscBool         usecprow=a->compressedrow.use;
612 
613   PetscFunctionBegin;
614   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
615   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
616 
617   v   = a->a;
618   if (usecprow){
619     mbs    = a->compressedrow.nrows;
620     ii     = a->compressedrow.i;
621     ridx = a->compressedrow.rindex;
622   } else {
623     mbs = a->mbs;
624     ii  = a->i;
625     z   = zarray;
626   }
627 
628   for (i=0; i<mbs; i++) {
629     n  = ii[i+1] - ii[i];
630     idx = ij + ii[i];
631     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
632     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
633 
634     nonzerorow += (n>0);
635     for (j=0; j<n; j++) {
636       xb = x + 15*(idx[j]);
637 
638       for(k=0;k<15;k++){
639         xv    =  xb[k];
640 	sum1  += v[0]*xv;
641         sum2  += v[1]*xv;
642 	sum3  += v[2]*xv;
643 	sum4  += v[3]*xv;
644 	sum5  += v[4]*xv;
645         sum6  += v[5]*xv;
646 	sum7  += v[6]*xv;
647 	sum8  += v[7]*xv;
648         sum9  += v[8]*xv;
649         sum10 += v[9]*xv;
650 	sum11 += v[10]*xv;
651 	sum12 += v[11]*xv;
652 	sum13 += v[12]*xv;
653         sum14 += v[13]*xv;
654 	sum15 += v[14]*xv;
655 	v += 15;
656       }
657     }
658     if (usecprow) z = zarray + 15*ridx[i];
659     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
660     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
661 
662     if (!usecprow) z += 15;
663   }
664 
665   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
666   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
667   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
668   PetscFunctionReturn(0);
669 }
670 
671 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
672 #undef __FUNCT__
673 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
674 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
675 {
676   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
677   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
678   const PetscScalar *x,*xb;
679   PetscScalar        x1,x2,x3,x4,*zarray;
680   const MatScalar   *v;
681   PetscErrorCode    ierr;
682   const PetscInt    *ii,*ij=a->j,*idx;
683   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
684   PetscBool         usecprow=a->compressedrow.use;
685 
686   PetscFunctionBegin;
687   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
688   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
689 
690   v   = a->a;
691   if (usecprow){
692     mbs    = a->compressedrow.nrows;
693     ii     = a->compressedrow.i;
694     ridx = a->compressedrow.rindex;
695   } else {
696     mbs = a->mbs;
697     ii  = a->i;
698     z   = zarray;
699   }
700 
701   for (i=0; i<mbs; i++) {
702     n  = ii[i+1] - ii[i];
703     idx = ij + ii[i];
704     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
705     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
706 
707     nonzerorow += (n>0);
708     for (j=0; j<n; j++) {
709       xb = x + 15*(idx[j]);
710       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
711 
712       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
713       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
714       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
715       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
716       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
717       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
718       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
719       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
720       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
721       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
722       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
723       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
724       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
725       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
726       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
727 
728       v += 60;
729 
730       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
731 
732       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
733       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
734       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
735       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
736       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
737       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
738       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
739       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
740       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
741       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
742       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
743       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
744       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
745       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
746       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
747       v += 60;
748 
749       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
750       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
751       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
752       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
753       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
754       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
755       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
756       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
757       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
758       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
759       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
760       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
761       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
762       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
763       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
764       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
765       v  += 60;
766 
767       x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
768       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
769       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
770       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
771       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
772       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
773       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
774       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
775       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
776       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
777       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
778       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
779       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
780       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
781       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
782       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
783       v += 45;
784     }
785     if (usecprow) z = zarray + 15*ridx[i];
786     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
787     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
788 
789     if (!usecprow) z += 15;
790   }
791 
792   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
793   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
794   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
795   PetscFunctionReturn(0);
796 }
797 
798 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
799 #undef __FUNCT__
800 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
801 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
802 {
803   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
804   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
805   const PetscScalar *x,*xb;
806   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
807   const MatScalar   *v;
808   PetscErrorCode    ierr;
809   const PetscInt    *ii,*ij=a->j,*idx;
810   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
811   PetscBool         usecprow=a->compressedrow.use;
812 
813   PetscFunctionBegin;
814   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
815   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
816 
817   v   = a->a;
818   if (usecprow){
819     mbs    = a->compressedrow.nrows;
820     ii     = a->compressedrow.i;
821     ridx = a->compressedrow.rindex;
822   } else {
823     mbs = a->mbs;
824     ii  = a->i;
825     z   = zarray;
826   }
827 
828   for (i=0; i<mbs; i++) {
829     n  = ii[i+1] - ii[i];
830     idx = ij + ii[i];
831     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
832     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
833 
834     nonzerorow += (n>0);
835     for (j=0; j<n; j++) {
836       xb = x + 15*(idx[j]);
837       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
838       x8 = xb[7];
839 
840       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
841       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
842       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
843       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
844       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
845       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
846       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
847       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
848       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
849       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
850       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
851       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
852       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
853       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
854       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
855       v += 120;
856 
857       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
858 
859       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
860       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
861       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
862       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
863       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
864       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
865       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
866       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
867       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
868       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
869       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
870       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
871       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
872       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
873       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
874       v += 105;
875     }
876     if (usecprow) z = zarray + 15*ridx[i];
877     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
878     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
879 
880     if (!usecprow) z += 15;
881   }
882 
883   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
884   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
885   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
886   PetscFunctionReturn(0);
887 }
888 
889 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
890 
891 #undef __FUNCT__
892 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
893 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
894 {
895   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
896   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
897   const PetscScalar *x,*xb;
898   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
899   const MatScalar   *v;
900   PetscErrorCode    ierr;
901   const PetscInt    *ii,*ij=a->j,*idx;
902   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
903   PetscBool         usecprow=a->compressedrow.use;
904 
905   PetscFunctionBegin;
906   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
907   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
908 
909   v   = a->a;
910   if (usecprow){
911     mbs    = a->compressedrow.nrows;
912     ii     = a->compressedrow.i;
913     ridx = a->compressedrow.rindex;
914   } else {
915     mbs = a->mbs;
916     ii  = a->i;
917     z   = zarray;
918   }
919 
920   for (i=0; i<mbs; i++) {
921     n  = ii[i+1] - ii[i];
922     idx = ij + ii[i];
923     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
924     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
925 
926     nonzerorow += (n>0);
927     for (j=0; j<n; j++) {
928       xb = x + 15*(idx[j]);
929       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
930       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
931 
932       sum1 +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
933       sum2 +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
934       sum3 +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
935       sum4 +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
936       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
937       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
938       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
939       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
940       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
941       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
942       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
943       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
944       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
945       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
946       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
947       v += 225;
948     }
949     if (usecprow) z = zarray + 15*ridx[i];
950     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
951     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
952 
953     if (!usecprow) z += 15;
954   }
955 
956   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
957   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
958   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
959   PetscFunctionReturn(0);
960 }
961 
962 
963 /*
964     This will not work with MatScalar == float because it calls the BLAS
965 */
966 #undef __FUNCT__
967 #define __FUNCT__ "MatMult_SeqBAIJ_N"
968 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
969 {
970   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
971   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
972   MatScalar      *v;
973   PetscErrorCode ierr;
974   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
975   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
976   PetscBool      usecprow=a->compressedrow.use;
977 
978   PetscFunctionBegin;
979   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
980   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
981 
982   idx = a->j;
983   v   = a->a;
984   if (usecprow){
985     mbs  = a->compressedrow.nrows;
986     ii   = a->compressedrow.i;
987     ridx = a->compressedrow.rindex;
988   } else {
989     mbs = a->mbs;
990     ii  = a->i;
991     z   = zarray;
992   }
993 
994   if (!a->mult_work) {
995     k    = PetscMax(A->rmap->n,A->cmap->n);
996     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
997   }
998   work = a->mult_work;
999   for (i=0; i<mbs; i++) {
1000     n     = ii[1] - ii[0]; ii++;
1001     ncols = n*bs;
1002     workt = work;
1003     nonzerorow += (n>0);
1004     for (j=0; j<n; j++) {
1005       xb = x + bs*(*idx++);
1006       for (k=0; k<bs; k++) workt[k] = xb[k];
1007       workt += bs;
1008     }
1009     if (usecprow) z = zarray + bs*ridx[i];
1010     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1011     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1012     v += n*bs2;
1013     if (!usecprow) z += bs;
1014   }
1015   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1016   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1017   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr);
1018   PetscFunctionReturn(0);
1019 }
1020 
1021 #undef __FUNCT__
1022 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1023 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1024 {
1025   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1026   const PetscScalar  *x;
1027   PetscScalar        *y,*z,sum;
1028   const MatScalar    *v;
1029   PetscErrorCode     ierr;
1030   PetscInt           mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
1031   const PetscInt     *idx,*ii;
1032   PetscBool          usecprow=a->compressedrow.use;
1033 
1034   PetscFunctionBegin;
1035   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1036   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1037   if (zz != yy) {
1038     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1039   } else {
1040     z = y;
1041   }
1042 
1043   idx = a->j;
1044   v   = a->a;
1045   if (usecprow){
1046     if (zz != yy){
1047       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1048     }
1049     mbs  = a->compressedrow.nrows;
1050     ii   = a->compressedrow.i;
1051     ridx = a->compressedrow.rindex;
1052   } else {
1053     ii  = a->i;
1054   }
1055 
1056   for (i=0; i<mbs; i++) {
1057     n    = ii[1] - ii[0];
1058     ii++;
1059     if (!usecprow){
1060       nonzerorow += (n>0);
1061       sum = y[i];
1062     } else {
1063       sum = y[ridx[i]];
1064     }
1065     PetscSparseDensePlusDot(sum,x,v,idx,n);
1066     v += n;
1067     idx += n;
1068     if (usecprow){
1069       z[ridx[i]] = sum;
1070     } else {
1071       z[i] = sum;
1072     }
1073   }
1074   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1075   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1076   if (zz != yy) {
1077     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1078   }
1079   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
1080   PetscFunctionReturn(0);
1081 }
1082 
1083 #undef __FUNCT__
1084 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1085 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1086 {
1087   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1088   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1089   PetscScalar    x1,x2,*yarray,*zarray;
1090   MatScalar      *v;
1091   PetscErrorCode ierr;
1092   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1093   PetscBool      usecprow=a->compressedrow.use;
1094 
1095   PetscFunctionBegin;
1096   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1097   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1098   if (zz != yy) {
1099     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1100   } else {
1101     zarray = yarray;
1102   }
1103 
1104   idx = a->j;
1105   v   = a->a;
1106   if (usecprow){
1107     if (zz != yy){
1108       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1109     }
1110     mbs  = a->compressedrow.nrows;
1111     ii   = a->compressedrow.i;
1112     ridx = a->compressedrow.rindex;
1113     if (zz != yy){
1114       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1115     }
1116   } else {
1117     ii  = a->i;
1118     y   = yarray;
1119     z   = zarray;
1120   }
1121 
1122   for (i=0; i<mbs; i++) {
1123     n  = ii[1] - ii[0]; ii++;
1124     if (usecprow){
1125       z = zarray + 2*ridx[i];
1126       y = yarray + 2*ridx[i];
1127     }
1128     sum1 = y[0]; sum2 = y[1];
1129     for (j=0; j<n; j++) {
1130       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1131       sum1 += v[0]*x1 + v[2]*x2;
1132       sum2 += v[1]*x1 + v[3]*x2;
1133       v += 4;
1134     }
1135     z[0] = sum1; z[1] = sum2;
1136     if (!usecprow){
1137       z += 2; y += 2;
1138     }
1139   }
1140   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1141   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1142   if (zz != yy) {
1143     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1144   }
1145   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1146   PetscFunctionReturn(0);
1147 }
1148 
1149 #undef __FUNCT__
1150 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1151 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1152 {
1153   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1154   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1155   MatScalar      *v;
1156   PetscErrorCode ierr;
1157   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1158   PetscBool      usecprow=a->compressedrow.use;
1159 
1160   PetscFunctionBegin;
1161   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1162   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1163   if (zz != yy) {
1164     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1165   } else {
1166     zarray = yarray;
1167   }
1168 
1169   idx = a->j;
1170   v   = a->a;
1171   if (usecprow){
1172     if (zz != yy){
1173       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1174     }
1175     mbs  = a->compressedrow.nrows;
1176     ii   = a->compressedrow.i;
1177     ridx = a->compressedrow.rindex;
1178   } else {
1179     ii  = a->i;
1180     y   = yarray;
1181     z   = zarray;
1182   }
1183 
1184   for (i=0; i<mbs; i++) {
1185     n  = ii[1] - ii[0]; ii++;
1186     if (usecprow){
1187       z = zarray + 3*ridx[i];
1188       y = yarray + 3*ridx[i];
1189     }
1190     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1191     for (j=0; j<n; j++) {
1192       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1193       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1194       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1195       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1196       v += 9;
1197     }
1198     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1199     if (!usecprow){
1200       z += 3; y += 3;
1201     }
1202   }
1203   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1204   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1205   if (zz != yy) {
1206     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1207   }
1208   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1209   PetscFunctionReturn(0);
1210 }
1211 
1212 #undef __FUNCT__
1213 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1214 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1215 {
1216   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1217   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1218   MatScalar      *v;
1219   PetscErrorCode ierr;
1220   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1221   PetscBool      usecprow=a->compressedrow.use;
1222 
1223   PetscFunctionBegin;
1224   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1225   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1226   if (zz != yy) {
1227     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1228   } else {
1229     zarray = yarray;
1230   }
1231 
1232   idx   = a->j;
1233   v     = a->a;
1234   if (usecprow){
1235     if (zz != yy){
1236       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1237     }
1238     mbs  = a->compressedrow.nrows;
1239     ii   = a->compressedrow.i;
1240     ridx = a->compressedrow.rindex;
1241   } else {
1242     ii  = a->i;
1243     y   = yarray;
1244     z   = zarray;
1245   }
1246 
1247   for (i=0; i<mbs; i++) {
1248     n  = ii[1] - ii[0]; ii++;
1249     if (usecprow){
1250       z = zarray + 4*ridx[i];
1251       y = yarray + 4*ridx[i];
1252     }
1253     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1254     for (j=0; j<n; j++) {
1255       xb = x + 4*(*idx++);
1256       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1257       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1258       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1259       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1260       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1261       v += 16;
1262     }
1263     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1264     if (!usecprow){
1265       z += 4; y += 4;
1266     }
1267   }
1268   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1269   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1270   if (zz != yy) {
1271     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1272   }
1273   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1274   PetscFunctionReturn(0);
1275 }
1276 
1277 #undef __FUNCT__
1278 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1279 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1280 {
1281   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1282   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1283   PetscScalar    *yarray,*zarray;
1284   MatScalar      *v;
1285   PetscErrorCode ierr;
1286   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1287   PetscBool      usecprow=a->compressedrow.use;
1288 
1289   PetscFunctionBegin;
1290   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1291   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1292   if (zz != yy) {
1293     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1294   } else {
1295     zarray = yarray;
1296   }
1297 
1298   idx = a->j;
1299   v   = a->a;
1300   if (usecprow){
1301     if (zz != yy){
1302       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1303     }
1304     mbs  = a->compressedrow.nrows;
1305     ii   = a->compressedrow.i;
1306     ridx = a->compressedrow.rindex;
1307   } else {
1308     ii  = a->i;
1309     y   = yarray;
1310     z   = zarray;
1311   }
1312 
1313   for (i=0; i<mbs; i++) {
1314     n  = ii[1] - ii[0]; ii++;
1315     if (usecprow){
1316       z = zarray + 5*ridx[i];
1317       y = yarray + 5*ridx[i];
1318     }
1319     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1320     for (j=0; j<n; j++) {
1321       xb = x + 5*(*idx++);
1322       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1323       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1324       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1325       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1326       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1327       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1328       v += 25;
1329     }
1330     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1331     if (!usecprow){
1332       z += 5; y += 5;
1333     }
1334   }
1335   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1336   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1337   if (zz != yy) {
1338     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1339   }
1340   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1341   PetscFunctionReturn(0);
1342 }
1343 #undef __FUNCT__
1344 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1345 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1346 {
1347   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1348   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1349   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1350   MatScalar      *v;
1351   PetscErrorCode ierr;
1352   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1353   PetscBool      usecprow=a->compressedrow.use;
1354 
1355   PetscFunctionBegin;
1356   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1357   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1358   if (zz != yy) {
1359     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1360   } else {
1361     zarray = yarray;
1362   }
1363 
1364   idx = a->j;
1365   v   = a->a;
1366   if (usecprow){
1367     if (zz != yy){
1368       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1369     }
1370     mbs  = a->compressedrow.nrows;
1371     ii   = a->compressedrow.i;
1372     ridx = a->compressedrow.rindex;
1373   } else {
1374     ii  = a->i;
1375     y   = yarray;
1376     z   = zarray;
1377   }
1378 
1379   for (i=0; i<mbs; i++) {
1380     n  = ii[1] - ii[0]; ii++;
1381     if (usecprow){
1382       z = zarray + 6*ridx[i];
1383       y = yarray + 6*ridx[i];
1384     }
1385     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1386     for (j=0; j<n; j++) {
1387       xb = x + 6*(*idx++);
1388       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1389       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1390       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1391       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1392       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1393       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1394       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1395       v += 36;
1396     }
1397     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1398     if (!usecprow){
1399       z += 6; y += 6;
1400     }
1401   }
1402   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1403   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1404   if (zz != yy) {
1405     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1406   }
1407   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1408   PetscFunctionReturn(0);
1409 }
1410 
1411 #undef __FUNCT__
1412 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1413 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1414 {
1415   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1416   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1417   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1418   MatScalar      *v;
1419   PetscErrorCode ierr;
1420   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1421   PetscBool      usecprow=a->compressedrow.use;
1422 
1423   PetscFunctionBegin;
1424   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1425   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1426   if (zz != yy) {
1427     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1428   } else {
1429     zarray = yarray;
1430   }
1431 
1432   idx = a->j;
1433   v   = a->a;
1434   if (usecprow){
1435     if (zz != yy){
1436       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1437     }
1438     mbs  = a->compressedrow.nrows;
1439     ii   = a->compressedrow.i;
1440     ridx = a->compressedrow.rindex;
1441   } else {
1442     ii  = a->i;
1443     y   = yarray;
1444     z   = zarray;
1445   }
1446 
1447   for (i=0; i<mbs; i++) {
1448     n  = ii[1] - ii[0]; ii++;
1449     if (usecprow){
1450       z = zarray + 7*ridx[i];
1451       y = yarray + 7*ridx[i];
1452     }
1453     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1454     for (j=0; j<n; j++) {
1455       xb = x + 7*(*idx++);
1456       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1457       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1458       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1459       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1460       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1461       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1462       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1463       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1464       v += 49;
1465     }
1466     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1467     if (!usecprow){
1468       z += 7; y += 7;
1469     }
1470   }
1471   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1472   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1473   if (zz != yy) {
1474     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1475   }
1476   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1477   PetscFunctionReturn(0);
1478 }
1479 
1480 #undef __FUNCT__
1481 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1482 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1483 {
1484   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1485   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1486   MatScalar      *v;
1487   PetscErrorCode ierr;
1488   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1489   PetscInt       ncols,k,*ridx=PETSC_NULL;
1490   PetscBool      usecprow=a->compressedrow.use;
1491 
1492   PetscFunctionBegin;
1493   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1494   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1495   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1496 
1497   idx = a->j;
1498   v   = a->a;
1499   if (usecprow){
1500     mbs    = a->compressedrow.nrows;
1501     ii     = a->compressedrow.i;
1502     ridx = a->compressedrow.rindex;
1503   } else {
1504     mbs = a->mbs;
1505     ii  = a->i;
1506     z   = zarray;
1507   }
1508 
1509   if (!a->mult_work) {
1510     k    = PetscMax(A->rmap->n,A->cmap->n);
1511     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1512   }
1513   work = a->mult_work;
1514   for (i=0; i<mbs; i++) {
1515     n     = ii[1] - ii[0]; ii++;
1516     ncols = n*bs;
1517     workt = work;
1518     for (j=0; j<n; j++) {
1519       xb = x + bs*(*idx++);
1520       for (k=0; k<bs; k++) workt[k] = xb[k];
1521       workt += bs;
1522     }
1523     if (usecprow) z = zarray + bs*ridx[i];
1524     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1525     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1526     v += n*bs2;
1527     if (!usecprow){
1528       z += bs;
1529     }
1530   }
1531   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1532   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1533   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1534   PetscFunctionReturn(0);
1535 }
1536 
1537 #undef __FUNCT__
1538 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1539 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1540 {
1541   PetscScalar    zero = 0.0;
1542   PetscErrorCode ierr;
1543 
1544   PetscFunctionBegin;
1545   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1546   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1547   PetscFunctionReturn(0);
1548 }
1549 
1550 #undef __FUNCT__
1551 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1552 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1553 {
1554   PetscScalar    zero = 0.0;
1555   PetscErrorCode ierr;
1556 
1557   PetscFunctionBegin;
1558   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1559   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1560   PetscFunctionReturn(0);
1561 }
1562 
1563 #undef __FUNCT__
1564 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1565 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1566 
1567 {
1568   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1569   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1570   MatScalar         *v;
1571   PetscErrorCode    ierr;
1572   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1573   Mat_CompressedRow cprow = a->compressedrow;
1574   PetscBool         usecprow=cprow.use;
1575 
1576   PetscFunctionBegin;
1577   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1578   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1579   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1580 
1581   idx = a->j;
1582   v   = a->a;
1583   if (usecprow){
1584     mbs  = cprow.nrows;
1585     ii   = cprow.i;
1586     ridx = cprow.rindex;
1587   } else {
1588     mbs=a->mbs;
1589     ii = a->i;
1590     xb = x;
1591   }
1592 
1593   switch (bs) {
1594   case 1:
1595     for (i=0; i<mbs; i++) {
1596       if (usecprow) xb = x + ridx[i];
1597       x1 = xb[0];
1598       ib = idx + ii[0];
1599       n  = ii[1] - ii[0]; ii++;
1600       for (j=0; j<n; j++) {
1601         rval    = ib[j];
1602         z[rval] += PetscConj(*v) * x1;
1603         v++;
1604       }
1605       if (!usecprow) xb++;
1606     }
1607     break;
1608   case 2:
1609     for (i=0; i<mbs; i++) {
1610       if (usecprow) xb = x + 2*ridx[i];
1611       x1 = xb[0]; x2 = xb[1];
1612       ib = idx + ii[0];
1613       n  = ii[1] - ii[0]; ii++;
1614       for (j=0; j<n; j++) {
1615         rval      = ib[j]*2;
1616         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1617         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1618         v  += 4;
1619       }
1620       if (!usecprow) xb += 2;
1621     }
1622     break;
1623   case 3:
1624     for (i=0; i<mbs; i++) {
1625       if (usecprow) xb = x + 3*ridx[i];
1626       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1627       ib = idx + ii[0];
1628       n  = ii[1] - ii[0]; ii++;
1629       for (j=0; j<n; j++) {
1630         rval      = ib[j]*3;
1631         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1632         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1633         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1634         v  += 9;
1635       }
1636       if (!usecprow) xb += 3;
1637     }
1638     break;
1639   case 4:
1640     for (i=0; i<mbs; i++) {
1641       if (usecprow) xb = x + 4*ridx[i];
1642       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1643       ib = idx + ii[0];
1644       n  = ii[1] - ii[0]; ii++;
1645       for (j=0; j<n; j++) {
1646         rval      = ib[j]*4;
1647         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1648         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1649         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1650         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1651         v  += 16;
1652       }
1653       if (!usecprow) xb += 4;
1654     }
1655     break;
1656   case 5:
1657     for (i=0; i<mbs; i++) {
1658       if (usecprow) xb = x + 5*ridx[i];
1659       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1660       x4 = xb[3]; x5 = xb[4];
1661       ib = idx + ii[0];
1662       n  = ii[1] - ii[0]; ii++;
1663       for (j=0; j<n; j++) {
1664         rval      = ib[j]*5;
1665         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1666         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1667         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1668         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1669         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1670         v  += 25;
1671       }
1672       if (!usecprow) xb += 5;
1673     }
1674     break;
1675   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1676       PetscInt     ncols,k;
1677       PetscScalar  *work,*workt,*xtmp;
1678 
1679       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1680       if (!a->mult_work) {
1681         k = PetscMax(A->rmap->n,A->cmap->n);
1682         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1683       }
1684       work = a->mult_work;
1685       xtmp = x;
1686       for (i=0; i<mbs; i++) {
1687         n     = ii[1] - ii[0]; ii++;
1688         ncols = n*bs;
1689         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1690         if (usecprow) {
1691           xtmp = x + bs*ridx[i];
1692         }
1693         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1694         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1695         v += n*bs2;
1696         if (!usecprow) xtmp += bs;
1697         workt = work;
1698         for (j=0; j<n; j++) {
1699           zb = z + bs*(*idx++);
1700           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1701           workt += bs;
1702         }
1703       }
1704     }
1705   }
1706   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1707   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1708   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1709   PetscFunctionReturn(0);
1710 }
1711 
1712 #undef __FUNCT__
1713 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1714 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1715 {
1716   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1717   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1718   MatScalar         *v;
1719   PetscErrorCode    ierr;
1720   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1721   Mat_CompressedRow cprow = a->compressedrow;
1722   PetscBool         usecprow=cprow.use;
1723 
1724   PetscFunctionBegin;
1725   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1726   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1727   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1728 
1729   idx = a->j;
1730   v   = a->a;
1731   if (usecprow){
1732     mbs  = cprow.nrows;
1733     ii   = cprow.i;
1734     ridx = cprow.rindex;
1735   } else {
1736     mbs=a->mbs;
1737     ii = a->i;
1738     xb = x;
1739   }
1740 
1741   switch (bs) {
1742   case 1:
1743     for (i=0; i<mbs; i++) {
1744       if (usecprow) xb = x + ridx[i];
1745       x1 = xb[0];
1746       ib = idx + ii[0];
1747       n  = ii[1] - ii[0]; ii++;
1748       for (j=0; j<n; j++) {
1749         rval    = ib[j];
1750         z[rval] += *v * x1;
1751         v++;
1752       }
1753       if (!usecprow) xb++;
1754     }
1755     break;
1756   case 2:
1757     for (i=0; i<mbs; i++) {
1758       if (usecprow) xb = x + 2*ridx[i];
1759       x1 = xb[0]; x2 = xb[1];
1760       ib = idx + ii[0];
1761       n  = ii[1] - ii[0]; ii++;
1762       for (j=0; j<n; j++) {
1763         rval      = ib[j]*2;
1764         z[rval++] += v[0]*x1 + v[1]*x2;
1765         z[rval++] += v[2]*x1 + v[3]*x2;
1766         v  += 4;
1767       }
1768       if (!usecprow) xb += 2;
1769     }
1770     break;
1771   case 3:
1772     for (i=0; i<mbs; i++) {
1773       if (usecprow) xb = x + 3*ridx[i];
1774       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1775       ib = idx + ii[0];
1776       n  = ii[1] - ii[0]; ii++;
1777       for (j=0; j<n; j++) {
1778         rval      = ib[j]*3;
1779         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1780         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1781         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1782         v  += 9;
1783       }
1784       if (!usecprow) xb += 3;
1785     }
1786     break;
1787   case 4:
1788     for (i=0; i<mbs; i++) {
1789       if (usecprow) xb = x + 4*ridx[i];
1790       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1791       ib = idx + ii[0];
1792       n  = ii[1] - ii[0]; ii++;
1793       for (j=0; j<n; j++) {
1794         rval      = ib[j]*4;
1795         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1796         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1797         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1798         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1799         v  += 16;
1800       }
1801       if (!usecprow) xb += 4;
1802     }
1803     break;
1804   case 5:
1805     for (i=0; i<mbs; i++) {
1806       if (usecprow) xb = x + 5*ridx[i];
1807       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1808       x4 = xb[3]; x5 = xb[4];
1809       ib = idx + ii[0];
1810       n  = ii[1] - ii[0]; ii++;
1811       for (j=0; j<n; j++) {
1812         rval      = ib[j]*5;
1813         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1814         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1815         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1816         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1817         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1818         v  += 25;
1819       }
1820       if (!usecprow) xb += 5;
1821     }
1822     break;
1823   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1824       PetscInt     ncols,k;
1825       PetscScalar  *work,*workt,*xtmp;
1826 
1827       if (!a->mult_work) {
1828         k = PetscMax(A->rmap->n,A->cmap->n);
1829         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1830       }
1831       work = a->mult_work;
1832       xtmp = x;
1833       for (i=0; i<mbs; i++) {
1834         n     = ii[1] - ii[0]; ii++;
1835         ncols = n*bs;
1836         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1837         if (usecprow) {
1838           xtmp = x + bs*ridx[i];
1839         }
1840         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1841         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1842         v += n*bs2;
1843         if (!usecprow) xtmp += bs;
1844         workt = work;
1845         for (j=0; j<n; j++) {
1846           zb = z + bs*(*idx++);
1847           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1848           workt += bs;
1849         }
1850       }
1851     }
1852   }
1853   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1854   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1855   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1856   PetscFunctionReturn(0);
1857 }
1858 
1859 #undef __FUNCT__
1860 #define __FUNCT__ "MatScale_SeqBAIJ"
1861 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1862 {
1863   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)inA->data;
1864   PetscInt       totalnz = a->bs2*a->nz;
1865   PetscScalar    oalpha = alpha;
1866   PetscErrorCode ierr;
1867   PetscBLASInt   one = 1,tnz = PetscBLASIntCast(totalnz);
1868 
1869   PetscFunctionBegin;
1870   BLASscal_(&tnz,&oalpha,a->a,&one);
1871   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1872   PetscFunctionReturn(0);
1873 }
1874 
1875 #undef __FUNCT__
1876 #define __FUNCT__ "MatNorm_SeqBAIJ"
1877 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1878 {
1879   PetscErrorCode ierr;
1880   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1881   MatScalar      *v = a->a;
1882   PetscReal      sum = 0.0;
1883   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1884 
1885   PetscFunctionBegin;
1886   if (type == NORM_FROBENIUS) {
1887     for (i=0; i< bs2*nz; i++) {
1888 #if defined(PETSC_USE_COMPLEX)
1889       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1890 #else
1891       sum += (*v)*(*v); v++;
1892 #endif
1893     }
1894     *norm = sqrt(sum);
1895   } else if (type == NORM_1) { /* maximum column sum */
1896     PetscReal *tmp;
1897     PetscInt  *bcol = a->j;
1898     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
1899     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
1900     for (i=0; i<nz; i++){
1901       for (j=0; j<bs; j++){
1902         k1 = bs*(*bcol) + j; /* column index */
1903         for (k=0; k<bs; k++){
1904           tmp[k1] += PetscAbsScalar(*v); v++;
1905         }
1906       }
1907       bcol++;
1908     }
1909     *norm = 0.0;
1910     for (j=0; j<A->cmap->n; j++) {
1911       if (tmp[j] > *norm) *norm = tmp[j];
1912     }
1913     ierr = PetscFree(tmp);CHKERRQ(ierr);
1914   } else if (type == NORM_INFINITY) { /* maximum row sum */
1915     *norm = 0.0;
1916     for (k=0; k<bs; k++) {
1917       for (j=0; j<a->mbs; j++) {
1918         v = a->a + bs2*a->i[j] + k;
1919         sum = 0.0;
1920         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1921           for (k1=0; k1<bs; k1++){
1922             sum += PetscAbsScalar(*v);
1923             v   += bs;
1924           }
1925         }
1926         if (sum > *norm) *norm = sum;
1927       }
1928     }
1929   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1930   PetscFunctionReturn(0);
1931 }
1932 
1933 
1934 #undef __FUNCT__
1935 #define __FUNCT__ "MatEqual_SeqBAIJ"
1936 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1937 {
1938   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1939   PetscErrorCode ierr;
1940 
1941   PetscFunctionBegin;
1942   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1943   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1944     *flg = PETSC_FALSE;
1945     PetscFunctionReturn(0);
1946   }
1947 
1948   /* if the a->i are the same */
1949   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1950   if (!*flg) {
1951     PetscFunctionReturn(0);
1952   }
1953 
1954   /* if a->j are the same */
1955   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1956   if (!*flg) {
1957     PetscFunctionReturn(0);
1958   }
1959   /* if a->a are the same */
1960   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1961   PetscFunctionReturn(0);
1962 
1963 }
1964 
1965 #undef __FUNCT__
1966 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1967 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1968 {
1969   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1970   PetscErrorCode ierr;
1971   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1972   PetscScalar    *x,zero = 0.0;
1973   MatScalar      *aa,*aa_j;
1974 
1975   PetscFunctionBegin;
1976   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1977   bs   = A->rmap->bs;
1978   aa   = a->a;
1979   ai   = a->i;
1980   aj   = a->j;
1981   ambs = a->mbs;
1982   bs2  = a->bs2;
1983 
1984   ierr = VecSet(v,zero);CHKERRQ(ierr);
1985   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1986   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1987   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1988   for (i=0; i<ambs; i++) {
1989     for (j=ai[i]; j<ai[i+1]; j++) {
1990       if (aj[j] == i) {
1991         row  = i*bs;
1992         aa_j = aa+j*bs2;
1993         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1994         break;
1995       }
1996     }
1997   }
1998   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1999   PetscFunctionReturn(0);
2000 }
2001 
2002 #undef __FUNCT__
2003 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2004 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2005 {
2006   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2007   const PetscScalar *l,*r,*li,*ri;
2008   PetscScalar       x;
2009   MatScalar         *aa, *v;
2010   PetscErrorCode    ierr;
2011   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2012   const PetscInt    *ai,*aj;
2013 
2014   PetscFunctionBegin;
2015   ai  = a->i;
2016   aj  = a->j;
2017   aa  = a->a;
2018   m   = A->rmap->n;
2019   n   = A->cmap->n;
2020   bs  = A->rmap->bs;
2021   mbs = a->mbs;
2022   bs2 = a->bs2;
2023   if (ll) {
2024     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2025     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2026     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2027     for (i=0; i<mbs; i++) { /* for each block row */
2028       M  = ai[i+1] - ai[i];
2029       li = l + i*bs;
2030       v  = aa + bs2*ai[i];
2031       for (j=0; j<M; j++) { /* for each block */
2032         for (k=0; k<bs2; k++) {
2033           (*v++) *= li[k%bs];
2034         }
2035       }
2036     }
2037     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2038     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2039   }
2040 
2041   if (rr) {
2042     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2043     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2044     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2045     for (i=0; i<mbs; i++) { /* for each block row */
2046       iai = ai[i];
2047       M   = ai[i+1] - iai;
2048       v   = aa + bs2*iai;
2049       for (j=0; j<M; j++) { /* for each block */
2050         ri = r + bs*aj[iai+j];
2051         for (k=0; k<bs; k++) {
2052           x = ri[k];
2053           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2054           v += bs;
2055         }
2056       }
2057     }
2058     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2059     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2060   }
2061   PetscFunctionReturn(0);
2062 }
2063 
2064 
2065 #undef __FUNCT__
2066 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2067 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2068 {
2069   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2070 
2071   PetscFunctionBegin;
2072   info->block_size     = a->bs2;
2073   info->nz_allocated   = a->bs2*a->maxnz;
2074   info->nz_used        = a->bs2*a->nz;
2075   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
2076   info->assemblies   = A->num_ass;
2077   info->mallocs      = A->info.mallocs;
2078   info->memory       = ((PetscObject)A)->mem;
2079   if (A->factortype) {
2080     info->fill_ratio_given  = A->info.fill_ratio_given;
2081     info->fill_ratio_needed = A->info.fill_ratio_needed;
2082     info->factor_mallocs    = A->info.factor_mallocs;
2083   } else {
2084     info->fill_ratio_given  = 0;
2085     info->fill_ratio_needed = 0;
2086     info->factor_mallocs    = 0;
2087   }
2088   PetscFunctionReturn(0);
2089 }
2090 
2091 
2092 #undef __FUNCT__
2093 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2094 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2095 {
2096   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2097   PetscErrorCode ierr;
2098 
2099   PetscFunctionBegin;
2100   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2101   PetscFunctionReturn(0);
2102 }
2103