xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision d736bfeb4d37a01fcbdf00fe73fb60d6f0ba2142)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 #include "petscblaslapack.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
15   const PetscInt *idx;
16   PetscInt       start,end,*ai,*aj,bs,*nidx2;
17   PetscBT        table;
18 
19   PetscFunctionBegin;
20   m     = a->mbs;
21   ai    = a->i;
22   aj    = a->j;
23   bs    = A->rmap->bs;
24 
25   if (ov < 0)  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26 
27   ierr = PetscBTCreate(m,table);CHKERRQ(ierr);
28   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
29   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
30 
31   for (i=0; i<is_max; i++) {
32     /* Initialise the two local arrays */
33     isz  = 0;
34     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
38     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
39 
40     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
41     for (j=0; j<n ; ++j){
42       ival = idx[j]/bs; /* convert the indices into block indices */
43       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
45     }
46     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
47     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
48 
49     k = 0;
50     for (j=0; j<ov; j++){ /* for each overlap*/
51       n = isz;
52       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
53         row   = nidx[k];
54         start = ai[row];
55         end   = ai[row+1];
56         for (l = start; l<end ; l++){
57           val = aj[l];
58           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
59         }
60       }
61     }
62     /* expand the Index Set */
63     for (j=0; j<isz; j++) {
64       for (k=0; k<bs; k++)
65         nidx2[j*bs+k] = nidx[j]*bs+k;
66     }
67     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
68   }
69   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
70   ierr = PetscFree(nidx);CHKERRQ(ierr);
71   ierr = PetscFree(nidx2);CHKERRQ(ierr);
72   PetscFunctionReturn(0);
73 }
74 
75 #undef __FUNCT__
76 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
77 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
78 {
79   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
80   PetscErrorCode ierr;
81   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
82   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
83   const PetscInt *irow,*icol;
84   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
85   PetscInt       *aj = a->j,*ai = a->i;
86   MatScalar      *mat_a;
87   Mat            C;
88   PetscTruth     flag,sorted;
89 
90   PetscFunctionBegin;
91   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
92   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
93 
94   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
95   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
96   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
97   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
98 
99   ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
100   ssmap = smap;
101   ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
102   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
103   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104   /* determine lens of each row */
105   for (i=0; i<nrows; i++) {
106     kstart  = ai[irow[i]];
107     kend    = kstart + a->ilen[irow[i]];
108     lens[i] = 0;
109       for (k=kstart; k<kend; k++) {
110         if (ssmap[aj[k]]) {
111           lens[i]++;
112         }
113       }
114     }
115   /* Create and fill new matrix */
116   if (scall == MAT_REUSE_MATRIX) {
117     c = (Mat_SeqBAIJ *)((*B)->data);
118 
119     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
120     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
121     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
122     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
123     C = *B;
124   } else {
125     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
126     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
127     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
128     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
129   }
130   c = (Mat_SeqBAIJ *)(C->data);
131   for (i=0; i<nrows; i++) {
132     row    = irow[i];
133     kstart = ai[row];
134     kend   = kstart + a->ilen[row];
135     mat_i  = c->i[i];
136     mat_j  = c->j + mat_i;
137     mat_a  = c->a + mat_i*bs2;
138     mat_ilen = c->ilen + i;
139     for (k=kstart; k<kend; k++) {
140       if ((tcol=ssmap[a->j[k]])) {
141         *mat_j++ = tcol - 1;
142         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
143         mat_a   += bs2;
144         (*mat_ilen)++;
145       }
146     }
147   }
148 
149   /* Free work space */
150   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
151   ierr = PetscFree(smap);CHKERRQ(ierr);
152   ierr = PetscFree(lens);CHKERRQ(ierr);
153   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
154   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
155 
156   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
157   *B = C;
158   PetscFunctionReturn(0);
159 }
160 
161 #undef __FUNCT__
162 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
163 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164 {
165   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166   IS             is1,is2;
167   PetscErrorCode ierr;
168   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
169   const PetscInt *irow,*icol;
170 
171   PetscFunctionBegin;
172   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
173   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
174   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
175   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
176 
177   /* Verify if the indices corespond to each element in a block
178    and form the IS with compressed IS */
179   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
180   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
181   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
182   count = 0;
183   for (i=0; i<a->mbs; i++) {
184     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185     if (vary[i]==bs) iary[count++] = i;
186   }
187   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
188 
189   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
190   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
191   count = 0;
192   for (i=0; i<a->mbs; i++) {
193     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
194     if (vary[i]==bs) iary[count++] = i;
195   }
196   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr);
197   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
198   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
199   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
200 
201   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
202   ISDestroy(is1);
203   ISDestroy(is2);
204   PetscFunctionReturn(0);
205 }
206 
207 #undef __FUNCT__
208 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
209 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
210 {
211   PetscErrorCode ierr;
212   PetscInt       i;
213 
214   PetscFunctionBegin;
215   if (scall == MAT_INITIAL_MATRIX) {
216     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
217   }
218 
219   for (i=0; i<n; i++) {
220     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
221   }
222   PetscFunctionReturn(0);
223 }
224 
225 
226 /* -------------------------------------------------------*/
227 /* Should check that shapes of vectors and matrices match */
228 /* -------------------------------------------------------*/
229 
230 #undef __FUNCT__
231 #define __FUNCT__ "MatMult_SeqBAIJ_1"
232 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
233 {
234   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
235   PetscScalar       *z,sum;
236   const PetscScalar *x;
237   const MatScalar   *v;
238   PetscErrorCode    ierr;
239   PetscInt          mbs,i,n,nonzerorow=0;
240   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
241   PetscTruth        usecprow=a->compressedrow.use;
242 
243   PetscFunctionBegin;
244   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
245   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
246 
247   if (usecprow){
248     mbs  = a->compressedrow.nrows;
249     ii   = a->compressedrow.i;
250     ridx = a->compressedrow.rindex;
251     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
252   } else {
253     mbs = a->mbs;
254     ii  = a->i;
255   }
256 
257   for (i=0; i<mbs; i++) {
258     n    = ii[1] - ii[0];
259     v    = a->a + ii[0];
260     idx  = a->j + ii[0];
261     ii++;
262     sum  = 0.0;
263     PetscSparseDensePlusDot(sum,x,v,idx,n);
264     if (usecprow){
265       z[ridx[i]] = sum;
266     } else {
267       nonzerorow += (n>0);
268       z[i] = sum;
269     }
270   }
271   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
272   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
273   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
274   PetscFunctionReturn(0);
275 }
276 
277 #undef __FUNCT__
278 #define __FUNCT__ "MatMult_SeqBAIJ_2"
279 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
280 {
281   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
282   PetscScalar       *z = 0,sum1,sum2,*zarray;
283   const PetscScalar *x,*xb;
284   PetscScalar       x1,x2;
285   const MatScalar   *v;
286   PetscErrorCode    ierr;
287   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
288   PetscTruth        usecprow=a->compressedrow.use;
289 
290   PetscFunctionBegin;
291   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
292   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
293 
294   idx = a->j;
295   v   = a->a;
296   if (usecprow){
297     mbs  = a->compressedrow.nrows;
298     ii   = a->compressedrow.i;
299     ridx = a->compressedrow.rindex;
300   } else {
301     mbs = a->mbs;
302     ii  = a->i;
303     z   = zarray;
304   }
305 
306   for (i=0; i<mbs; i++) {
307     n  = ii[1] - ii[0]; ii++;
308     sum1 = 0.0; sum2 = 0.0;
309     nonzerorow += (n>0);
310     for (j=0; j<n; j++) {
311       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
312       sum1 += v[0]*x1 + v[2]*x2;
313       sum2 += v[1]*x1 + v[3]*x2;
314       v += 4;
315     }
316     if (usecprow) z = zarray + 2*ridx[i];
317     z[0] = sum1; z[1] = sum2;
318     if (!usecprow) z += 2;
319   }
320   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
321   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
322   ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr);
323   PetscFunctionReturn(0);
324 }
325 
326 #undef __FUNCT__
327 #define __FUNCT__ "MatMult_SeqBAIJ_3"
328 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
329 {
330   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
331   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
332   const PetscScalar *x,*xb;
333   const MatScalar   *v;
334   PetscErrorCode    ierr;
335   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
336   PetscTruth        usecprow=a->compressedrow.use;
337 
338 
339 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
340 #pragma disjoint(*v,*z,*xb)
341 #endif
342 
343   PetscFunctionBegin;
344   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
345   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
346 
347   idx = a->j;
348   v   = a->a;
349   if (usecprow){
350     mbs  = a->compressedrow.nrows;
351     ii   = a->compressedrow.i;
352     ridx = a->compressedrow.rindex;
353   } else {
354     mbs = a->mbs;
355     ii  = a->i;
356     z   = zarray;
357   }
358 
359   for (i=0; i<mbs; i++) {
360     n  = ii[1] - ii[0]; ii++;
361     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
362     nonzerorow += (n>0);
363     for (j=0; j<n; j++) {
364       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
365       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
366       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
367       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
368       v += 9;
369     }
370     if (usecprow) z = zarray + 3*ridx[i];
371     z[0] = sum1; z[1] = sum2; z[2] = sum3;
372     if (!usecprow) z += 3;
373   }
374   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
375   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
376   ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr);
377   PetscFunctionReturn(0);
378 }
379 
380 #undef __FUNCT__
381 #define __FUNCT__ "MatMult_SeqBAIJ_4"
382 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
383 {
384   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
385   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
386   const PetscScalar *x,*xb;
387   const MatScalar   *v;
388   PetscErrorCode    ierr;
389   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
390   PetscTruth        usecprow=a->compressedrow.use;
391 
392   PetscFunctionBegin;
393   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
394   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
395 
396   idx = a->j;
397   v   = a->a;
398   if (usecprow){
399     mbs  = a->compressedrow.nrows;
400     ii   = a->compressedrow.i;
401     ridx = a->compressedrow.rindex;
402   } else {
403     mbs = a->mbs;
404     ii  = a->i;
405     z   = zarray;
406   }
407 
408   for (i=0; i<mbs; i++) {
409     n  = ii[1] - ii[0]; ii++;
410     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
411     nonzerorow += (n>0);
412     for (j=0; j<n; j++) {
413       xb = x + 4*(*idx++);
414       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
415       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
416       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
417       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
418       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
419       v += 16;
420     }
421     if (usecprow) z = zarray + 4*ridx[i];
422     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
423     if (!usecprow) z += 4;
424   }
425   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
426   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
427   ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 #undef __FUNCT__
432 #define __FUNCT__ "MatMult_SeqBAIJ_5"
433 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
434 {
435   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
436   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
437   const PetscScalar *xb,*x;
438   const MatScalar   *v;
439   PetscErrorCode    ierr;
440   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
441   PetscInt          mbs,i,j,n,nonzerorow=0;
442   PetscTruth        usecprow=a->compressedrow.use;
443 
444   PetscFunctionBegin;
445   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
446   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
447 
448   idx = a->j;
449   v   = a->a;
450   if (usecprow){
451     mbs  = a->compressedrow.nrows;
452     ii   = a->compressedrow.i;
453     ridx = a->compressedrow.rindex;
454   } else {
455     mbs = a->mbs;
456     ii  = a->i;
457     z   = zarray;
458   }
459 
460   for (i=0; i<mbs; i++) {
461     n  = ii[1] - ii[0]; ii++;
462     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
463     nonzerorow += (n>0);
464     for (j=0; j<n; j++) {
465       xb = x + 5*(*idx++);
466       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
467       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
468       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
469       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
470       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
471       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
472       v += 25;
473     }
474     if (usecprow) z = zarray + 5*ridx[i];
475     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
476     if (!usecprow) z += 5;
477   }
478   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
479   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
480   ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr);
481   PetscFunctionReturn(0);
482 }
483 
484 
485 #undef __FUNCT__
486 #define __FUNCT__ "MatMult_SeqBAIJ_6"
487 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
488 {
489   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
490   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
491   const PetscScalar *x,*xb;
492   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
493   const MatScalar   *v;
494   PetscErrorCode    ierr;
495   PetscInt          mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
496   PetscTruth        usecprow=a->compressedrow.use;
497 
498   PetscFunctionBegin;
499   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
500   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
501 
502   idx = a->j;
503   v   = a->a;
504   if (usecprow){
505     mbs  = a->compressedrow.nrows;
506     ii   = a->compressedrow.i;
507     ridx = a->compressedrow.rindex;
508   } else {
509     mbs = a->mbs;
510     ii  = a->i;
511     z   = zarray;
512   }
513 
514   for (i=0; i<mbs; i++) {
515     n  = ii[1] - ii[0]; ii++;
516     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
517     nonzerorow += (n>0);
518     for (j=0; j<n; j++) {
519       xb = x + 6*(*idx++);
520       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
521       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
522       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
523       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
524       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
525       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
526       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
527       v += 36;
528     }
529     if (usecprow) z = zarray + 6*ridx[i];
530     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
531     if (!usecprow) z += 6;
532   }
533 
534   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
535   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
536   ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr);
537   PetscFunctionReturn(0);
538 }
539 
540 #undef __FUNCT__
541 #define __FUNCT__ "MatMult_SeqBAIJ_7"
542 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
543 {
544   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
545   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
546   const PetscScalar *x,*xb;
547   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
548   const MatScalar   *v;
549   PetscErrorCode    ierr;
550   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
551   PetscTruth        usecprow=a->compressedrow.use;
552 
553   PetscFunctionBegin;
554   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
555   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
556 
557   idx = a->j;
558   v   = a->a;
559   if (usecprow){
560     mbs    = a->compressedrow.nrows;
561     ii     = a->compressedrow.i;
562     ridx = a->compressedrow.rindex;
563   } else {
564     mbs = a->mbs;
565     ii  = a->i;
566     z   = zarray;
567   }
568 
569   for (i=0; i<mbs; i++) {
570     n  = ii[1] - ii[0]; ii++;
571     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
572     nonzerorow += (n>0);
573     for (j=0; j<n; j++) {
574       xb = x + 7*(*idx++);
575       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
576       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
577       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
578       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
579       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
580       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
581       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
582       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
583       v += 49;
584     }
585     if (usecprow) z = zarray + 7*ridx[i];
586     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
587     if (!usecprow) z += 7;
588   }
589 
590   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
591   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
592   ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr);
593   PetscFunctionReturn(0);
594 }
595 
596 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
597 /* Default MatMult for block size 15 */
598 
599 #undef __FUNCT__
600 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
601 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
602 {
603   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
604   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
605   const PetscScalar *x,*xb;
606   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
607   const MatScalar   *v;
608   PetscErrorCode    ierr;
609   const PetscInt    *ii,*ij=a->j,*idx;
610   PetscInt          mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0;
611   PetscTruth        usecprow=a->compressedrow.use;
612 
613   PetscFunctionBegin;
614   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
615   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
616 
617   v   = a->a;
618   if (usecprow){
619     mbs    = a->compressedrow.nrows;
620     ii     = a->compressedrow.i;
621     ridx = a->compressedrow.rindex;
622   } else {
623     mbs = a->mbs;
624     ii  = a->i;
625     z   = zarray;
626   }
627 
628   for (i=0; i<mbs; i++) {
629     n  = ii[i+1] - ii[i];
630     idx = ij + ii[i];
631     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
632     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
633 
634     nonzerorow += (n>0);
635     for (j=0; j<n; j++) {
636       xb = x + 15*(idx[j]);
637       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
638       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
639 
640       for(k=0;k<15;k++){
641 	sum1  += v[0]*xb[k];
642         sum2  += v[1]*xb[k];
643 	sum3  += v[2]*xb[k];
644 	sum4  += v[3]*xb[k];
645 	sum5  += v[4]*xb[k];
646         sum6  += v[5]*xb[k];
647 	sum7  += v[6]*xb[k];
648 	sum8  += v[7]*xb[k];
649         sum9  += v[8]*xb[k];
650         sum10 += v[9]*xb[k];
651 	sum11 += v[10]*xb[k];
652 	sum12 += v[11]*xb[k];
653 	sum13 += v[12]*xb[k];
654         sum14 += v[13]*xb[k];
655 	sum15 += v[14]*xb[k];
656 	v += 15;
657       }
658     }
659     if (usecprow) z = zarray + 15*ridx[i];
660     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
661     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
662 
663     if (!usecprow) z += 15;
664   }
665 
666   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
667   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
668   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
669   PetscFunctionReturn(0);
670 }
671 
672 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
673 #undef __FUNCT__
674 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
675 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
676 {
677   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
678   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
679   const PetscScalar *x,*xb;
680   PetscScalar        x1,x2,x3,x4,*zarray;
681   const MatScalar   *v;
682   PetscErrorCode    ierr;
683   const PetscInt    *ii,*ij=a->j,*idx;
684   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
685   PetscTruth        usecprow=a->compressedrow.use;
686 
687   PetscFunctionBegin;
688   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
689   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
690 
691   v   = a->a;
692   if (usecprow){
693     mbs    = a->compressedrow.nrows;
694     ii     = a->compressedrow.i;
695     ridx = a->compressedrow.rindex;
696   } else {
697     mbs = a->mbs;
698     ii  = a->i;
699     z   = zarray;
700   }
701 
702   for (i=0; i<mbs; i++) {
703     n  = ii[i+1] - ii[i];
704     idx = ij + ii[i];
705     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
706     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
707 
708     nonzerorow += (n>0);
709     for (j=0; j<n; j++) {
710       xb = x + 15*(idx[j]);
711       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
712 
713       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
714       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
715       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
716       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
717       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
718       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
719       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
720       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
721       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
722       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
723       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
724       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
725       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
726       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
727       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
728 
729       v += 60;
730 
731       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
732 
733       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
734       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
735       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
736       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
737       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
738       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
739       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
740       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
741       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
742       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
743       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
744       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
745       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
746       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
747       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
748       v += 60;
749 
750       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
751       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
752       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
753       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
754       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
755       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
756       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
757       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
758       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
759       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
760       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
761       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
762       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
763       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
764       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
765       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
766       v  += 60;
767 
768       x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
769       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
770       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
771       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
772       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
773       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
774       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
775       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
776       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
777       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
778       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
779       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
780       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
781       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
782       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
783       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
784       v += 45;
785     }
786     if (usecprow) z = zarray + 15*ridx[i];
787     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
788     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
789 
790     if (!usecprow) z += 15;
791   }
792 
793   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
794   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
795   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
796   PetscFunctionReturn(0);
797 }
798 
799 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
800 #undef __FUNCT__
801 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
802 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
803 {
804   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
805   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
806   const PetscScalar *x,*xb;
807   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
808   const MatScalar   *v;
809   PetscErrorCode    ierr;
810   const PetscInt    *ii,*ij=a->j,*idx;
811   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
812   PetscTruth        usecprow=a->compressedrow.use;
813 
814   PetscFunctionBegin;
815   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
816   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
817 
818   v   = a->a;
819   if (usecprow){
820     mbs    = a->compressedrow.nrows;
821     ii     = a->compressedrow.i;
822     ridx = a->compressedrow.rindex;
823   } else {
824     mbs = a->mbs;
825     ii  = a->i;
826     z   = zarray;
827   }
828 
829   for (i=0; i<mbs; i++) {
830     n  = ii[i+1] - ii[i];
831     idx = ij + ii[i];
832     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
833     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
834 
835     nonzerorow += (n>0);
836     for (j=0; j<n; j++) {
837       xb = x + 15*(idx[j]);
838       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
839       x8 = xb[7];
840 
841       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
842       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
843       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
844       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
845       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
846       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
847       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
848       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
849       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
850       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
851       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
852       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
853       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
854       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
855       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
856       v += 120;
857 
858       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
859 
860       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
861       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
862       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
863       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
864       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
865       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
866       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
867       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
868       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
869       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
870       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
871       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
872       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
873       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
874       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
875       v += 105;
876     }
877     if (usecprow) z = zarray + 15*ridx[i];
878     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
879     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
880 
881     if (!usecprow) z += 15;
882   }
883 
884   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
885   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
886   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
887   PetscFunctionReturn(0);
888 }
889 
890 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
891 
892 #undef __FUNCT__
893 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
894 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
895 {
896   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
897   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
898   const PetscScalar *x,*xb;
899   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
900   const MatScalar   *v;
901   PetscErrorCode    ierr;
902   const PetscInt    *ii,*ij=a->j,*idx;
903   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
904   PetscTruth        usecprow=a->compressedrow.use;
905 
906   PetscFunctionBegin;
907   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
908   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
909 
910   v   = a->a;
911   if (usecprow){
912     mbs    = a->compressedrow.nrows;
913     ii     = a->compressedrow.i;
914     ridx = a->compressedrow.rindex;
915   } else {
916     mbs = a->mbs;
917     ii  = a->i;
918     z   = zarray;
919   }
920 
921   for (i=0; i<mbs; i++) {
922     n  = ii[i+1] - ii[i];
923     idx = ij + ii[i];
924     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
925     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
926 
927     nonzerorow += (n>0);
928     for (j=0; j<n; j++) {
929       xb = x + 15*(idx[j]);
930       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
931       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
932 
933       sum1 +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
934       sum2 +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
935       sum3 +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
936       sum4 +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
937       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
938       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
939       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
940       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
941       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
942       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
943       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
944       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
945       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
946       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
947       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
948       v += 225;
949     }
950     if (usecprow) z = zarray + 15*ridx[i];
951     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
952     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
953 
954     if (!usecprow) z += 15;
955   }
956 
957   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
958   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
959   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
960   PetscFunctionReturn(0);
961 }
962 
963 
964 /*
965     This will not work with MatScalar == float because it calls the BLAS
966 */
967 #undef __FUNCT__
968 #define __FUNCT__ "MatMult_SeqBAIJ_N"
969 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
970 {
971   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
972   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
973   MatScalar      *v;
974   PetscErrorCode ierr;
975   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
976   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
977   PetscTruth     usecprow=a->compressedrow.use;
978 
979   PetscFunctionBegin;
980   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
981   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
982 
983   idx = a->j;
984   v   = a->a;
985   if (usecprow){
986     mbs  = a->compressedrow.nrows;
987     ii   = a->compressedrow.i;
988     ridx = a->compressedrow.rindex;
989   } else {
990     mbs = a->mbs;
991     ii  = a->i;
992     z   = zarray;
993   }
994 
995   if (!a->mult_work) {
996     k    = PetscMax(A->rmap->n,A->cmap->n);
997     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
998   }
999   work = a->mult_work;
1000   for (i=0; i<mbs; i++) {
1001     n     = ii[1] - ii[0]; ii++;
1002     ncols = n*bs;
1003     workt = work;
1004     nonzerorow += (n>0);
1005     for (j=0; j<n; j++) {
1006       xb = x + bs*(*idx++);
1007       for (k=0; k<bs; k++) workt[k] = xb[k];
1008       workt += bs;
1009     }
1010     if (usecprow) z = zarray + bs*ridx[i];
1011     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1012     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1013     v += n*bs2;
1014     if (!usecprow) z += bs;
1015   }
1016   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1017   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1018   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr);
1019   PetscFunctionReturn(0);
1020 }
1021 
1022 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
1023 #undef __FUNCT__
1024 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1025 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1026 {
1027   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1028   const PetscScalar  *x;
1029   PetscScalar        *y,*z,sum;
1030   const MatScalar    *v;
1031   PetscErrorCode     ierr;
1032   PetscInt           mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
1033   const PetscInt     *idx,*ii;
1034   PetscTruth         usecprow=a->compressedrow.use;
1035 
1036   PetscFunctionBegin;
1037   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
1038   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1039   if (zz != yy) {
1040     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1041   } else {
1042     z = y;
1043   }
1044 
1045   idx = a->j;
1046   v   = a->a;
1047   if (usecprow){
1048     if (zz != yy){
1049       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1050     }
1051     mbs  = a->compressedrow.nrows;
1052     ii   = a->compressedrow.i;
1053     ridx = a->compressedrow.rindex;
1054   } else {
1055     ii  = a->i;
1056   }
1057 
1058   for (i=0; i<mbs; i++) {
1059     n    = ii[1] - ii[0];
1060     ii++;
1061     if (!usecprow){
1062       nonzerorow += (n>0);
1063       sum = y[i];
1064     } else {
1065       sum = y[ridx[i]];
1066     }
1067     PetscSparseDensePlusDot(sum,x,v,idx,n);
1068     v += n;
1069     idx += n;
1070     if (usecprow){
1071       z[ridx[i]] = sum;
1072     } else {
1073       z[i] = sum;
1074     }
1075   }
1076   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
1077   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1078   if (zz != yy) {
1079     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1080   }
1081   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
1082   PetscFunctionReturn(0);
1083 }
1084 
1085 #undef __FUNCT__
1086 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1087 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1088 {
1089   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1090   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1091   PetscScalar    x1,x2,*yarray,*zarray;
1092   MatScalar      *v;
1093   PetscErrorCode ierr;
1094   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1095   PetscTruth     usecprow=a->compressedrow.use;
1096 
1097   PetscFunctionBegin;
1098   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1099   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1100   if (zz != yy) {
1101     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1102   } else {
1103     zarray = yarray;
1104   }
1105 
1106   idx = a->j;
1107   v   = a->a;
1108   if (usecprow){
1109     if (zz != yy){
1110       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1111     }
1112     mbs  = a->compressedrow.nrows;
1113     ii   = a->compressedrow.i;
1114     ridx = a->compressedrow.rindex;
1115     if (zz != yy){
1116       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1117     }
1118   } else {
1119     ii  = a->i;
1120     y   = yarray;
1121     z   = zarray;
1122   }
1123 
1124   for (i=0; i<mbs; i++) {
1125     n  = ii[1] - ii[0]; ii++;
1126     if (usecprow){
1127       z = zarray + 2*ridx[i];
1128       y = yarray + 2*ridx[i];
1129     }
1130     sum1 = y[0]; sum2 = y[1];
1131     for (j=0; j<n; j++) {
1132       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1133       sum1 += v[0]*x1 + v[2]*x2;
1134       sum2 += v[1]*x1 + v[3]*x2;
1135       v += 4;
1136     }
1137     z[0] = sum1; z[1] = sum2;
1138     if (!usecprow){
1139       z += 2; y += 2;
1140     }
1141   }
1142   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1143   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1144   if (zz != yy) {
1145     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1146   }
1147   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1148   PetscFunctionReturn(0);
1149 }
1150 
1151 #undef __FUNCT__
1152 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1153 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1154 {
1155   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1156   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1157   MatScalar      *v;
1158   PetscErrorCode ierr;
1159   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1160   PetscTruth     usecprow=a->compressedrow.use;
1161 
1162   PetscFunctionBegin;
1163   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1164   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1165   if (zz != yy) {
1166     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1167   } else {
1168     zarray = yarray;
1169   }
1170 
1171   idx = a->j;
1172   v   = a->a;
1173   if (usecprow){
1174     if (zz != yy){
1175       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1176     }
1177     mbs  = a->compressedrow.nrows;
1178     ii   = a->compressedrow.i;
1179     ridx = a->compressedrow.rindex;
1180   } else {
1181     ii  = a->i;
1182     y   = yarray;
1183     z   = zarray;
1184   }
1185 
1186   for (i=0; i<mbs; i++) {
1187     n  = ii[1] - ii[0]; ii++;
1188     if (usecprow){
1189       z = zarray + 3*ridx[i];
1190       y = yarray + 3*ridx[i];
1191     }
1192     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1193     for (j=0; j<n; j++) {
1194       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1195       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1196       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1197       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1198       v += 9;
1199     }
1200     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1201     if (!usecprow){
1202       z += 3; y += 3;
1203     }
1204   }
1205   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1206   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1207   if (zz != yy) {
1208     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1209   }
1210   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1211   PetscFunctionReturn(0);
1212 }
1213 
1214 #undef __FUNCT__
1215 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1216 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1217 {
1218   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1219   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1220   MatScalar      *v;
1221   PetscErrorCode ierr;
1222   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1223   PetscTruth     usecprow=a->compressedrow.use;
1224 
1225   PetscFunctionBegin;
1226   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1227   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1228   if (zz != yy) {
1229     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1230   } else {
1231     zarray = yarray;
1232   }
1233 
1234   idx   = a->j;
1235   v     = a->a;
1236   if (usecprow){
1237     if (zz != yy){
1238       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1239     }
1240     mbs  = a->compressedrow.nrows;
1241     ii   = a->compressedrow.i;
1242     ridx = a->compressedrow.rindex;
1243   } else {
1244     ii  = a->i;
1245     y   = yarray;
1246     z   = zarray;
1247   }
1248 
1249   for (i=0; i<mbs; i++) {
1250     n  = ii[1] - ii[0]; ii++;
1251     if (usecprow){
1252       z = zarray + 4*ridx[i];
1253       y = yarray + 4*ridx[i];
1254     }
1255     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1256     for (j=0; j<n; j++) {
1257       xb = x + 4*(*idx++);
1258       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1259       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1260       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1261       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1262       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1263       v += 16;
1264     }
1265     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1266     if (!usecprow){
1267       z += 4; y += 4;
1268     }
1269   }
1270   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1271   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1272   if (zz != yy) {
1273     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1274   }
1275   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 #undef __FUNCT__
1280 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1281 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1282 {
1283   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1284   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1285   PetscScalar    *yarray,*zarray;
1286   MatScalar      *v;
1287   PetscErrorCode ierr;
1288   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1289   PetscTruth     usecprow=a->compressedrow.use;
1290 
1291   PetscFunctionBegin;
1292   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1293   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1294   if (zz != yy) {
1295     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1296   } else {
1297     zarray = yarray;
1298   }
1299 
1300   idx = a->j;
1301   v   = a->a;
1302   if (usecprow){
1303     if (zz != yy){
1304       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1305     }
1306     mbs  = a->compressedrow.nrows;
1307     ii   = a->compressedrow.i;
1308     ridx = a->compressedrow.rindex;
1309   } else {
1310     ii  = a->i;
1311     y   = yarray;
1312     z   = zarray;
1313   }
1314 
1315   for (i=0; i<mbs; i++) {
1316     n  = ii[1] - ii[0]; ii++;
1317     if (usecprow){
1318       z = zarray + 5*ridx[i];
1319       y = yarray + 5*ridx[i];
1320     }
1321     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1322     for (j=0; j<n; j++) {
1323       xb = x + 5*(*idx++);
1324       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1325       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1326       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1327       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1328       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1329       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1330       v += 25;
1331     }
1332     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1333     if (!usecprow){
1334       z += 5; y += 5;
1335     }
1336   }
1337   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1338   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1339   if (zz != yy) {
1340     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1341   }
1342   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1343   PetscFunctionReturn(0);
1344 }
1345 #undef __FUNCT__
1346 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1347 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1348 {
1349   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1350   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1351   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1352   MatScalar      *v;
1353   PetscErrorCode ierr;
1354   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1355   PetscTruth     usecprow=a->compressedrow.use;
1356 
1357   PetscFunctionBegin;
1358   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1359   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1360   if (zz != yy) {
1361     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1362   } else {
1363     zarray = yarray;
1364   }
1365 
1366   idx = a->j;
1367   v   = a->a;
1368   if (usecprow){
1369     if (zz != yy){
1370       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1371     }
1372     mbs  = a->compressedrow.nrows;
1373     ii   = a->compressedrow.i;
1374     ridx = a->compressedrow.rindex;
1375   } else {
1376     ii  = a->i;
1377     y   = yarray;
1378     z   = zarray;
1379   }
1380 
1381   for (i=0; i<mbs; i++) {
1382     n  = ii[1] - ii[0]; ii++;
1383     if (usecprow){
1384       z = zarray + 6*ridx[i];
1385       y = yarray + 6*ridx[i];
1386     }
1387     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1388     for (j=0; j<n; j++) {
1389       xb = x + 6*(*idx++);
1390       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1391       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1392       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1393       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1394       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1395       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1396       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1397       v += 36;
1398     }
1399     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1400     if (!usecprow){
1401       z += 6; y += 6;
1402     }
1403   }
1404   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1405   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1406   if (zz != yy) {
1407     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1408   }
1409   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1410   PetscFunctionReturn(0);
1411 }
1412 
1413 #undef __FUNCT__
1414 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1415 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1416 {
1417   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1418   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1419   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1420   MatScalar      *v;
1421   PetscErrorCode ierr;
1422   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1423   PetscTruth     usecprow=a->compressedrow.use;
1424 
1425   PetscFunctionBegin;
1426   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1427   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1428   if (zz != yy) {
1429     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1430   } else {
1431     zarray = yarray;
1432   }
1433 
1434   idx = a->j;
1435   v   = a->a;
1436   if (usecprow){
1437     if (zz != yy){
1438       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1439     }
1440     mbs  = a->compressedrow.nrows;
1441     ii   = a->compressedrow.i;
1442     ridx = a->compressedrow.rindex;
1443   } else {
1444     ii  = a->i;
1445     y   = yarray;
1446     z   = zarray;
1447   }
1448 
1449   for (i=0; i<mbs; i++) {
1450     n  = ii[1] - ii[0]; ii++;
1451     if (usecprow){
1452       z = zarray + 7*ridx[i];
1453       y = yarray + 7*ridx[i];
1454     }
1455     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1456     for (j=0; j<n; j++) {
1457       xb = x + 7*(*idx++);
1458       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1459       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1460       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1461       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1462       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1463       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1464       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1465       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1466       v += 49;
1467     }
1468     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1469     if (!usecprow){
1470       z += 7; y += 7;
1471     }
1472   }
1473   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1474   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1475   if (zz != yy) {
1476     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1477   }
1478   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1479   PetscFunctionReturn(0);
1480 }
1481 
1482 #undef __FUNCT__
1483 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1484 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1485 {
1486   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1487   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1488   MatScalar      *v;
1489   PetscErrorCode ierr;
1490   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1491   PetscInt       ncols,k,*ridx=PETSC_NULL;
1492   PetscTruth     usecprow=a->compressedrow.use;
1493 
1494   PetscFunctionBegin;
1495   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1496   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1497   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1498 
1499   idx = a->j;
1500   v   = a->a;
1501   if (usecprow){
1502     mbs    = a->compressedrow.nrows;
1503     ii     = a->compressedrow.i;
1504     ridx = a->compressedrow.rindex;
1505   } else {
1506     mbs = a->mbs;
1507     ii  = a->i;
1508     z   = zarray;
1509   }
1510 
1511   if (!a->mult_work) {
1512     k    = PetscMax(A->rmap->n,A->cmap->n);
1513     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1514   }
1515   work = a->mult_work;
1516   for (i=0; i<mbs; i++) {
1517     n     = ii[1] - ii[0]; ii++;
1518     ncols = n*bs;
1519     workt = work;
1520     for (j=0; j<n; j++) {
1521       xb = x + bs*(*idx++);
1522       for (k=0; k<bs; k++) workt[k] = xb[k];
1523       workt += bs;
1524     }
1525     if (usecprow) z = zarray + bs*ridx[i];
1526     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1527     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1528     v += n*bs2;
1529     if (!usecprow){
1530       z += bs;
1531     }
1532   }
1533   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1534   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1535   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1536   PetscFunctionReturn(0);
1537 }
1538 
1539 #undef __FUNCT__
1540 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1541 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1542 {
1543   PetscScalar    zero = 0.0;
1544   PetscErrorCode ierr;
1545 
1546   PetscFunctionBegin;
1547   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1548   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 #undef __FUNCT__
1553 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1554 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1555 {
1556   PetscScalar    zero = 0.0;
1557   PetscErrorCode ierr;
1558 
1559   PetscFunctionBegin;
1560   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1561   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 #undef __FUNCT__
1566 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1567 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1568 
1569 {
1570   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1571   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1572   MatScalar         *v;
1573   PetscErrorCode    ierr;
1574   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1575   Mat_CompressedRow cprow = a->compressedrow;
1576   PetscTruth        usecprow=cprow.use;
1577 
1578   PetscFunctionBegin;
1579   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1580   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1581   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1582 
1583   idx = a->j;
1584   v   = a->a;
1585   if (usecprow){
1586     mbs  = cprow.nrows;
1587     ii   = cprow.i;
1588     ridx = cprow.rindex;
1589   } else {
1590     mbs=a->mbs;
1591     ii = a->i;
1592     xb = x;
1593   }
1594 
1595   switch (bs) {
1596   case 1:
1597     for (i=0; i<mbs; i++) {
1598       if (usecprow) xb = x + ridx[i];
1599       x1 = xb[0];
1600       ib = idx + ii[0];
1601       n  = ii[1] - ii[0]; ii++;
1602       for (j=0; j<n; j++) {
1603         rval    = ib[j];
1604         z[rval] += PetscConj(*v) * x1;
1605         v++;
1606       }
1607       if (!usecprow) xb++;
1608     }
1609     break;
1610   case 2:
1611     for (i=0; i<mbs; i++) {
1612       if (usecprow) xb = x + 2*ridx[i];
1613       x1 = xb[0]; x2 = xb[1];
1614       ib = idx + ii[0];
1615       n  = ii[1] - ii[0]; ii++;
1616       for (j=0; j<n; j++) {
1617         rval      = ib[j]*2;
1618         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1619         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1620         v  += 4;
1621       }
1622       if (!usecprow) xb += 2;
1623     }
1624     break;
1625   case 3:
1626     for (i=0; i<mbs; i++) {
1627       if (usecprow) xb = x + 3*ridx[i];
1628       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1629       ib = idx + ii[0];
1630       n  = ii[1] - ii[0]; ii++;
1631       for (j=0; j<n; j++) {
1632         rval      = ib[j]*3;
1633         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1634         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1635         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1636         v  += 9;
1637       }
1638       if (!usecprow) xb += 3;
1639     }
1640     break;
1641   case 4:
1642     for (i=0; i<mbs; i++) {
1643       if (usecprow) xb = x + 4*ridx[i];
1644       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1645       ib = idx + ii[0];
1646       n  = ii[1] - ii[0]; ii++;
1647       for (j=0; j<n; j++) {
1648         rval      = ib[j]*4;
1649         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1650         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1651         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1652         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1653         v  += 16;
1654       }
1655       if (!usecprow) xb += 4;
1656     }
1657     break;
1658   case 5:
1659     for (i=0; i<mbs; i++) {
1660       if (usecprow) xb = x + 5*ridx[i];
1661       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1662       x4 = xb[3]; x5 = xb[4];
1663       ib = idx + ii[0];
1664       n  = ii[1] - ii[0]; ii++;
1665       for (j=0; j<n; j++) {
1666         rval      = ib[j]*5;
1667         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1668         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1669         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1670         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1671         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1672         v  += 25;
1673       }
1674       if (!usecprow) xb += 5;
1675     }
1676     break;
1677   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1678       PetscInt     ncols,k;
1679       PetscScalar  *work,*workt,*xtmp;
1680 
1681       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1682       if (!a->mult_work) {
1683         k = PetscMax(A->rmap->n,A->cmap->n);
1684         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1685       }
1686       work = a->mult_work;
1687       xtmp = x;
1688       for (i=0; i<mbs; i++) {
1689         n     = ii[1] - ii[0]; ii++;
1690         ncols = n*bs;
1691         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1692         if (usecprow) {
1693           xtmp = x + bs*ridx[i];
1694         }
1695         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1696         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1697         v += n*bs2;
1698         if (!usecprow) xtmp += bs;
1699         workt = work;
1700         for (j=0; j<n; j++) {
1701           zb = z + bs*(*idx++);
1702           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1703           workt += bs;
1704         }
1705       }
1706     }
1707   }
1708   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1709   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1710   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1711   PetscFunctionReturn(0);
1712 }
1713 
1714 #undef __FUNCT__
1715 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1716 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1717 {
1718   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1719   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1720   MatScalar         *v;
1721   PetscErrorCode    ierr;
1722   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1723   Mat_CompressedRow cprow = a->compressedrow;
1724   PetscTruth        usecprow=cprow.use;
1725 
1726   PetscFunctionBegin;
1727   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1728   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1729   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1730 
1731   idx = a->j;
1732   v   = a->a;
1733   if (usecprow){
1734     mbs  = cprow.nrows;
1735     ii   = cprow.i;
1736     ridx = cprow.rindex;
1737   } else {
1738     mbs=a->mbs;
1739     ii = a->i;
1740     xb = x;
1741   }
1742 
1743   switch (bs) {
1744   case 1:
1745     for (i=0; i<mbs; i++) {
1746       if (usecprow) xb = x + ridx[i];
1747       x1 = xb[0];
1748       ib = idx + ii[0];
1749       n  = ii[1] - ii[0]; ii++;
1750       for (j=0; j<n; j++) {
1751         rval    = ib[j];
1752         z[rval] += *v * x1;
1753         v++;
1754       }
1755       if (!usecprow) xb++;
1756     }
1757     break;
1758   case 2:
1759     for (i=0; i<mbs; i++) {
1760       if (usecprow) xb = x + 2*ridx[i];
1761       x1 = xb[0]; x2 = xb[1];
1762       ib = idx + ii[0];
1763       n  = ii[1] - ii[0]; ii++;
1764       for (j=0; j<n; j++) {
1765         rval      = ib[j]*2;
1766         z[rval++] += v[0]*x1 + v[1]*x2;
1767         z[rval++] += v[2]*x1 + v[3]*x2;
1768         v  += 4;
1769       }
1770       if (!usecprow) xb += 2;
1771     }
1772     break;
1773   case 3:
1774     for (i=0; i<mbs; i++) {
1775       if (usecprow) xb = x + 3*ridx[i];
1776       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1777       ib = idx + ii[0];
1778       n  = ii[1] - ii[0]; ii++;
1779       for (j=0; j<n; j++) {
1780         rval      = ib[j]*3;
1781         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1782         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1783         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1784         v  += 9;
1785       }
1786       if (!usecprow) xb += 3;
1787     }
1788     break;
1789   case 4:
1790     for (i=0; i<mbs; i++) {
1791       if (usecprow) xb = x + 4*ridx[i];
1792       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1793       ib = idx + ii[0];
1794       n  = ii[1] - ii[0]; ii++;
1795       for (j=0; j<n; j++) {
1796         rval      = ib[j]*4;
1797         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1798         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1799         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1800         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1801         v  += 16;
1802       }
1803       if (!usecprow) xb += 4;
1804     }
1805     break;
1806   case 5:
1807     for (i=0; i<mbs; i++) {
1808       if (usecprow) xb = x + 5*ridx[i];
1809       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1810       x4 = xb[3]; x5 = xb[4];
1811       ib = idx + ii[0];
1812       n  = ii[1] - ii[0]; ii++;
1813       for (j=0; j<n; j++) {
1814         rval      = ib[j]*5;
1815         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1816         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1817         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1818         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1819         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1820         v  += 25;
1821       }
1822       if (!usecprow) xb += 5;
1823     }
1824     break;
1825   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1826       PetscInt     ncols,k;
1827       PetscScalar  *work,*workt,*xtmp;
1828 
1829       if (!a->mult_work) {
1830         k = PetscMax(A->rmap->n,A->cmap->n);
1831         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1832       }
1833       work = a->mult_work;
1834       xtmp = x;
1835       for (i=0; i<mbs; i++) {
1836         n     = ii[1] - ii[0]; ii++;
1837         ncols = n*bs;
1838         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1839         if (usecprow) {
1840           xtmp = x + bs*ridx[i];
1841         }
1842         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1843         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1844         v += n*bs2;
1845         if (!usecprow) xtmp += bs;
1846         workt = work;
1847         for (j=0; j<n; j++) {
1848           zb = z + bs*(*idx++);
1849           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1850           workt += bs;
1851         }
1852       }
1853     }
1854   }
1855   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1856   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1857   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1858   PetscFunctionReturn(0);
1859 }
1860 
1861 #undef __FUNCT__
1862 #define __FUNCT__ "MatScale_SeqBAIJ"
1863 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1864 {
1865   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)inA->data;
1866   PetscInt       totalnz = a->bs2*a->nz;
1867   PetscScalar    oalpha = alpha;
1868   PetscErrorCode ierr;
1869   PetscBLASInt   one = 1,tnz = PetscBLASIntCast(totalnz);
1870 
1871   PetscFunctionBegin;
1872   BLASscal_(&tnz,&oalpha,a->a,&one);
1873   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1874   PetscFunctionReturn(0);
1875 }
1876 
1877 #undef __FUNCT__
1878 #define __FUNCT__ "MatNorm_SeqBAIJ"
1879 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1880 {
1881   PetscErrorCode ierr;
1882   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1883   MatScalar      *v = a->a;
1884   PetscReal      sum = 0.0;
1885   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1886 
1887   PetscFunctionBegin;
1888   if (type == NORM_FROBENIUS) {
1889     for (i=0; i< bs2*nz; i++) {
1890 #if defined(PETSC_USE_COMPLEX)
1891       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1892 #else
1893       sum += (*v)*(*v); v++;
1894 #endif
1895     }
1896     *norm = sqrt(sum);
1897   } else if (type == NORM_1) { /* maximum column sum */
1898     PetscReal *tmp;
1899     PetscInt  *bcol = a->j;
1900     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
1901     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
1902     for (i=0; i<nz; i++){
1903       for (j=0; j<bs; j++){
1904         k1 = bs*(*bcol) + j; /* column index */
1905         for (k=0; k<bs; k++){
1906           tmp[k1] += PetscAbsScalar(*v); v++;
1907         }
1908       }
1909       bcol++;
1910     }
1911     *norm = 0.0;
1912     for (j=0; j<A->cmap->n; j++) {
1913       if (tmp[j] > *norm) *norm = tmp[j];
1914     }
1915     ierr = PetscFree(tmp);CHKERRQ(ierr);
1916   } else if (type == NORM_INFINITY) { /* maximum row sum */
1917     *norm = 0.0;
1918     for (k=0; k<bs; k++) {
1919       for (j=0; j<a->mbs; j++) {
1920         v = a->a + bs2*a->i[j] + k;
1921         sum = 0.0;
1922         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1923           for (k1=0; k1<bs; k1++){
1924             sum += PetscAbsScalar(*v);
1925             v   += bs;
1926           }
1927         }
1928         if (sum > *norm) *norm = sum;
1929       }
1930     }
1931   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1932   PetscFunctionReturn(0);
1933 }
1934 
1935 
1936 #undef __FUNCT__
1937 #define __FUNCT__ "MatEqual_SeqBAIJ"
1938 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1939 {
1940   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1941   PetscErrorCode ierr;
1942 
1943   PetscFunctionBegin;
1944   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1945   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1946     *flg = PETSC_FALSE;
1947     PetscFunctionReturn(0);
1948   }
1949 
1950   /* if the a->i are the same */
1951   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1952   if (!*flg) {
1953     PetscFunctionReturn(0);
1954   }
1955 
1956   /* if a->j are the same */
1957   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1958   if (!*flg) {
1959     PetscFunctionReturn(0);
1960   }
1961   /* if a->a are the same */
1962   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1963   PetscFunctionReturn(0);
1964 
1965 }
1966 
1967 #undef __FUNCT__
1968 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1969 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1970 {
1971   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1972   PetscErrorCode ierr;
1973   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1974   PetscScalar    *x,zero = 0.0;
1975   MatScalar      *aa,*aa_j;
1976 
1977   PetscFunctionBegin;
1978   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1979   bs   = A->rmap->bs;
1980   aa   = a->a;
1981   ai   = a->i;
1982   aj   = a->j;
1983   ambs = a->mbs;
1984   bs2  = a->bs2;
1985 
1986   ierr = VecSet(v,zero);CHKERRQ(ierr);
1987   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1988   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1989   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1990   for (i=0; i<ambs; i++) {
1991     for (j=ai[i]; j<ai[i+1]; j++) {
1992       if (aj[j] == i) {
1993         row  = i*bs;
1994         aa_j = aa+j*bs2;
1995         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1996         break;
1997       }
1998     }
1999   }
2000   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2001   PetscFunctionReturn(0);
2002 }
2003 
2004 #undef __FUNCT__
2005 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2006 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2007 {
2008   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2009   PetscScalar    *l,*r,x,*li,*ri;
2010   MatScalar      *aa,*v;
2011   PetscErrorCode ierr;
2012   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
2013 
2014   PetscFunctionBegin;
2015   ai  = a->i;
2016   aj  = a->j;
2017   aa  = a->a;
2018   m   = A->rmap->n;
2019   n   = A->cmap->n;
2020   bs  = A->rmap->bs;
2021   mbs = a->mbs;
2022   bs2 = a->bs2;
2023   if (ll) {
2024     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
2025     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2026     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2027     for (i=0; i<mbs; i++) { /* for each block row */
2028       M  = ai[i+1] - ai[i];
2029       li = l + i*bs;
2030       v  = aa + bs2*ai[i];
2031       for (j=0; j<M; j++) { /* for each block */
2032         for (k=0; k<bs2; k++) {
2033           (*v++) *= li[k%bs];
2034         }
2035       }
2036     }
2037     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
2038     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2039   }
2040 
2041   if (rr) {
2042     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
2043     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2044     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2045     for (i=0; i<mbs; i++) { /* for each block row */
2046       M  = ai[i+1] - ai[i];
2047       v  = aa + bs2*ai[i];
2048       for (j=0; j<M; j++) { /* for each block */
2049         ri = r + bs*aj[ai[i]+j];
2050         for (k=0; k<bs; k++) {
2051           x = ri[k];
2052           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
2053         }
2054       }
2055     }
2056     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
2057     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2058   }
2059   PetscFunctionReturn(0);
2060 }
2061 
2062 
2063 #undef __FUNCT__
2064 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2065 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2066 {
2067   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2068 
2069   PetscFunctionBegin;
2070   info->block_size     = a->bs2;
2071   info->nz_allocated   = a->maxnz;
2072   info->nz_used        = a->bs2*a->nz;
2073   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
2074   info->assemblies   = A->num_ass;
2075   info->mallocs      = a->reallocs;
2076   info->memory       = ((PetscObject)A)->mem;
2077   if (A->factortype) {
2078     info->fill_ratio_given  = A->info.fill_ratio_given;
2079     info->fill_ratio_needed = A->info.fill_ratio_needed;
2080     info->factor_mallocs    = A->info.factor_mallocs;
2081   } else {
2082     info->fill_ratio_given  = 0;
2083     info->fill_ratio_needed = 0;
2084     info->factor_mallocs    = 0;
2085   }
2086   PetscFunctionReturn(0);
2087 }
2088 
2089 
2090 #undef __FUNCT__
2091 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2092 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2093 {
2094   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2095   PetscErrorCode ierr;
2096 
2097   PetscFunctionBegin;
2098   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2099   PetscFunctionReturn(0);
2100 }
2101