xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 6abefa4f446b2ea91915da4e25ea0defa83db48e)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 #include "petscblaslapack.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
15   const PetscInt *idx;
16   PetscInt       start,end,*ai,*aj,bs,*nidx2;
17   PetscBT        table;
18 
19   PetscFunctionBegin;
20   m     = a->mbs;
21   ai    = a->i;
22   aj    = a->j;
23   bs    = A->rmap->bs;
24 
25   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26 
27   ierr = PetscBTCreate(m,table);CHKERRQ(ierr);
28   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
29   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
30 
31   for (i=0; i<is_max; i++) {
32     /* Initialise the two local arrays */
33     isz  = 0;
34     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
38     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
39 
40     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
41     for (j=0; j<n ; ++j){
42       ival = idx[j]/bs; /* convert the indices into block indices */
43       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
45     }
46     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
47     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
48 
49     k = 0;
50     for (j=0; j<ov; j++){ /* for each overlap*/
51       n = isz;
52       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
53         row   = nidx[k];
54         start = ai[row];
55         end   = ai[row+1];
56         for (l = start; l<end ; l++){
57           val = aj[l];
58           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
59         }
60       }
61     }
62     /* expand the Index Set */
63     for (j=0; j<isz; j++) {
64       for (k=0; k<bs; k++)
65         nidx2[j*bs+k] = nidx[j]*bs+k;
66     }
67     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
68   }
69   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
70   ierr = PetscFree(nidx);CHKERRQ(ierr);
71   ierr = PetscFree(nidx2);CHKERRQ(ierr);
72   PetscFunctionReturn(0);
73 }
74 
75 #undef __FUNCT__
76 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
77 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
78 {
79   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
80   PetscErrorCode ierr;
81   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
82   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
83   const PetscInt *irow,*icol;
84   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
85   PetscInt       *aj = a->j,*ai = a->i;
86   MatScalar      *mat_a;
87   Mat            C;
88   PetscTruth     flag,sorted;
89 
90   PetscFunctionBegin;
91   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
92   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
93 
94   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
95   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
96   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
97   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
98 
99   ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
100   ssmap = smap;
101   ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
102   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
103   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104   /* determine lens of each row */
105   for (i=0; i<nrows; i++) {
106     kstart  = ai[irow[i]];
107     kend    = kstart + a->ilen[irow[i]];
108     lens[i] = 0;
109       for (k=kstart; k<kend; k++) {
110         if (ssmap[aj[k]]) {
111           lens[i]++;
112         }
113       }
114     }
115   /* Create and fill new matrix */
116   if (scall == MAT_REUSE_MATRIX) {
117     c = (Mat_SeqBAIJ *)((*B)->data);
118 
119     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
120     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
121     if (!flag) {
122       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
123     }
124     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
125     C = *B;
126   } else {
127     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
128     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
129     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
130     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
131   }
132   c = (Mat_SeqBAIJ *)(C->data);
133   for (i=0; i<nrows; i++) {
134     row    = irow[i];
135     kstart = ai[row];
136     kend   = kstart + a->ilen[row];
137     mat_i  = c->i[i];
138     mat_j  = c->j + mat_i;
139     mat_a  = c->a + mat_i*bs2;
140     mat_ilen = c->ilen + i;
141     for (k=kstart; k<kend; k++) {
142       if ((tcol=ssmap[a->j[k]])) {
143         *mat_j++ = tcol - 1;
144         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
145         mat_a   += bs2;
146         (*mat_ilen)++;
147       }
148     }
149   }
150 
151   /* Free work space */
152   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
153   ierr = PetscFree(smap);CHKERRQ(ierr);
154   ierr = PetscFree(lens);CHKERRQ(ierr);
155   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
156   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
157 
158   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
159   *B = C;
160   PetscFunctionReturn(0);
161 }
162 
163 #undef __FUNCT__
164 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
165 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
166 {
167   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
168   IS             is1,is2;
169   PetscErrorCode ierr;
170   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
171   const PetscInt *irow,*icol;
172 
173   PetscFunctionBegin;
174   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
175   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
176   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
177   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
178 
179   /* Verify if the indices corespond to each element in a block
180    and form the IS with compressed IS */
181   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
182   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
183   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
184   count = 0;
185   for (i=0; i<a->mbs; i++) {
186     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
187     if (vary[i]==bs) iary[count++] = i;
188   }
189   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
190 
191   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
192   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
193   count = 0;
194   for (i=0; i<a->mbs; i++) {
195     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
196     if (vary[i]==bs) iary[count++] = i;
197   }
198   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr);
199   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
200   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
201   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
202 
203   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
204   ISDestroy(is1);
205   ISDestroy(is2);
206   PetscFunctionReturn(0);
207 }
208 
209 #undef __FUNCT__
210 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
211 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
212 {
213   PetscErrorCode ierr;
214   PetscInt       i;
215 
216   PetscFunctionBegin;
217   if (scall == MAT_INITIAL_MATRIX) {
218     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
219   }
220 
221   for (i=0; i<n; i++) {
222     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
223   }
224   PetscFunctionReturn(0);
225 }
226 
227 
228 /* -------------------------------------------------------*/
229 /* Should check that shapes of vectors and matrices match */
230 /* -------------------------------------------------------*/
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatMult_SeqBAIJ_1"
234 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
235 {
236   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
237   PetscScalar       *z,sum;
238   const PetscScalar *x;
239   const MatScalar   *v;
240   PetscErrorCode    ierr;
241   PetscInt          mbs,i,n,nonzerorow=0;
242   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
243   PetscTruth        usecprow=a->compressedrow.use;
244 
245   PetscFunctionBegin;
246   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
247   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
248 
249   if (usecprow){
250     mbs  = a->compressedrow.nrows;
251     ii   = a->compressedrow.i;
252     ridx = a->compressedrow.rindex;
253     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
254   } else {
255     mbs = a->mbs;
256     ii  = a->i;
257   }
258 
259   for (i=0; i<mbs; i++) {
260     n    = ii[1] - ii[0];
261     v    = a->a + ii[0];
262     idx  = a->j + ii[0];
263     ii++;
264     sum  = 0.0;
265     PetscSparseDensePlusDot(sum,x,v,idx,n);
266     if (usecprow){
267       z[ridx[i]] = sum;
268     } else {
269       nonzerorow += (n>0);
270       z[i] = sum;
271     }
272   }
273   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
274   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
275   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
276   PetscFunctionReturn(0);
277 }
278 
279 #undef __FUNCT__
280 #define __FUNCT__ "MatMult_SeqBAIJ_2"
281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
282 {
283   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
284   PetscScalar       *z = 0,sum1,sum2,*zarray;
285   const PetscScalar *x,*xb;
286   PetscScalar       x1,x2;
287   const MatScalar   *v;
288   PetscErrorCode    ierr;
289   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
290   PetscTruth        usecprow=a->compressedrow.use;
291 
292   PetscFunctionBegin;
293   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
294   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
295 
296   idx = a->j;
297   v   = a->a;
298   if (usecprow){
299     mbs  = a->compressedrow.nrows;
300     ii   = a->compressedrow.i;
301     ridx = a->compressedrow.rindex;
302   } else {
303     mbs = a->mbs;
304     ii  = a->i;
305     z   = zarray;
306   }
307 
308   for (i=0; i<mbs; i++) {
309     n  = ii[1] - ii[0]; ii++;
310     sum1 = 0.0; sum2 = 0.0;
311     nonzerorow += (n>0);
312     for (j=0; j<n; j++) {
313       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
314       sum1 += v[0]*x1 + v[2]*x2;
315       sum2 += v[1]*x1 + v[3]*x2;
316       v += 4;
317     }
318     if (usecprow) z = zarray + 2*ridx[i];
319     z[0] = sum1; z[1] = sum2;
320     if (!usecprow) z += 2;
321   }
322   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
323   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
324   ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr);
325   PetscFunctionReturn(0);
326 }
327 
328 #undef __FUNCT__
329 #define __FUNCT__ "MatMult_SeqBAIJ_3"
330 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
331 {
332   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
333   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
334   const PetscScalar *x,*xb;
335   const MatScalar   *v;
336   PetscErrorCode    ierr;
337   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
338   PetscTruth        usecprow=a->compressedrow.use;
339 
340 
341 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
342 #pragma disjoint(*v,*z,*xb)
343 #endif
344 
345   PetscFunctionBegin;
346   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
347   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
348 
349   idx = a->j;
350   v   = a->a;
351   if (usecprow){
352     mbs  = a->compressedrow.nrows;
353     ii   = a->compressedrow.i;
354     ridx = a->compressedrow.rindex;
355   } else {
356     mbs = a->mbs;
357     ii  = a->i;
358     z   = zarray;
359   }
360 
361   for (i=0; i<mbs; i++) {
362     n  = ii[1] - ii[0]; ii++;
363     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
364     nonzerorow += (n>0);
365     for (j=0; j<n; j++) {
366       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
367       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
368       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
369       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
370       v += 9;
371     }
372     if (usecprow) z = zarray + 3*ridx[i];
373     z[0] = sum1; z[1] = sum2; z[2] = sum3;
374     if (!usecprow) z += 3;
375   }
376   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
377   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
378   ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr);
379   PetscFunctionReturn(0);
380 }
381 
382 #undef __FUNCT__
383 #define __FUNCT__ "MatMult_SeqBAIJ_4"
384 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
385 {
386   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
387   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
388   const PetscScalar *x,*xb;
389   const MatScalar   *v;
390   PetscErrorCode    ierr;
391   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
392   PetscTruth        usecprow=a->compressedrow.use;
393 
394   PetscFunctionBegin;
395   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
396   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
397 
398   idx = a->j;
399   v   = a->a;
400   if (usecprow){
401     mbs  = a->compressedrow.nrows;
402     ii   = a->compressedrow.i;
403     ridx = a->compressedrow.rindex;
404   } else {
405     mbs = a->mbs;
406     ii  = a->i;
407     z   = zarray;
408   }
409 
410   for (i=0; i<mbs; i++) {
411     n  = ii[1] - ii[0]; ii++;
412     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
413     nonzerorow += (n>0);
414     for (j=0; j<n; j++) {
415       xb = x + 4*(*idx++);
416       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
417       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
418       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
419       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
420       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
421       v += 16;
422     }
423     if (usecprow) z = zarray + 4*ridx[i];
424     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
425     if (!usecprow) z += 4;
426   }
427   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
428   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
429   ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr);
430   PetscFunctionReturn(0);
431 }
432 
433 #undef __FUNCT__
434 #define __FUNCT__ "MatMult_SeqBAIJ_5"
435 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
436 {
437   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
438   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
439   const PetscScalar *xb,*x;
440   const MatScalar   *v;
441   PetscErrorCode    ierr;
442   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
443   PetscInt          mbs,i,j,n,nonzerorow=0;
444   PetscTruth        usecprow=a->compressedrow.use;
445 
446   PetscFunctionBegin;
447   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
448   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
449 
450   idx = a->j;
451   v   = a->a;
452   if (usecprow){
453     mbs  = a->compressedrow.nrows;
454     ii   = a->compressedrow.i;
455     ridx = a->compressedrow.rindex;
456   } else {
457     mbs = a->mbs;
458     ii  = a->i;
459     z   = zarray;
460   }
461 
462   for (i=0; i<mbs; i++) {
463     n  = ii[1] - ii[0]; ii++;
464     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
465     nonzerorow += (n>0);
466     for (j=0; j<n; j++) {
467       xb = x + 5*(*idx++);
468       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
469       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
470       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
471       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
472       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
473       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
474       v += 25;
475     }
476     if (usecprow) z = zarray + 5*ridx[i];
477     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
478     if (!usecprow) z += 5;
479   }
480   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
481   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
482   ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr);
483   PetscFunctionReturn(0);
484 }
485 
486 
487 #undef __FUNCT__
488 #define __FUNCT__ "MatMult_SeqBAIJ_6"
489 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
490 {
491   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
492   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
493   const PetscScalar *x,*xb;
494   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
495   const MatScalar   *v;
496   PetscErrorCode    ierr;
497   PetscInt          mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
498   PetscTruth        usecprow=a->compressedrow.use;
499 
500   PetscFunctionBegin;
501   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
502   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
503 
504   idx = a->j;
505   v   = a->a;
506   if (usecprow){
507     mbs  = a->compressedrow.nrows;
508     ii   = a->compressedrow.i;
509     ridx = a->compressedrow.rindex;
510   } else {
511     mbs = a->mbs;
512     ii  = a->i;
513     z   = zarray;
514   }
515 
516   for (i=0; i<mbs; i++) {
517     n  = ii[1] - ii[0]; ii++;
518     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
519     nonzerorow += (n>0);
520     for (j=0; j<n; j++) {
521       xb = x + 6*(*idx++);
522       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
523       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
524       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
525       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
526       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
527       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
528       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
529       v += 36;
530     }
531     if (usecprow) z = zarray + 6*ridx[i];
532     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
533     if (!usecprow) z += 6;
534   }
535 
536   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
537   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
538   ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr);
539   PetscFunctionReturn(0);
540 }
541 
542 #undef __FUNCT__
543 #define __FUNCT__ "MatMult_SeqBAIJ_7"
544 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
545 {
546   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
547   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
548   const PetscScalar *x,*xb;
549   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
550   const MatScalar   *v;
551   PetscErrorCode    ierr;
552   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
553   PetscTruth        usecprow=a->compressedrow.use;
554 
555   PetscFunctionBegin;
556   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
557   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
558 
559   idx = a->j;
560   v   = a->a;
561   if (usecprow){
562     mbs    = a->compressedrow.nrows;
563     ii     = a->compressedrow.i;
564     ridx = a->compressedrow.rindex;
565   } else {
566     mbs = a->mbs;
567     ii  = a->i;
568     z   = zarray;
569   }
570 
571   for (i=0; i<mbs; i++) {
572     n  = ii[1] - ii[0]; ii++;
573     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
574     nonzerorow += (n>0);
575     for (j=0; j<n; j++) {
576       xb = x + 7*(*idx++);
577       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
578       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
579       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
580       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
581       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
582       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
583       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
584       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
585       v += 49;
586     }
587     if (usecprow) z = zarray + 7*ridx[i];
588     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
589     if (!usecprow) z += 7;
590   }
591 
592   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
593   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
594   ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr);
595   PetscFunctionReturn(0);
596 }
597 
598 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
599 /* Default MatMult for block size 15 */
600 
601 #undef __FUNCT__
602 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
603 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
604 {
605   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
606   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
607   const PetscScalar *x,*xb;
608   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
609   const MatScalar   *v;
610   PetscErrorCode    ierr;
611   const PetscInt    *ii,*ij=a->j,*idx;
612   PetscInt          mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0;
613   PetscTruth        usecprow=a->compressedrow.use;
614 
615   PetscFunctionBegin;
616   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
617   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
618 
619   v   = a->a;
620   if (usecprow){
621     mbs    = a->compressedrow.nrows;
622     ii     = a->compressedrow.i;
623     ridx = a->compressedrow.rindex;
624   } else {
625     mbs = a->mbs;
626     ii  = a->i;
627     z   = zarray;
628   }
629 
630   for (i=0; i<mbs; i++) {
631     n  = ii[i+1] - ii[i];
632     idx = ij + ii[i];
633     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
634     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
635 
636     nonzerorow += (n>0);
637     for (j=0; j<n; j++) {
638       xb = x + 15*(idx[j]);
639       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
640       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
641 
642       for(k=0;k<15;k++){
643 	sum1  += v[0]*xb[k];
644         sum2  += v[1]*xb[k];
645 	sum3  += v[2]*xb[k];
646 	sum4  += v[3]*xb[k];
647 	sum5  += v[4]*xb[k];
648         sum6  += v[5]*xb[k];
649 	sum7  += v[6]*xb[k];
650 	sum8  += v[7]*xb[k];
651         sum9  += v[8]*xb[k];
652         sum10 += v[9]*xb[k];
653 	sum11 += v[10]*xb[k];
654 	sum12 += v[11]*xb[k];
655 	sum13 += v[12]*xb[k];
656         sum14 += v[13]*xb[k];
657 	sum15 += v[14]*xb[k];
658 	v += 15;
659       }
660     }
661     if (usecprow) z = zarray + 15*ridx[i];
662     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
663     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
664 
665     if (!usecprow) z += 15;
666   }
667 
668   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
669   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
670   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
671   PetscFunctionReturn(0);
672 }
673 
674 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
675 #undef __FUNCT__
676 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
677 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
678 {
679   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
680   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
681   const PetscScalar *x,*xb;
682   PetscScalar        x1,x2,x3,x4,*zarray;
683   const MatScalar   *v;
684   PetscErrorCode    ierr;
685   const PetscInt    *ii,*ij=a->j,*idx;
686   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
687   PetscTruth        usecprow=a->compressedrow.use;
688 
689   PetscFunctionBegin;
690   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
691   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
692 
693   v   = a->a;
694   if (usecprow){
695     mbs    = a->compressedrow.nrows;
696     ii     = a->compressedrow.i;
697     ridx = a->compressedrow.rindex;
698   } else {
699     mbs = a->mbs;
700     ii  = a->i;
701     z   = zarray;
702   }
703 
704   for (i=0; i<mbs; i++) {
705     n  = ii[i+1] - ii[i];
706     idx = ij + ii[i];
707     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
708     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
709 
710     nonzerorow += (n>0);
711     for (j=0; j<n; j++) {
712       xb = x + 15*(idx[j]);
713       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
714 
715       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
716       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
717       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
718       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
719       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
720       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
721       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
722       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
723       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
724       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
725       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
726       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
727       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
728       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
729       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
730 
731       v += 60;
732 
733       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
734 
735       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
736       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
737       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
738       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
739       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
740       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
741       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
742       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
743       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
744       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
745       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
746       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
747       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
748       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
749       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
750       v += 60;
751 
752       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
753       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
754       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
755       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
756       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
757       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
758       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
759       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
760       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
761       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
762       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
763       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
764       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
765       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
766       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
767       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
768       v  += 60;
769 
770       x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
771       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
772       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
773       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
774       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
775       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
776       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
777       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
778       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
779       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
780       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
781       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
782       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
783       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
784       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
785       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
786       v += 45;
787     }
788     if (usecprow) z = zarray + 15*ridx[i];
789     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
790     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
791 
792     if (!usecprow) z += 15;
793   }
794 
795   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
796   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
797   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
798   PetscFunctionReturn(0);
799 }
800 
801 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
802 #undef __FUNCT__
803 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
804 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
805 {
806   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
807   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
808   const PetscScalar *x,*xb;
809   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
810   const MatScalar   *v;
811   PetscErrorCode    ierr;
812   const PetscInt    *ii,*ij=a->j,*idx;
813   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
814   PetscTruth        usecprow=a->compressedrow.use;
815 
816   PetscFunctionBegin;
817   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
818   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
819 
820   v   = a->a;
821   if (usecprow){
822     mbs    = a->compressedrow.nrows;
823     ii     = a->compressedrow.i;
824     ridx = a->compressedrow.rindex;
825   } else {
826     mbs = a->mbs;
827     ii  = a->i;
828     z   = zarray;
829   }
830 
831   for (i=0; i<mbs; i++) {
832     n  = ii[i+1] - ii[i];
833     idx = ij + ii[i];
834     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
835     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
836 
837     nonzerorow += (n>0);
838     for (j=0; j<n; j++) {
839       xb = x + 15*(idx[j]);
840       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
841       x8 = xb[7];
842 
843       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
844       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
845       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
846       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
847       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
848       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
849       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
850       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
851       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
852       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
853       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
854       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
855       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
856       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
857       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
858       v += 120;
859 
860       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
861 
862       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
863       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
864       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
865       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
866       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
867       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
868       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
869       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
870       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
871       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
872       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
873       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
874       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
875       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
876       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
877       v += 105;
878     }
879     if (usecprow) z = zarray + 15*ridx[i];
880     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
881     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
882 
883     if (!usecprow) z += 15;
884   }
885 
886   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
887   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
888   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
889   PetscFunctionReturn(0);
890 }
891 
892 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
893 
894 #undef __FUNCT__
895 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
896 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
897 {
898   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
899   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
900   const PetscScalar *x,*xb;
901   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
902   const MatScalar   *v;
903   PetscErrorCode    ierr;
904   const PetscInt    *ii,*ij=a->j,*idx;
905   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
906   PetscTruth        usecprow=a->compressedrow.use;
907 
908   PetscFunctionBegin;
909   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
910   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
911 
912   v   = a->a;
913   if (usecprow){
914     mbs    = a->compressedrow.nrows;
915     ii     = a->compressedrow.i;
916     ridx = a->compressedrow.rindex;
917   } else {
918     mbs = a->mbs;
919     ii  = a->i;
920     z   = zarray;
921   }
922 
923   for (i=0; i<mbs; i++) {
924     n  = ii[i+1] - ii[i];
925     idx = ij + ii[i];
926     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
927     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
928 
929     nonzerorow += (n>0);
930     for (j=0; j<n; j++) {
931       xb = x + 15*(idx[j]);
932       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
933       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
934 
935       sum1 +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
936       sum2 +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
937       sum3 +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
938       sum4 +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
939       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
940       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
941       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
942       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
943       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
944       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
945       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
946       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
947       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
948       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
949       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
950       v += 225;
951     }
952     if (usecprow) z = zarray + 15*ridx[i];
953     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
954     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
955 
956     if (!usecprow) z += 15;
957   }
958 
959   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
960   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
961   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
962   PetscFunctionReturn(0);
963 }
964 
965 
966 /*
967     This will not work with MatScalar == float because it calls the BLAS
968 */
969 #undef __FUNCT__
970 #define __FUNCT__ "MatMult_SeqBAIJ_N"
971 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
972 {
973   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
974   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
975   MatScalar      *v;
976   PetscErrorCode ierr;
977   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
978   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
979   PetscTruth     usecprow=a->compressedrow.use;
980 
981   PetscFunctionBegin;
982   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
983   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
984 
985   idx = a->j;
986   v   = a->a;
987   if (usecprow){
988     mbs  = a->compressedrow.nrows;
989     ii   = a->compressedrow.i;
990     ridx = a->compressedrow.rindex;
991   } else {
992     mbs = a->mbs;
993     ii  = a->i;
994     z   = zarray;
995   }
996 
997   if (!a->mult_work) {
998     k    = PetscMax(A->rmap->n,A->cmap->n);
999     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1000   }
1001   work = a->mult_work;
1002   for (i=0; i<mbs; i++) {
1003     n     = ii[1] - ii[0]; ii++;
1004     ncols = n*bs;
1005     workt = work;
1006     nonzerorow += (n>0);
1007     for (j=0; j<n; j++) {
1008       xb = x + bs*(*idx++);
1009       for (k=0; k<bs; k++) workt[k] = xb[k];
1010       workt += bs;
1011     }
1012     if (usecprow) z = zarray + bs*ridx[i];
1013     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1014     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1015     v += n*bs2;
1016     if (!usecprow) z += bs;
1017   }
1018   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1019   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1020   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr);
1021   PetscFunctionReturn(0);
1022 }
1023 
1024 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
1025 #undef __FUNCT__
1026 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1027 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1028 {
1029   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1030   const PetscScalar  *x;
1031   PetscScalar        *y,*z,sum;
1032   const MatScalar    *v;
1033   PetscErrorCode     ierr;
1034   PetscInt           mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
1035   const PetscInt     *idx,*ii;
1036   PetscTruth         usecprow=a->compressedrow.use;
1037 
1038   PetscFunctionBegin;
1039   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
1040   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1041   if (zz != yy) {
1042     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1043   } else {
1044     z = y;
1045   }
1046 
1047   idx = a->j;
1048   v   = a->a;
1049   if (usecprow){
1050     if (zz != yy){
1051       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1052     }
1053     mbs  = a->compressedrow.nrows;
1054     ii   = a->compressedrow.i;
1055     ridx = a->compressedrow.rindex;
1056   } else {
1057     ii  = a->i;
1058   }
1059 
1060   for (i=0; i<mbs; i++) {
1061     n    = ii[1] - ii[0];
1062     ii++;
1063     if (!usecprow){
1064       nonzerorow += (n>0);
1065       sum = y[i];
1066     } else {
1067       sum = y[ridx[i]];
1068     }
1069     PetscSparseDensePlusDot(sum,x,v,idx,n);
1070     v += n;
1071     idx += n;
1072     if (usecprow){
1073       z[ridx[i]] = sum;
1074     } else {
1075       z[i] = sum;
1076     }
1077   }
1078   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
1079   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1080   if (zz != yy) {
1081     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1082   }
1083   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
1084   PetscFunctionReturn(0);
1085 }
1086 
1087 #undef __FUNCT__
1088 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1089 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1090 {
1091   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1092   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1093   PetscScalar    x1,x2,*yarray,*zarray;
1094   MatScalar      *v;
1095   PetscErrorCode ierr;
1096   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1097   PetscTruth     usecprow=a->compressedrow.use;
1098 
1099   PetscFunctionBegin;
1100   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1101   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1102   if (zz != yy) {
1103     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1104   } else {
1105     zarray = yarray;
1106   }
1107 
1108   idx = a->j;
1109   v   = a->a;
1110   if (usecprow){
1111     if (zz != yy){
1112       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1113     }
1114     mbs  = a->compressedrow.nrows;
1115     ii   = a->compressedrow.i;
1116     ridx = a->compressedrow.rindex;
1117     if (zz != yy){
1118       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1119     }
1120   } else {
1121     ii  = a->i;
1122     y   = yarray;
1123     z   = zarray;
1124   }
1125 
1126   for (i=0; i<mbs; i++) {
1127     n  = ii[1] - ii[0]; ii++;
1128     if (usecprow){
1129       z = zarray + 2*ridx[i];
1130       y = yarray + 2*ridx[i];
1131     }
1132     sum1 = y[0]; sum2 = y[1];
1133     for (j=0; j<n; j++) {
1134       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1135       sum1 += v[0]*x1 + v[2]*x2;
1136       sum2 += v[1]*x1 + v[3]*x2;
1137       v += 4;
1138     }
1139     z[0] = sum1; z[1] = sum2;
1140     if (!usecprow){
1141       z += 2; y += 2;
1142     }
1143   }
1144   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1145   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1146   if (zz != yy) {
1147     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1148   }
1149   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1150   PetscFunctionReturn(0);
1151 }
1152 
1153 #undef __FUNCT__
1154 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1155 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1156 {
1157   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1158   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1159   MatScalar      *v;
1160   PetscErrorCode ierr;
1161   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1162   PetscTruth     usecprow=a->compressedrow.use;
1163 
1164   PetscFunctionBegin;
1165   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1166   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1167   if (zz != yy) {
1168     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1169   } else {
1170     zarray = yarray;
1171   }
1172 
1173   idx = a->j;
1174   v   = a->a;
1175   if (usecprow){
1176     if (zz != yy){
1177       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1178     }
1179     mbs  = a->compressedrow.nrows;
1180     ii   = a->compressedrow.i;
1181     ridx = a->compressedrow.rindex;
1182   } else {
1183     ii  = a->i;
1184     y   = yarray;
1185     z   = zarray;
1186   }
1187 
1188   for (i=0; i<mbs; i++) {
1189     n  = ii[1] - ii[0]; ii++;
1190     if (usecprow){
1191       z = zarray + 3*ridx[i];
1192       y = yarray + 3*ridx[i];
1193     }
1194     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1195     for (j=0; j<n; j++) {
1196       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1197       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1198       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1199       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1200       v += 9;
1201     }
1202     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1203     if (!usecprow){
1204       z += 3; y += 3;
1205     }
1206   }
1207   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1208   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1209   if (zz != yy) {
1210     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1211   }
1212   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1213   PetscFunctionReturn(0);
1214 }
1215 
1216 #undef __FUNCT__
1217 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1218 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1219 {
1220   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1221   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1222   MatScalar      *v;
1223   PetscErrorCode ierr;
1224   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1225   PetscTruth     usecprow=a->compressedrow.use;
1226 
1227   PetscFunctionBegin;
1228   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1229   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1230   if (zz != yy) {
1231     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1232   } else {
1233     zarray = yarray;
1234   }
1235 
1236   idx   = a->j;
1237   v     = a->a;
1238   if (usecprow){
1239     if (zz != yy){
1240       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1241     }
1242     mbs  = a->compressedrow.nrows;
1243     ii   = a->compressedrow.i;
1244     ridx = a->compressedrow.rindex;
1245   } else {
1246     ii  = a->i;
1247     y   = yarray;
1248     z   = zarray;
1249   }
1250 
1251   for (i=0; i<mbs; i++) {
1252     n  = ii[1] - ii[0]; ii++;
1253     if (usecprow){
1254       z = zarray + 4*ridx[i];
1255       y = yarray + 4*ridx[i];
1256     }
1257     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1258     for (j=0; j<n; j++) {
1259       xb = x + 4*(*idx++);
1260       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1261       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1262       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1263       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1264       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1265       v += 16;
1266     }
1267     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1268     if (!usecprow){
1269       z += 4; y += 4;
1270     }
1271   }
1272   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1273   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1274   if (zz != yy) {
1275     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1276   }
1277   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1278   PetscFunctionReturn(0);
1279 }
1280 
1281 #undef __FUNCT__
1282 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1283 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1284 {
1285   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1286   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1287   PetscScalar    *yarray,*zarray;
1288   MatScalar      *v;
1289   PetscErrorCode ierr;
1290   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1291   PetscTruth     usecprow=a->compressedrow.use;
1292 
1293   PetscFunctionBegin;
1294   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1295   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1296   if (zz != yy) {
1297     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1298   } else {
1299     zarray = yarray;
1300   }
1301 
1302   idx = a->j;
1303   v   = a->a;
1304   if (usecprow){
1305     if (zz != yy){
1306       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1307     }
1308     mbs  = a->compressedrow.nrows;
1309     ii   = a->compressedrow.i;
1310     ridx = a->compressedrow.rindex;
1311   } else {
1312     ii  = a->i;
1313     y   = yarray;
1314     z   = zarray;
1315   }
1316 
1317   for (i=0; i<mbs; i++) {
1318     n  = ii[1] - ii[0]; ii++;
1319     if (usecprow){
1320       z = zarray + 5*ridx[i];
1321       y = yarray + 5*ridx[i];
1322     }
1323     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1324     for (j=0; j<n; j++) {
1325       xb = x + 5*(*idx++);
1326       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1327       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1328       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1329       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1330       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1331       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1332       v += 25;
1333     }
1334     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1335     if (!usecprow){
1336       z += 5; y += 5;
1337     }
1338   }
1339   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1340   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1341   if (zz != yy) {
1342     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1343   }
1344   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1345   PetscFunctionReturn(0);
1346 }
1347 #undef __FUNCT__
1348 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1349 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1350 {
1351   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1352   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1353   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1354   MatScalar      *v;
1355   PetscErrorCode ierr;
1356   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1357   PetscTruth     usecprow=a->compressedrow.use;
1358 
1359   PetscFunctionBegin;
1360   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1361   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1362   if (zz != yy) {
1363     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1364   } else {
1365     zarray = yarray;
1366   }
1367 
1368   idx = a->j;
1369   v   = a->a;
1370   if (usecprow){
1371     if (zz != yy){
1372       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1373     }
1374     mbs  = a->compressedrow.nrows;
1375     ii   = a->compressedrow.i;
1376     ridx = a->compressedrow.rindex;
1377   } else {
1378     ii  = a->i;
1379     y   = yarray;
1380     z   = zarray;
1381   }
1382 
1383   for (i=0; i<mbs; i++) {
1384     n  = ii[1] - ii[0]; ii++;
1385     if (usecprow){
1386       z = zarray + 6*ridx[i];
1387       y = yarray + 6*ridx[i];
1388     }
1389     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1390     for (j=0; j<n; j++) {
1391       xb = x + 6*(*idx++);
1392       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1393       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1394       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1395       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1396       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1397       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1398       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1399       v += 36;
1400     }
1401     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1402     if (!usecprow){
1403       z += 6; y += 6;
1404     }
1405   }
1406   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1407   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1408   if (zz != yy) {
1409     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1410   }
1411   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1412   PetscFunctionReturn(0);
1413 }
1414 
1415 #undef __FUNCT__
1416 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1417 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1418 {
1419   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1420   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1421   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1422   MatScalar      *v;
1423   PetscErrorCode ierr;
1424   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1425   PetscTruth     usecprow=a->compressedrow.use;
1426 
1427   PetscFunctionBegin;
1428   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1429   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1430   if (zz != yy) {
1431     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1432   } else {
1433     zarray = yarray;
1434   }
1435 
1436   idx = a->j;
1437   v   = a->a;
1438   if (usecprow){
1439     if (zz != yy){
1440       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1441     }
1442     mbs  = a->compressedrow.nrows;
1443     ii   = a->compressedrow.i;
1444     ridx = a->compressedrow.rindex;
1445   } else {
1446     ii  = a->i;
1447     y   = yarray;
1448     z   = zarray;
1449   }
1450 
1451   for (i=0; i<mbs; i++) {
1452     n  = ii[1] - ii[0]; ii++;
1453     if (usecprow){
1454       z = zarray + 7*ridx[i];
1455       y = yarray + 7*ridx[i];
1456     }
1457     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1458     for (j=0; j<n; j++) {
1459       xb = x + 7*(*idx++);
1460       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1461       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1462       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1463       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1464       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1465       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1466       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1467       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1468       v += 49;
1469     }
1470     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1471     if (!usecprow){
1472       z += 7; y += 7;
1473     }
1474   }
1475   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1476   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1477   if (zz != yy) {
1478     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1479   }
1480   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1481   PetscFunctionReturn(0);
1482 }
1483 
1484 #undef __FUNCT__
1485 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1486 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1487 {
1488   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1489   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1490   MatScalar      *v;
1491   PetscErrorCode ierr;
1492   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1493   PetscInt       ncols,k,*ridx=PETSC_NULL;
1494   PetscTruth     usecprow=a->compressedrow.use;
1495 
1496   PetscFunctionBegin;
1497   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1498   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1499   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1500 
1501   idx = a->j;
1502   v   = a->a;
1503   if (usecprow){
1504     mbs    = a->compressedrow.nrows;
1505     ii     = a->compressedrow.i;
1506     ridx = a->compressedrow.rindex;
1507   } else {
1508     mbs = a->mbs;
1509     ii  = a->i;
1510     z   = zarray;
1511   }
1512 
1513   if (!a->mult_work) {
1514     k    = PetscMax(A->rmap->n,A->cmap->n);
1515     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1516   }
1517   work = a->mult_work;
1518   for (i=0; i<mbs; i++) {
1519     n     = ii[1] - ii[0]; ii++;
1520     ncols = n*bs;
1521     workt = work;
1522     for (j=0; j<n; j++) {
1523       xb = x + bs*(*idx++);
1524       for (k=0; k<bs; k++) workt[k] = xb[k];
1525       workt += bs;
1526     }
1527     if (usecprow) z = zarray + bs*ridx[i];
1528     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1529     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1530     v += n*bs2;
1531     if (!usecprow){
1532       z += bs;
1533     }
1534   }
1535   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1536   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1537   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1538   PetscFunctionReturn(0);
1539 }
1540 
1541 #undef __FUNCT__
1542 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1543 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1544 {
1545   PetscScalar    zero = 0.0;
1546   PetscErrorCode ierr;
1547 
1548   PetscFunctionBegin;
1549   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1550   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1551   PetscFunctionReturn(0);
1552 }
1553 
1554 #undef __FUNCT__
1555 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1556 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1557 {
1558   PetscScalar    zero = 0.0;
1559   PetscErrorCode ierr;
1560 
1561   PetscFunctionBegin;
1562   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1563   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1564   PetscFunctionReturn(0);
1565 }
1566 
1567 #undef __FUNCT__
1568 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1569 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1570 
1571 {
1572   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1573   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1574   MatScalar         *v;
1575   PetscErrorCode    ierr;
1576   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1577   Mat_CompressedRow cprow = a->compressedrow;
1578   PetscTruth        usecprow=cprow.use;
1579 
1580   PetscFunctionBegin;
1581   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1583   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1584 
1585   idx = a->j;
1586   v   = a->a;
1587   if (usecprow){
1588     mbs  = cprow.nrows;
1589     ii   = cprow.i;
1590     ridx = cprow.rindex;
1591   } else {
1592     mbs=a->mbs;
1593     ii = a->i;
1594     xb = x;
1595   }
1596 
1597   switch (bs) {
1598   case 1:
1599     for (i=0; i<mbs; i++) {
1600       if (usecprow) xb = x + ridx[i];
1601       x1 = xb[0];
1602       ib = idx + ii[0];
1603       n  = ii[1] - ii[0]; ii++;
1604       for (j=0; j<n; j++) {
1605         rval    = ib[j];
1606         z[rval] += PetscConj(*v) * x1;
1607         v++;
1608       }
1609       if (!usecprow) xb++;
1610     }
1611     break;
1612   case 2:
1613     for (i=0; i<mbs; i++) {
1614       if (usecprow) xb = x + 2*ridx[i];
1615       x1 = xb[0]; x2 = xb[1];
1616       ib = idx + ii[0];
1617       n  = ii[1] - ii[0]; ii++;
1618       for (j=0; j<n; j++) {
1619         rval      = ib[j]*2;
1620         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1621         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1622         v  += 4;
1623       }
1624       if (!usecprow) xb += 2;
1625     }
1626     break;
1627   case 3:
1628     for (i=0; i<mbs; i++) {
1629       if (usecprow) xb = x + 3*ridx[i];
1630       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1631       ib = idx + ii[0];
1632       n  = ii[1] - ii[0]; ii++;
1633       for (j=0; j<n; j++) {
1634         rval      = ib[j]*3;
1635         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1636         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1637         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1638         v  += 9;
1639       }
1640       if (!usecprow) xb += 3;
1641     }
1642     break;
1643   case 4:
1644     for (i=0; i<mbs; i++) {
1645       if (usecprow) xb = x + 4*ridx[i];
1646       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1647       ib = idx + ii[0];
1648       n  = ii[1] - ii[0]; ii++;
1649       for (j=0; j<n; j++) {
1650         rval      = ib[j]*4;
1651         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1652         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1653         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1654         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1655         v  += 16;
1656       }
1657       if (!usecprow) xb += 4;
1658     }
1659     break;
1660   case 5:
1661     for (i=0; i<mbs; i++) {
1662       if (usecprow) xb = x + 5*ridx[i];
1663       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1664       x4 = xb[3]; x5 = xb[4];
1665       ib = idx + ii[0];
1666       n  = ii[1] - ii[0]; ii++;
1667       for (j=0; j<n; j++) {
1668         rval      = ib[j]*5;
1669         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1670         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1671         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1672         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1673         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1674         v  += 25;
1675       }
1676       if (!usecprow) xb += 5;
1677     }
1678     break;
1679   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1680       PetscInt     ncols,k;
1681       PetscScalar  *work,*workt,*xtmp;
1682 
1683       SETERRQ(PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1684       if (!a->mult_work) {
1685         k = PetscMax(A->rmap->n,A->cmap->n);
1686         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1687       }
1688       work = a->mult_work;
1689       xtmp = x;
1690       for (i=0; i<mbs; i++) {
1691         n     = ii[1] - ii[0]; ii++;
1692         ncols = n*bs;
1693         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1694         if (usecprow) {
1695           xtmp = x + bs*ridx[i];
1696         }
1697         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1698         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1699         v += n*bs2;
1700         if (!usecprow) xtmp += bs;
1701         workt = work;
1702         for (j=0; j<n; j++) {
1703           zb = z + bs*(*idx++);
1704           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1705           workt += bs;
1706         }
1707       }
1708     }
1709   }
1710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1711   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1712   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1713   PetscFunctionReturn(0);
1714 }
1715 
1716 #undef __FUNCT__
1717 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1718 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1719 {
1720   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1721   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1722   MatScalar         *v;
1723   PetscErrorCode    ierr;
1724   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1725   Mat_CompressedRow cprow = a->compressedrow;
1726   PetscTruth        usecprow=cprow.use;
1727 
1728   PetscFunctionBegin;
1729   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1730   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1731   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1732 
1733   idx = a->j;
1734   v   = a->a;
1735   if (usecprow){
1736     mbs  = cprow.nrows;
1737     ii   = cprow.i;
1738     ridx = cprow.rindex;
1739   } else {
1740     mbs=a->mbs;
1741     ii = a->i;
1742     xb = x;
1743   }
1744 
1745   switch (bs) {
1746   case 1:
1747     for (i=0; i<mbs; i++) {
1748       if (usecprow) xb = x + ridx[i];
1749       x1 = xb[0];
1750       ib = idx + ii[0];
1751       n  = ii[1] - ii[0]; ii++;
1752       for (j=0; j<n; j++) {
1753         rval    = ib[j];
1754         z[rval] += *v * x1;
1755         v++;
1756       }
1757       if (!usecprow) xb++;
1758     }
1759     break;
1760   case 2:
1761     for (i=0; i<mbs; i++) {
1762       if (usecprow) xb = x + 2*ridx[i];
1763       x1 = xb[0]; x2 = xb[1];
1764       ib = idx + ii[0];
1765       n  = ii[1] - ii[0]; ii++;
1766       for (j=0; j<n; j++) {
1767         rval      = ib[j]*2;
1768         z[rval++] += v[0]*x1 + v[1]*x2;
1769         z[rval++] += v[2]*x1 + v[3]*x2;
1770         v  += 4;
1771       }
1772       if (!usecprow) xb += 2;
1773     }
1774     break;
1775   case 3:
1776     for (i=0; i<mbs; i++) {
1777       if (usecprow) xb = x + 3*ridx[i];
1778       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1779       ib = idx + ii[0];
1780       n  = ii[1] - ii[0]; ii++;
1781       for (j=0; j<n; j++) {
1782         rval      = ib[j]*3;
1783         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1784         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1785         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1786         v  += 9;
1787       }
1788       if (!usecprow) xb += 3;
1789     }
1790     break;
1791   case 4:
1792     for (i=0; i<mbs; i++) {
1793       if (usecprow) xb = x + 4*ridx[i];
1794       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1795       ib = idx + ii[0];
1796       n  = ii[1] - ii[0]; ii++;
1797       for (j=0; j<n; j++) {
1798         rval      = ib[j]*4;
1799         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1800         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1801         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1802         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1803         v  += 16;
1804       }
1805       if (!usecprow) xb += 4;
1806     }
1807     break;
1808   case 5:
1809     for (i=0; i<mbs; i++) {
1810       if (usecprow) xb = x + 5*ridx[i];
1811       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1812       x4 = xb[3]; x5 = xb[4];
1813       ib = idx + ii[0];
1814       n  = ii[1] - ii[0]; ii++;
1815       for (j=0; j<n; j++) {
1816         rval      = ib[j]*5;
1817         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1818         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1819         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1820         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1821         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1822         v  += 25;
1823       }
1824       if (!usecprow) xb += 5;
1825     }
1826     break;
1827   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1828       PetscInt     ncols,k;
1829       PetscScalar  *work,*workt,*xtmp;
1830 
1831       if (!a->mult_work) {
1832         k = PetscMax(A->rmap->n,A->cmap->n);
1833         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1834       }
1835       work = a->mult_work;
1836       xtmp = x;
1837       for (i=0; i<mbs; i++) {
1838         n     = ii[1] - ii[0]; ii++;
1839         ncols = n*bs;
1840         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1841         if (usecprow) {
1842           xtmp = x + bs*ridx[i];
1843         }
1844         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1845         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1846         v += n*bs2;
1847         if (!usecprow) xtmp += bs;
1848         workt = work;
1849         for (j=0; j<n; j++) {
1850           zb = z + bs*(*idx++);
1851           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1852           workt += bs;
1853         }
1854       }
1855     }
1856   }
1857   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1858   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1859   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1860   PetscFunctionReturn(0);
1861 }
1862 
1863 #undef __FUNCT__
1864 #define __FUNCT__ "MatScale_SeqBAIJ"
1865 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1866 {
1867   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)inA->data;
1868   PetscInt       totalnz = a->bs2*a->nz;
1869   PetscScalar    oalpha = alpha;
1870   PetscErrorCode ierr;
1871   PetscBLASInt   one = 1,tnz = PetscBLASIntCast(totalnz);
1872 
1873   PetscFunctionBegin;
1874   BLASscal_(&tnz,&oalpha,a->a,&one);
1875   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1876   PetscFunctionReturn(0);
1877 }
1878 
1879 #undef __FUNCT__
1880 #define __FUNCT__ "MatNorm_SeqBAIJ"
1881 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1882 {
1883   PetscErrorCode ierr;
1884   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1885   MatScalar      *v = a->a;
1886   PetscReal      sum = 0.0;
1887   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1888 
1889   PetscFunctionBegin;
1890   if (type == NORM_FROBENIUS) {
1891     for (i=0; i< bs2*nz; i++) {
1892 #if defined(PETSC_USE_COMPLEX)
1893       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1894 #else
1895       sum += (*v)*(*v); v++;
1896 #endif
1897     }
1898     *norm = sqrt(sum);
1899   } else if (type == NORM_1) { /* maximum column sum */
1900     PetscReal *tmp;
1901     PetscInt  *bcol = a->j;
1902     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
1903     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
1904     for (i=0; i<nz; i++){
1905       for (j=0; j<bs; j++){
1906         k1 = bs*(*bcol) + j; /* column index */
1907         for (k=0; k<bs; k++){
1908           tmp[k1] += PetscAbsScalar(*v); v++;
1909         }
1910       }
1911       bcol++;
1912     }
1913     *norm = 0.0;
1914     for (j=0; j<A->cmap->n; j++) {
1915       if (tmp[j] > *norm) *norm = tmp[j];
1916     }
1917     ierr = PetscFree(tmp);CHKERRQ(ierr);
1918   } else if (type == NORM_INFINITY) { /* maximum row sum */
1919     *norm = 0.0;
1920     for (k=0; k<bs; k++) {
1921       for (j=0; j<a->mbs; j++) {
1922         v = a->a + bs2*a->i[j] + k;
1923         sum = 0.0;
1924         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1925           for (k1=0; k1<bs; k1++){
1926             sum += PetscAbsScalar(*v);
1927             v   += bs;
1928           }
1929         }
1930         if (sum > *norm) *norm = sum;
1931       }
1932     }
1933   } else {
1934     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1935   }
1936   PetscFunctionReturn(0);
1937 }
1938 
1939 
1940 #undef __FUNCT__
1941 #define __FUNCT__ "MatEqual_SeqBAIJ"
1942 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1943 {
1944   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1945   PetscErrorCode ierr;
1946 
1947   PetscFunctionBegin;
1948   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1949   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1950     *flg = PETSC_FALSE;
1951     PetscFunctionReturn(0);
1952   }
1953 
1954   /* if the a->i are the same */
1955   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1956   if (!*flg) {
1957     PetscFunctionReturn(0);
1958   }
1959 
1960   /* if a->j are the same */
1961   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1962   if (!*flg) {
1963     PetscFunctionReturn(0);
1964   }
1965   /* if a->a are the same */
1966   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1967   PetscFunctionReturn(0);
1968 
1969 }
1970 
1971 #undef __FUNCT__
1972 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1973 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1974 {
1975   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1976   PetscErrorCode ierr;
1977   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1978   PetscScalar    *x,zero = 0.0;
1979   MatScalar      *aa,*aa_j;
1980 
1981   PetscFunctionBegin;
1982   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1983   bs   = A->rmap->bs;
1984   aa   = a->a;
1985   ai   = a->i;
1986   aj   = a->j;
1987   ambs = a->mbs;
1988   bs2  = a->bs2;
1989 
1990   ierr = VecSet(v,zero);CHKERRQ(ierr);
1991   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1992   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1993   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1994   for (i=0; i<ambs; i++) {
1995     for (j=ai[i]; j<ai[i+1]; j++) {
1996       if (aj[j] == i) {
1997         row  = i*bs;
1998         aa_j = aa+j*bs2;
1999         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2000         break;
2001       }
2002     }
2003   }
2004   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2005   PetscFunctionReturn(0);
2006 }
2007 
2008 #undef __FUNCT__
2009 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2010 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2011 {
2012   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2013   PetscScalar    *l,*r,x,*li,*ri;
2014   MatScalar      *aa,*v;
2015   PetscErrorCode ierr;
2016   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
2017 
2018   PetscFunctionBegin;
2019   ai  = a->i;
2020   aj  = a->j;
2021   aa  = a->a;
2022   m   = A->rmap->n;
2023   n   = A->cmap->n;
2024   bs  = A->rmap->bs;
2025   mbs = a->mbs;
2026   bs2 = a->bs2;
2027   if (ll) {
2028     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
2029     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2030     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2031     for (i=0; i<mbs; i++) { /* for each block row */
2032       M  = ai[i+1] - ai[i];
2033       li = l + i*bs;
2034       v  = aa + bs2*ai[i];
2035       for (j=0; j<M; j++) { /* for each block */
2036         for (k=0; k<bs2; k++) {
2037           (*v++) *= li[k%bs];
2038         }
2039       }
2040     }
2041     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
2042     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2043   }
2044 
2045   if (rr) {
2046     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
2047     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2048     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2049     for (i=0; i<mbs; i++) { /* for each block row */
2050       M  = ai[i+1] - ai[i];
2051       v  = aa + bs2*ai[i];
2052       for (j=0; j<M; j++) { /* for each block */
2053         ri = r + bs*aj[ai[i]+j];
2054         for (k=0; k<bs; k++) {
2055           x = ri[k];
2056           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
2057         }
2058       }
2059     }
2060     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
2061     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2062   }
2063   PetscFunctionReturn(0);
2064 }
2065 
2066 
2067 #undef __FUNCT__
2068 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2069 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2070 {
2071   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2072 
2073   PetscFunctionBegin;
2074   info->block_size     = a->bs2;
2075   info->nz_allocated   = a->maxnz;
2076   info->nz_used        = a->bs2*a->nz;
2077   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
2078   info->assemblies   = A->num_ass;
2079   info->mallocs      = a->reallocs;
2080   info->memory       = ((PetscObject)A)->mem;
2081   if (A->factor) {
2082     info->fill_ratio_given  = A->info.fill_ratio_given;
2083     info->fill_ratio_needed = A->info.fill_ratio_needed;
2084     info->factor_mallocs    = A->info.factor_mallocs;
2085   } else {
2086     info->fill_ratio_given  = 0;
2087     info->fill_ratio_needed = 0;
2088     info->factor_mallocs    = 0;
2089   }
2090   PetscFunctionReturn(0);
2091 }
2092 
2093 
2094 #undef __FUNCT__
2095 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2096 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2097 {
2098   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2099   PetscErrorCode ierr;
2100 
2101   PetscFunctionBegin;
2102   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2103   PetscFunctionReturn(0);
2104 }
2105