xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision c05d87d6c2fa1d0952fe676de20339d09daa79bc)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
14   const PetscInt *idx;
15   PetscInt       start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table;
17 
18   PetscFunctionBegin;
19   m     = a->mbs;
20   ai    = a->i;
21   aj    = a->j;
22   bs    = A->rmap->bs;
23 
24   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(m,table);CHKERRQ(ierr);
27   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
28   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) {
31     /* Initialise the two local arrays */
32     isz  = 0;
33     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
37     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j=0; j<n ; ++j){
41       ival = idx[j]/bs; /* convert the indices into block indices */
42       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
44     }
45     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
46     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
47 
48     k = 0;
49     for (j=0; j<ov; j++){ /* for each overlap*/
50       n = isz;
51       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row+1];
55         for (l = start; l<end ; l++){
56           val = aj[l];
57           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
58         }
59       }
60     }
61     /* expand the Index Set */
62     for (j=0; j<isz; j++) {
63       for (k=0; k<bs; k++)
64         nidx2[j*bs+k] = nidx[j]*bs+k;
65     }
66     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
67   }
68   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
69   ierr = PetscFree(nidx);CHKERRQ(ierr);
70   ierr = PetscFree(nidx2);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 
74 #undef __FUNCT__
75 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
76 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77 {
78   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
79   PetscErrorCode ierr;
80   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
82   const PetscInt *irow,*icol;
83   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84   PetscInt       *aj = a->j,*ai = a->i;
85   MatScalar      *mat_a;
86   Mat            C;
87   PetscTruth     flag,sorted;
88 
89   PetscFunctionBegin;
90   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
91   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
92 
93   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
94   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
95   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
96   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
97 
98   ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
99   ssmap = smap;
100   ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
101   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
102   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
103   /* determine lens of each row */
104   for (i=0; i<nrows; i++) {
105     kstart  = ai[irow[i]];
106     kend    = kstart + a->ilen[irow[i]];
107     lens[i] = 0;
108       for (k=kstart; k<kend; k++) {
109         if (ssmap[aj[k]]) {
110           lens[i]++;
111         }
112       }
113     }
114   /* Create and fill new matrix */
115   if (scall == MAT_REUSE_MATRIX) {
116     c = (Mat_SeqBAIJ *)((*B)->data);
117 
118     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
119     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
120     if (!flag) {
121       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
122     }
123     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
124     C = *B;
125   } else {
126     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
127     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
128     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
129     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
130   }
131   c = (Mat_SeqBAIJ *)(C->data);
132   for (i=0; i<nrows; i++) {
133     row    = irow[i];
134     kstart = ai[row];
135     kend   = kstart + a->ilen[row];
136     mat_i  = c->i[i];
137     mat_j  = c->j + mat_i;
138     mat_a  = c->a + mat_i*bs2;
139     mat_ilen = c->ilen + i;
140     for (k=kstart; k<kend; k++) {
141       if ((tcol=ssmap[a->j[k]])) {
142         *mat_j++ = tcol - 1;
143         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
144         mat_a   += bs2;
145         (*mat_ilen)++;
146       }
147     }
148   }
149 
150   /* Free work space */
151   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
152   ierr = PetscFree(smap);CHKERRQ(ierr);
153   ierr = PetscFree(lens);CHKERRQ(ierr);
154   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
155   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
156 
157   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
158   *B = C;
159   PetscFunctionReturn(0);
160 }
161 
162 #undef __FUNCT__
163 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
164 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
165 {
166   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
167   IS             is1,is2;
168   PetscErrorCode ierr;
169   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
170   const PetscInt *irow,*icol;
171 
172   PetscFunctionBegin;
173   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
174   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
175   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
176   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
177 
178   /* Verify if the indices corespond to each element in a block
179    and form the IS with compressed IS */
180   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
181   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
182   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
183   count = 0;
184   for (i=0; i<a->mbs; i++) {
185     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
186     if (vary[i]==bs) iary[count++] = i;
187   }
188   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
189 
190   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
191   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
192   count = 0;
193   for (i=0; i<a->mbs; i++) {
194     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
195     if (vary[i]==bs) iary[count++] = i;
196   }
197   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr);
198   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
199   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
200   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
201 
202   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
203   ISDestroy(is1);
204   ISDestroy(is2);
205   PetscFunctionReturn(0);
206 }
207 
208 #undef __FUNCT__
209 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
210 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
211 {
212   PetscErrorCode ierr;
213   PetscInt       i;
214 
215   PetscFunctionBegin;
216   if (scall == MAT_INITIAL_MATRIX) {
217     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
218   }
219 
220   for (i=0; i<n; i++) {
221     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
222   }
223   PetscFunctionReturn(0);
224 }
225 
226 
227 /* -------------------------------------------------------*/
228 /* Should check that shapes of vectors and matrices match */
229 /* -------------------------------------------------------*/
230 #include "petscblaslapack.h"
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatMult_SeqBAIJ_1"
234 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
235 {
236   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
237   PetscScalar       *z,sum;
238   const PetscScalar *x;
239   const MatScalar   *v;
240   PetscErrorCode    ierr;
241   PetscInt          mbs,i,n,nonzerorow=0;
242   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
243   PetscTruth        usecprow=a->compressedrow.use;
244 
245   PetscFunctionBegin;
246   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
247   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
248 
249   if (usecprow){
250     mbs  = a->compressedrow.nrows;
251     ii   = a->compressedrow.i;
252     ridx = a->compressedrow.rindex;
253     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
254   } else {
255     mbs = a->mbs;
256     ii  = a->i;
257   }
258 
259   for (i=0; i<mbs; i++) {
260     n    = ii[1] - ii[0];
261     v    = a->a + ii[0];
262     idx  = a->j + ii[0];
263     ii++;
264     sum  = 0.0;
265     PetscSparseDensePlusDot(sum,x,v,idx,n);
266     if (usecprow){
267       z[ridx[i]] = sum;
268     } else {
269       nonzerorow += (n>0);
270       z[i] = sum;
271     }
272   }
273   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
274   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
275   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
276   PetscFunctionReturn(0);
277 }
278 
279 #undef __FUNCT__
280 #define __FUNCT__ "MatMult_SeqBAIJ_2"
281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
282 {
283   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
284   PetscScalar       *z = 0,sum1,sum2,*zarray;
285   const PetscScalar *x,*xb;
286   PetscScalar       x1,x2;
287   const MatScalar   *v;
288   PetscErrorCode    ierr;
289   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
290   PetscTruth        usecprow=a->compressedrow.use;
291 
292   PetscFunctionBegin;
293   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
294   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
295 
296   idx = a->j;
297   v   = a->a;
298   if (usecprow){
299     mbs  = a->compressedrow.nrows;
300     ii   = a->compressedrow.i;
301     ridx = a->compressedrow.rindex;
302   } else {
303     mbs = a->mbs;
304     ii  = a->i;
305     z   = zarray;
306   }
307 
308   for (i=0; i<mbs; i++) {
309     n  = ii[1] - ii[0]; ii++;
310     sum1 = 0.0; sum2 = 0.0;
311     nonzerorow += (n>0);
312     for (j=0; j<n; j++) {
313       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
314       sum1 += v[0]*x1 + v[2]*x2;
315       sum2 += v[1]*x1 + v[3]*x2;
316       v += 4;
317     }
318     if (usecprow) z = zarray + 2*ridx[i];
319     z[0] = sum1; z[1] = sum2;
320     if (!usecprow) z += 2;
321   }
322   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
323   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
324   ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr);
325   PetscFunctionReturn(0);
326 }
327 
328 #undef __FUNCT__
329 #define __FUNCT__ "MatMult_SeqBAIJ_3"
330 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
331 {
332   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
333   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
334   const PetscScalar *x,*xb;
335   const MatScalar   *v;
336   PetscErrorCode    ierr;
337   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
338   PetscTruth        usecprow=a->compressedrow.use;
339 
340 
341 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
342 #pragma disjoint(*v,*z,*xb)
343 #endif
344 
345   PetscFunctionBegin;
346   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
347   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
348 
349   idx = a->j;
350   v   = a->a;
351   if (usecprow){
352     mbs  = a->compressedrow.nrows;
353     ii   = a->compressedrow.i;
354     ridx = a->compressedrow.rindex;
355   } else {
356     mbs = a->mbs;
357     ii  = a->i;
358     z   = zarray;
359   }
360 
361   for (i=0; i<mbs; i++) {
362     n  = ii[1] - ii[0]; ii++;
363     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
364     nonzerorow += (n>0);
365     for (j=0; j<n; j++) {
366       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
367       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
368       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
369       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
370       v += 9;
371     }
372     if (usecprow) z = zarray + 3*ridx[i];
373     z[0] = sum1; z[1] = sum2; z[2] = sum3;
374     if (!usecprow) z += 3;
375   }
376   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
377   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
378   ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr);
379   PetscFunctionReturn(0);
380 }
381 
382 #undef __FUNCT__
383 #define __FUNCT__ "MatMult_SeqBAIJ_4"
384 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
385 {
386   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
387   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
388   const PetscScalar *x,*xb;
389   const MatScalar   *v;
390   PetscErrorCode    ierr;
391   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
392   PetscTruth        usecprow=a->compressedrow.use;
393 
394   PetscFunctionBegin;
395   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
396   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
397 
398   idx = a->j;
399   v   = a->a;
400   if (usecprow){
401     mbs  = a->compressedrow.nrows;
402     ii   = a->compressedrow.i;
403     ridx = a->compressedrow.rindex;
404   } else {
405     mbs = a->mbs;
406     ii  = a->i;
407     z   = zarray;
408   }
409 
410   for (i=0; i<mbs; i++) {
411     n  = ii[1] - ii[0]; ii++;
412     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
413     nonzerorow += (n>0);
414     for (j=0; j<n; j++) {
415       xb = x + 4*(*idx++);
416       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
417       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
418       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
419       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
420       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
421       v += 16;
422     }
423     if (usecprow) z = zarray + 4*ridx[i];
424     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
425     if (!usecprow) z += 4;
426   }
427   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
428   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
429   ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr);
430   PetscFunctionReturn(0);
431 }
432 
433 #undef __FUNCT__
434 #define __FUNCT__ "MatMult_SeqBAIJ_5"
435 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
436 {
437   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
438   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
439   const PetscScalar *xb,*x;
440   const MatScalar   *v;
441   PetscErrorCode    ierr;
442   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
443   PetscTruth        usecprow=a->compressedrow.use;
444 
445   PetscFunctionBegin;
446   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
447   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
448 
449   idx = a->j;
450   v   = a->a;
451   if (usecprow){
452     mbs  = a->compressedrow.nrows;
453     ii   = a->compressedrow.i;
454     ridx = a->compressedrow.rindex;
455   } else {
456     mbs = a->mbs;
457     ii  = a->i;
458     z   = zarray;
459   }
460 
461   for (i=0; i<mbs; i++) {
462     n  = ii[1] - ii[0]; ii++;
463     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
464     nonzerorow += (n>0);
465     for (j=0; j<n; j++) {
466       xb = x + 5*(*idx++);
467       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
468       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
469       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
470       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
471       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
472       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
473       v += 25;
474     }
475     if (usecprow) z = zarray + 5*ridx[i];
476     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
477     if (!usecprow) z += 5;
478   }
479   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
480   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
481   ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr);
482   PetscFunctionReturn(0);
483 }
484 
485 
486 #undef __FUNCT__
487 #define __FUNCT__ "MatMult_SeqBAIJ_6"
488 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
489 {
490   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
491   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
492   const PetscScalar *x,*xb;
493   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
494   const MatScalar   *v;
495   PetscErrorCode    ierr;
496   PetscInt          mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
497   PetscTruth        usecprow=a->compressedrow.use;
498 
499   PetscFunctionBegin;
500   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
501   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
502 
503   idx = a->j;
504   v   = a->a;
505   if (usecprow){
506     mbs  = a->compressedrow.nrows;
507     ii   = a->compressedrow.i;
508     ridx = a->compressedrow.rindex;
509   } else {
510     mbs = a->mbs;
511     ii  = a->i;
512     z   = zarray;
513   }
514 
515   for (i=0; i<mbs; i++) {
516     n  = ii[1] - ii[0]; ii++;
517     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
518     nonzerorow += (n>0);
519     for (j=0; j<n; j++) {
520       xb = x + 6*(*idx++);
521       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
522       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
523       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
524       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
525       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
526       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
527       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
528       v += 36;
529     }
530     if (usecprow) z = zarray + 6*ridx[i];
531     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
532     if (!usecprow) z += 6;
533   }
534 
535   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
536   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
537   ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr);
538   PetscFunctionReturn(0);
539 }
540 #undef __FUNCT__
541 #define __FUNCT__ "MatMult_SeqBAIJ_7"
542 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
543 {
544   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
545   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
546   const PetscScalar *x,*xb;
547   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
548   const MatScalar   *v;
549   PetscErrorCode    ierr;
550   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
551   PetscTruth        usecprow=a->compressedrow.use;
552 
553   PetscFunctionBegin;
554   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
555   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
556 
557   idx = a->j;
558   v   = a->a;
559   if (usecprow){
560     mbs    = a->compressedrow.nrows;
561     ii     = a->compressedrow.i;
562     ridx = a->compressedrow.rindex;
563   } else {
564     mbs = a->mbs;
565     ii  = a->i;
566     z   = zarray;
567   }
568 
569   for (i=0; i<mbs; i++) {
570     n  = ii[1] - ii[0]; ii++;
571     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
572     nonzerorow += (n>0);
573     for (j=0; j<n; j++) {
574       xb = x + 7*(*idx++);
575       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
576       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
577       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
578       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
579       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
580       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
581       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
582       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
583       v += 49;
584     }
585     if (usecprow) z = zarray + 7*ridx[i];
586     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
587     if (!usecprow) z += 7;
588   }
589 
590   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
591   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
592   ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr);
593   PetscFunctionReturn(0);
594 }
595 
596 /*
597     This will not work with MatScalar == float because it calls the BLAS
598 */
599 #undef __FUNCT__
600 #define __FUNCT__ "MatMult_SeqBAIJ_N"
601 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
602 {
603   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
604   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
605   MatScalar      *v;
606   PetscErrorCode ierr;
607   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
608   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
609   PetscTruth     usecprow=a->compressedrow.use;
610 
611   PetscFunctionBegin;
612   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
613   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
614 
615   idx = a->j;
616   v   = a->a;
617   if (usecprow){
618     mbs  = a->compressedrow.nrows;
619     ii   = a->compressedrow.i;
620     ridx = a->compressedrow.rindex;
621   } else {
622     mbs = a->mbs;
623     ii  = a->i;
624     z   = zarray;
625   }
626 
627   if (!a->mult_work) {
628     k    = PetscMax(A->rmap->n,A->cmap->n);
629     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
630   }
631   work = a->mult_work;
632   for (i=0; i<mbs; i++) {
633     n     = ii[1] - ii[0]; ii++;
634     ncols = n*bs;
635     workt = work;
636     nonzerorow += (n>0);
637     for (j=0; j<n; j++) {
638       xb = x + bs*(*idx++);
639       for (k=0; k<bs; k++) workt[k] = xb[k];
640       workt += bs;
641     }
642     if (usecprow) z = zarray + bs*ridx[i];
643     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
644     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
645     v += n*bs2;
646     if (!usecprow) z += bs;
647   }
648   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
649   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
650   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr);
651   PetscFunctionReturn(0);
652 }
653 
654 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
655 #undef __FUNCT__
656 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
657 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
658 {
659   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
660   const PetscScalar  *x;
661   PetscScalar        *y,*z,sum;
662   const MatScalar    *v;
663   PetscErrorCode     ierr;
664   PetscInt           mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
665   const PetscInt     *idx,*ii;
666   PetscTruth         usecprow=a->compressedrow.use;
667 
668   PetscFunctionBegin;
669   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
670   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
671   if (zz != yy) {
672     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
673   } else {
674     z = y;
675   }
676 
677   idx = a->j;
678   v   = a->a;
679   if (usecprow){
680     if (zz != yy){
681       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
682     }
683     mbs  = a->compressedrow.nrows;
684     ii   = a->compressedrow.i;
685     ridx = a->compressedrow.rindex;
686   } else {
687     ii  = a->i;
688   }
689 
690   for (i=0; i<mbs; i++) {
691     n    = ii[1] - ii[0];
692     ii++;
693     if (!usecprow){
694       nonzerorow += (n>0);
695       sum = y[i];
696     } else {
697       sum = y[ridx[i]];
698     }
699     PetscSparseDensePlusDot(sum,x,v,idx,n);
700     v += n;
701     idx += n;
702     if (usecprow){
703       z[ridx[i]] = sum;
704     } else {
705       z[i] = sum;
706     }
707   }
708   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
709   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
710   if (zz != yy) {
711     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
712   }
713   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
714   PetscFunctionReturn(0);
715 }
716 
717 #undef __FUNCT__
718 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
719 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
720 {
721   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
722   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
723   PetscScalar    x1,x2,*yarray,*zarray;
724   MatScalar      *v;
725   PetscErrorCode ierr;
726   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
727   PetscTruth     usecprow=a->compressedrow.use;
728 
729   PetscFunctionBegin;
730   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
732   if (zz != yy) {
733     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
734   } else {
735     zarray = yarray;
736   }
737 
738   idx = a->j;
739   v   = a->a;
740   if (usecprow){
741     if (zz != yy){
742       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
743     }
744     mbs  = a->compressedrow.nrows;
745     ii   = a->compressedrow.i;
746     ridx = a->compressedrow.rindex;
747     if (zz != yy){
748       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
749     }
750   } else {
751     ii  = a->i;
752     y   = yarray;
753     z   = zarray;
754   }
755 
756   for (i=0; i<mbs; i++) {
757     n  = ii[1] - ii[0]; ii++;
758     if (usecprow){
759       z = zarray + 2*ridx[i];
760       y = yarray + 2*ridx[i];
761     }
762     sum1 = y[0]; sum2 = y[1];
763     for (j=0; j<n; j++) {
764       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
765       sum1 += v[0]*x1 + v[2]*x2;
766       sum2 += v[1]*x1 + v[3]*x2;
767       v += 4;
768     }
769     z[0] = sum1; z[1] = sum2;
770     if (!usecprow){
771       z += 2; y += 2;
772     }
773   }
774   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
775   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
776   if (zz != yy) {
777     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
778   }
779   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
780   PetscFunctionReturn(0);
781 }
782 
783 #undef __FUNCT__
784 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
785 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
786 {
787   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
788   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
789   MatScalar      *v;
790   PetscErrorCode ierr;
791   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
792   PetscTruth     usecprow=a->compressedrow.use;
793 
794   PetscFunctionBegin;
795   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
796   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
797   if (zz != yy) {
798     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
799   } else {
800     zarray = yarray;
801   }
802 
803   idx = a->j;
804   v   = a->a;
805   if (usecprow){
806     if (zz != yy){
807       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
808     }
809     mbs  = a->compressedrow.nrows;
810     ii   = a->compressedrow.i;
811     ridx = a->compressedrow.rindex;
812   } else {
813     ii  = a->i;
814     y   = yarray;
815     z   = zarray;
816   }
817 
818   for (i=0; i<mbs; i++) {
819     n  = ii[1] - ii[0]; ii++;
820     if (usecprow){
821       z = zarray + 3*ridx[i];
822       y = yarray + 3*ridx[i];
823     }
824     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
825     for (j=0; j<n; j++) {
826       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
827       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
828       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
829       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
830       v += 9;
831     }
832     z[0] = sum1; z[1] = sum2; z[2] = sum3;
833     if (!usecprow){
834       z += 3; y += 3;
835     }
836   }
837   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
838   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
839   if (zz != yy) {
840     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
841   }
842   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
843   PetscFunctionReturn(0);
844 }
845 
846 #undef __FUNCT__
847 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
848 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
849 {
850   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
851   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
852   MatScalar      *v;
853   PetscErrorCode ierr;
854   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
855   PetscTruth     usecprow=a->compressedrow.use;
856 
857   PetscFunctionBegin;
858   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
859   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
860   if (zz != yy) {
861     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
862   } else {
863     zarray = yarray;
864   }
865 
866   idx   = a->j;
867   v     = a->a;
868   if (usecprow){
869     if (zz != yy){
870       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
871     }
872     mbs  = a->compressedrow.nrows;
873     ii   = a->compressedrow.i;
874     ridx = a->compressedrow.rindex;
875   } else {
876     ii  = a->i;
877     y   = yarray;
878     z   = zarray;
879   }
880 
881   for (i=0; i<mbs; i++) {
882     n  = ii[1] - ii[0]; ii++;
883     if (usecprow){
884       z = zarray + 4*ridx[i];
885       y = yarray + 4*ridx[i];
886     }
887     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
888     for (j=0; j<n; j++) {
889       xb = x + 4*(*idx++);
890       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
891       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
892       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
893       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
894       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
895       v += 16;
896     }
897     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
898     if (!usecprow){
899       z += 4; y += 4;
900     }
901   }
902   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
903   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
904   if (zz != yy) {
905     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
906   }
907   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
908   PetscFunctionReturn(0);
909 }
910 
911 #undef __FUNCT__
912 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
913 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
914 {
915   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
916   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
917   PetscScalar    *yarray,*zarray;
918   MatScalar      *v;
919   PetscErrorCode ierr;
920   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
921   PetscTruth     usecprow=a->compressedrow.use;
922 
923   PetscFunctionBegin;
924   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
925   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
926   if (zz != yy) {
927     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
928   } else {
929     zarray = yarray;
930   }
931 
932   idx = a->j;
933   v   = a->a;
934   if (usecprow){
935     if (zz != yy){
936       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
937     }
938     mbs  = a->compressedrow.nrows;
939     ii   = a->compressedrow.i;
940     ridx = a->compressedrow.rindex;
941   } else {
942     ii  = a->i;
943     y   = yarray;
944     z   = zarray;
945   }
946 
947   for (i=0; i<mbs; i++) {
948     n  = ii[1] - ii[0]; ii++;
949     if (usecprow){
950       z = zarray + 5*ridx[i];
951       y = yarray + 5*ridx[i];
952     }
953     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
954     for (j=0; j<n; j++) {
955       xb = x + 5*(*idx++);
956       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
957       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
958       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
959       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
960       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
961       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
962       v += 25;
963     }
964     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
965     if (!usecprow){
966       z += 5; y += 5;
967     }
968   }
969   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
970   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
971   if (zz != yy) {
972     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
973   }
974   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
975   PetscFunctionReturn(0);
976 }
977 #undef __FUNCT__
978 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
979 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
980 {
981   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
982   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
983   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
984   MatScalar      *v;
985   PetscErrorCode ierr;
986   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
987   PetscTruth     usecprow=a->compressedrow.use;
988 
989   PetscFunctionBegin;
990   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
991   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
992   if (zz != yy) {
993     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
994   } else {
995     zarray = yarray;
996   }
997 
998   idx = a->j;
999   v   = a->a;
1000   if (usecprow){
1001     if (zz != yy){
1002       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1003     }
1004     mbs  = a->compressedrow.nrows;
1005     ii   = a->compressedrow.i;
1006     ridx = a->compressedrow.rindex;
1007   } else {
1008     ii  = a->i;
1009     y   = yarray;
1010     z   = zarray;
1011   }
1012 
1013   for (i=0; i<mbs; i++) {
1014     n  = ii[1] - ii[0]; ii++;
1015     if (usecprow){
1016       z = zarray + 6*ridx[i];
1017       y = yarray + 6*ridx[i];
1018     }
1019     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1020     for (j=0; j<n; j++) {
1021       xb = x + 6*(*idx++);
1022       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1023       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1024       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1025       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1026       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1027       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1028       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1029       v += 36;
1030     }
1031     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1032     if (!usecprow){
1033       z += 6; y += 6;
1034     }
1035   }
1036   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1037   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1038   if (zz != yy) {
1039     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1040   }
1041   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1042   PetscFunctionReturn(0);
1043 }
1044 
1045 #undef __FUNCT__
1046 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1047 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1048 {
1049   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1050   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1051   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1052   MatScalar      *v;
1053   PetscErrorCode ierr;
1054   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1055   PetscTruth     usecprow=a->compressedrow.use;
1056 
1057   PetscFunctionBegin;
1058   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1059   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1060   if (zz != yy) {
1061     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1062   } else {
1063     zarray = yarray;
1064   }
1065 
1066   idx = a->j;
1067   v   = a->a;
1068   if (usecprow){
1069     if (zz != yy){
1070       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1071     }
1072     mbs  = a->compressedrow.nrows;
1073     ii   = a->compressedrow.i;
1074     ridx = a->compressedrow.rindex;
1075   } else {
1076     ii  = a->i;
1077     y   = yarray;
1078     z   = zarray;
1079   }
1080 
1081   for (i=0; i<mbs; i++) {
1082     n  = ii[1] - ii[0]; ii++;
1083     if (usecprow){
1084       z = zarray + 7*ridx[i];
1085       y = yarray + 7*ridx[i];
1086     }
1087     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1088     for (j=0; j<n; j++) {
1089       xb = x + 7*(*idx++);
1090       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1091       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1092       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1093       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1094       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1095       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1096       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1097       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1098       v += 49;
1099     }
1100     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1101     if (!usecprow){
1102       z += 7; y += 7;
1103     }
1104   }
1105   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1106   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1107   if (zz != yy) {
1108     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1109   }
1110   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1111   PetscFunctionReturn(0);
1112 }
1113 
1114 #undef __FUNCT__
1115 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1116 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1117 {
1118   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1119   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1120   MatScalar      *v;
1121   PetscErrorCode ierr;
1122   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1123   PetscInt       ncols,k,*ridx=PETSC_NULL;
1124   PetscTruth     usecprow=a->compressedrow.use;
1125 
1126   PetscFunctionBegin;
1127   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1128   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1129   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1130 
1131   idx = a->j;
1132   v   = a->a;
1133   if (usecprow){
1134     mbs    = a->compressedrow.nrows;
1135     ii     = a->compressedrow.i;
1136     ridx = a->compressedrow.rindex;
1137   } else {
1138     mbs = a->mbs;
1139     ii  = a->i;
1140     z   = zarray;
1141   }
1142 
1143   if (!a->mult_work) {
1144     k    = PetscMax(A->rmap->n,A->cmap->n);
1145     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1146   }
1147   work = a->mult_work;
1148   for (i=0; i<mbs; i++) {
1149     n     = ii[1] - ii[0]; ii++;
1150     ncols = n*bs;
1151     workt = work;
1152     for (j=0; j<n; j++) {
1153       xb = x + bs*(*idx++);
1154       for (k=0; k<bs; k++) workt[k] = xb[k];
1155       workt += bs;
1156     }
1157     if (usecprow) z = zarray + bs*ridx[i];
1158     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1159     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1160     v += n*bs2;
1161     if (!usecprow){
1162       z += bs;
1163     }
1164   }
1165   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1166   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1167   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1168   PetscFunctionReturn(0);
1169 }
1170 
1171 #undef __FUNCT__
1172 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1173 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1174 {
1175   PetscScalar    zero = 0.0;
1176   PetscErrorCode ierr;
1177 
1178   PetscFunctionBegin;
1179   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1180   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1181   PetscFunctionReturn(0);
1182 }
1183 
1184 #undef __FUNCT__
1185 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1186 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1187 {
1188   PetscScalar    zero = 0.0;
1189   PetscErrorCode ierr;
1190 
1191   PetscFunctionBegin;
1192   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1193   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1194   PetscFunctionReturn(0);
1195 }
1196 
1197 #undef __FUNCT__
1198 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1199 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1200 
1201 {
1202   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1203   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1204   MatScalar         *v;
1205   PetscErrorCode    ierr;
1206   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1207   Mat_CompressedRow cprow = a->compressedrow;
1208   PetscTruth        usecprow=cprow.use;
1209 
1210   PetscFunctionBegin;
1211   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1212   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1213   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1214 
1215   idx = a->j;
1216   v   = a->a;
1217   if (usecprow){
1218     mbs  = cprow.nrows;
1219     ii   = cprow.i;
1220     ridx = cprow.rindex;
1221   } else {
1222     mbs=a->mbs;
1223     ii = a->i;
1224     xb = x;
1225   }
1226 
1227   switch (bs) {
1228   case 1:
1229     for (i=0; i<mbs; i++) {
1230       if (usecprow) xb = x + ridx[i];
1231       x1 = xb[0];
1232       ib = idx + ii[0];
1233       n  = ii[1] - ii[0]; ii++;
1234       for (j=0; j<n; j++) {
1235         rval    = ib[j];
1236         z[rval] += PetscConj(*v) * x1;
1237         v++;
1238       }
1239       if (!usecprow) xb++;
1240     }
1241     break;
1242   case 2:
1243     for (i=0; i<mbs; i++) {
1244       if (usecprow) xb = x + 2*ridx[i];
1245       x1 = xb[0]; x2 = xb[1];
1246       ib = idx + ii[0];
1247       n  = ii[1] - ii[0]; ii++;
1248       for (j=0; j<n; j++) {
1249         rval      = ib[j]*2;
1250         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1251         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1252         v  += 4;
1253       }
1254       if (!usecprow) xb += 2;
1255     }
1256     break;
1257   case 3:
1258     for (i=0; i<mbs; i++) {
1259       if (usecprow) xb = x + 3*ridx[i];
1260       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1261       ib = idx + ii[0];
1262       n  = ii[1] - ii[0]; ii++;
1263       for (j=0; j<n; j++) {
1264         rval      = ib[j]*3;
1265         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1266         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1267         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1268         v  += 9;
1269       }
1270       if (!usecprow) xb += 3;
1271     }
1272     break;
1273   case 4:
1274     for (i=0; i<mbs; i++) {
1275       if (usecprow) xb = x + 4*ridx[i];
1276       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1277       ib = idx + ii[0];
1278       n  = ii[1] - ii[0]; ii++;
1279       for (j=0; j<n; j++) {
1280         rval      = ib[j]*4;
1281         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1282         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1283         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1284         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1285         v  += 16;
1286       }
1287       if (!usecprow) xb += 4;
1288     }
1289     break;
1290   case 5:
1291     for (i=0; i<mbs; i++) {
1292       if (usecprow) xb = x + 5*ridx[i];
1293       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1294       x4 = xb[3]; x5 = xb[4];
1295       ib = idx + ii[0];
1296       n  = ii[1] - ii[0]; ii++;
1297       for (j=0; j<n; j++) {
1298         rval      = ib[j]*5;
1299         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1300         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1301         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1302         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1303         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1304         v  += 25;
1305       }
1306       if (!usecprow) xb += 5;
1307     }
1308     break;
1309   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1310       PetscInt     ncols,k;
1311       PetscScalar  *work,*workt,*xtmp;
1312 
1313       SETERRQ(PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1314       if (!a->mult_work) {
1315         k = PetscMax(A->rmap->n,A->cmap->n);
1316         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1317       }
1318       work = a->mult_work;
1319       xtmp = x;
1320       for (i=0; i<mbs; i++) {
1321         n     = ii[1] - ii[0]; ii++;
1322         ncols = n*bs;
1323         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1324         if (usecprow) {
1325           xtmp = x + bs*ridx[i];
1326         }
1327         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1328         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1329         v += n*bs2;
1330         if (!usecprow) xtmp += bs;
1331         workt = work;
1332         for (j=0; j<n; j++) {
1333           zb = z + bs*(*idx++);
1334           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1335           workt += bs;
1336         }
1337       }
1338     }
1339   }
1340   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1341   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1342   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1343   PetscFunctionReturn(0);
1344 }
1345 
1346 #undef __FUNCT__
1347 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1348 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1349 
1350 {
1351   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1352   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1353   MatScalar         *v;
1354   PetscErrorCode    ierr;
1355   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1356   Mat_CompressedRow cprow = a->compressedrow;
1357   PetscTruth        usecprow=cprow.use;
1358 
1359   PetscFunctionBegin;
1360   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1361   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1362   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1363 
1364   idx = a->j;
1365   v   = a->a;
1366   if (usecprow){
1367     mbs  = cprow.nrows;
1368     ii   = cprow.i;
1369     ridx = cprow.rindex;
1370   } else {
1371     mbs=a->mbs;
1372     ii = a->i;
1373     xb = x;
1374   }
1375 
1376   switch (bs) {
1377   case 1:
1378     for (i=0; i<mbs; i++) {
1379       if (usecprow) xb = x + ridx[i];
1380       x1 = xb[0];
1381       ib = idx + ii[0];
1382       n  = ii[1] - ii[0]; ii++;
1383       for (j=0; j<n; j++) {
1384         rval    = ib[j];
1385         z[rval] += *v * x1;
1386         v++;
1387       }
1388       if (!usecprow) xb++;
1389     }
1390     break;
1391   case 2:
1392     for (i=0; i<mbs; i++) {
1393       if (usecprow) xb = x + 2*ridx[i];
1394       x1 = xb[0]; x2 = xb[1];
1395       ib = idx + ii[0];
1396       n  = ii[1] - ii[0]; ii++;
1397       for (j=0; j<n; j++) {
1398         rval      = ib[j]*2;
1399         z[rval++] += v[0]*x1 + v[1]*x2;
1400         z[rval++] += v[2]*x1 + v[3]*x2;
1401         v  += 4;
1402       }
1403       if (!usecprow) xb += 2;
1404     }
1405     break;
1406   case 3:
1407     for (i=0; i<mbs; i++) {
1408       if (usecprow) xb = x + 3*ridx[i];
1409       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1410       ib = idx + ii[0];
1411       n  = ii[1] - ii[0]; ii++;
1412       for (j=0; j<n; j++) {
1413         rval      = ib[j]*3;
1414         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1415         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1416         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1417         v  += 9;
1418       }
1419       if (!usecprow) xb += 3;
1420     }
1421     break;
1422   case 4:
1423     for (i=0; i<mbs; i++) {
1424       if (usecprow) xb = x + 4*ridx[i];
1425       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1426       ib = idx + ii[0];
1427       n  = ii[1] - ii[0]; ii++;
1428       for (j=0; j<n; j++) {
1429         rval      = ib[j]*4;
1430         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1431         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1432         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1433         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1434         v  += 16;
1435       }
1436       if (!usecprow) xb += 4;
1437     }
1438     break;
1439   case 5:
1440     for (i=0; i<mbs; i++) {
1441       if (usecprow) xb = x + 5*ridx[i];
1442       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1443       x4 = xb[3]; x5 = xb[4];
1444       ib = idx + ii[0];
1445       n  = ii[1] - ii[0]; ii++;
1446       for (j=0; j<n; j++) {
1447         rval      = ib[j]*5;
1448         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1449         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1450         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1451         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1452         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1453         v  += 25;
1454       }
1455       if (!usecprow) xb += 5;
1456     }
1457     break;
1458   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1459       PetscInt     ncols,k;
1460       PetscScalar  *work,*workt,*xtmp;
1461 
1462       if (!a->mult_work) {
1463         k = PetscMax(A->rmap->n,A->cmap->n);
1464         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1465       }
1466       work = a->mult_work;
1467       xtmp = x;
1468       for (i=0; i<mbs; i++) {
1469         n     = ii[1] - ii[0]; ii++;
1470         ncols = n*bs;
1471         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1472         if (usecprow) {
1473           xtmp = x + bs*ridx[i];
1474         }
1475         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1476         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1477         v += n*bs2;
1478         if (!usecprow) xtmp += bs;
1479         workt = work;
1480         for (j=0; j<n; j++) {
1481           zb = z + bs*(*idx++);
1482           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1483           workt += bs;
1484         }
1485       }
1486     }
1487   }
1488   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1489   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1490   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1491   PetscFunctionReturn(0);
1492 }
1493 
1494 #undef __FUNCT__
1495 #define __FUNCT__ "MatScale_SeqBAIJ"
1496 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1497 {
1498   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)inA->data;
1499   PetscInt       totalnz = a->bs2*a->nz;
1500   PetscScalar    oalpha = alpha;
1501   PetscErrorCode ierr;
1502   PetscBLASInt   one = 1,tnz = PetscBLASIntCast(totalnz);
1503 
1504   PetscFunctionBegin;
1505   BLASscal_(&tnz,&oalpha,a->a,&one);
1506   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1507   PetscFunctionReturn(0);
1508 }
1509 
1510 #undef __FUNCT__
1511 #define __FUNCT__ "MatNorm_SeqBAIJ"
1512 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1513 {
1514   PetscErrorCode ierr;
1515   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1516   MatScalar      *v = a->a;
1517   PetscReal      sum = 0.0;
1518   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1519 
1520   PetscFunctionBegin;
1521   if (type == NORM_FROBENIUS) {
1522     for (i=0; i< bs2*nz; i++) {
1523 #if defined(PETSC_USE_COMPLEX)
1524       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1525 #else
1526       sum += (*v)*(*v); v++;
1527 #endif
1528     }
1529     *norm = sqrt(sum);
1530   } else if (type == NORM_1) { /* maximum column sum */
1531     PetscReal *tmp;
1532     PetscInt  *bcol = a->j;
1533     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
1534     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
1535     for (i=0; i<nz; i++){
1536       for (j=0; j<bs; j++){
1537         k1 = bs*(*bcol) + j; /* column index */
1538         for (k=0; k<bs; k++){
1539           tmp[k1] += PetscAbsScalar(*v); v++;
1540         }
1541       }
1542       bcol++;
1543     }
1544     *norm = 0.0;
1545     for (j=0; j<A->cmap->n; j++) {
1546       if (tmp[j] > *norm) *norm = tmp[j];
1547     }
1548     ierr = PetscFree(tmp);CHKERRQ(ierr);
1549   } else if (type == NORM_INFINITY) { /* maximum row sum */
1550     *norm = 0.0;
1551     for (k=0; k<bs; k++) {
1552       for (j=0; j<a->mbs; j++) {
1553         v = a->a + bs2*a->i[j] + k;
1554         sum = 0.0;
1555         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1556           for (k1=0; k1<bs; k1++){
1557             sum += PetscAbsScalar(*v);
1558             v   += bs;
1559           }
1560         }
1561         if (sum > *norm) *norm = sum;
1562       }
1563     }
1564   } else {
1565     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1566   }
1567   PetscFunctionReturn(0);
1568 }
1569 
1570 
1571 #undef __FUNCT__
1572 #define __FUNCT__ "MatEqual_SeqBAIJ"
1573 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1574 {
1575   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1576   PetscErrorCode ierr;
1577 
1578   PetscFunctionBegin;
1579   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1580   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1581     *flg = PETSC_FALSE;
1582     PetscFunctionReturn(0);
1583   }
1584 
1585   /* if the a->i are the same */
1586   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1587   if (!*flg) {
1588     PetscFunctionReturn(0);
1589   }
1590 
1591   /* if a->j are the same */
1592   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1593   if (!*flg) {
1594     PetscFunctionReturn(0);
1595   }
1596   /* if a->a are the same */
1597   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1598   PetscFunctionReturn(0);
1599 
1600 }
1601 
1602 #undef __FUNCT__
1603 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1604 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1605 {
1606   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1607   PetscErrorCode ierr;
1608   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1609   PetscScalar    *x,zero = 0.0;
1610   MatScalar      *aa,*aa_j;
1611 
1612   PetscFunctionBegin;
1613   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1614   bs   = A->rmap->bs;
1615   aa   = a->a;
1616   ai   = a->i;
1617   aj   = a->j;
1618   ambs = a->mbs;
1619   bs2  = a->bs2;
1620 
1621   ierr = VecSet(v,zero);CHKERRQ(ierr);
1622   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1623   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1624   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1625   for (i=0; i<ambs; i++) {
1626     for (j=ai[i]; j<ai[i+1]; j++) {
1627       if (aj[j] == i) {
1628         row  = i*bs;
1629         aa_j = aa+j*bs2;
1630         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1631         break;
1632       }
1633     }
1634   }
1635   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1636   PetscFunctionReturn(0);
1637 }
1638 
1639 #undef __FUNCT__
1640 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
1641 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1642 {
1643   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1644   PetscScalar    *l,*r,x,*li,*ri;
1645   MatScalar      *aa,*v;
1646   PetscErrorCode ierr;
1647   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1648 
1649   PetscFunctionBegin;
1650   ai  = a->i;
1651   aj  = a->j;
1652   aa  = a->a;
1653   m   = A->rmap->n;
1654   n   = A->cmap->n;
1655   bs  = A->rmap->bs;
1656   mbs = a->mbs;
1657   bs2 = a->bs2;
1658   if (ll) {
1659     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1660     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1661     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1662     for (i=0; i<mbs; i++) { /* for each block row */
1663       M  = ai[i+1] - ai[i];
1664       li = l + i*bs;
1665       v  = aa + bs2*ai[i];
1666       for (j=0; j<M; j++) { /* for each block */
1667         for (k=0; k<bs2; k++) {
1668           (*v++) *= li[k%bs];
1669         }
1670       }
1671     }
1672     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1673     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1674   }
1675 
1676   if (rr) {
1677     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1678     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1679     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1680     for (i=0; i<mbs; i++) { /* for each block row */
1681       M  = ai[i+1] - ai[i];
1682       v  = aa + bs2*ai[i];
1683       for (j=0; j<M; j++) { /* for each block */
1684         ri = r + bs*aj[ai[i]+j];
1685         for (k=0; k<bs; k++) {
1686           x = ri[k];
1687           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1688         }
1689       }
1690     }
1691     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1692     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1693   }
1694   PetscFunctionReturn(0);
1695 }
1696 
1697 
1698 #undef __FUNCT__
1699 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
1700 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1701 {
1702   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1703 
1704   PetscFunctionBegin;
1705   info->block_size     = a->bs2;
1706   info->nz_allocated   = a->maxnz;
1707   info->nz_used        = a->bs2*a->nz;
1708   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1709   info->assemblies   = A->num_ass;
1710   info->mallocs      = a->reallocs;
1711   info->memory       = ((PetscObject)A)->mem;
1712   if (A->factor) {
1713     info->fill_ratio_given  = A->info.fill_ratio_given;
1714     info->fill_ratio_needed = A->info.fill_ratio_needed;
1715     info->factor_mallocs    = A->info.factor_mallocs;
1716   } else {
1717     info->fill_ratio_given  = 0;
1718     info->fill_ratio_needed = 0;
1719     info->factor_mallocs    = 0;
1720   }
1721   PetscFunctionReturn(0);
1722 }
1723 
1724 
1725 #undef __FUNCT__
1726 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
1727 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1728 {
1729   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1730   PetscErrorCode ierr;
1731 
1732   PetscFunctionBegin;
1733   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1734   PetscFunctionReturn(0);
1735 }
1736