xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision f3fe499b4cc4d64bf04aa4f5e4963dcc4eb56541)
1 #define PETSCMAT_DLL
2 
3 #include "../src/mat/impls/baij/seq/baij.h"
4 #include "../src/mat/blockinvert.h"
5 #include "petscbt.h"
6 #include "petscblaslapack.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
15   const PetscInt *idx;
16   PetscInt       start,end,*ai,*aj,bs,*nidx2;
17   PetscBT        table;
18 
19   PetscFunctionBegin;
20   m     = a->mbs;
21   ai    = a->i;
22   aj    = a->j;
23   bs    = A->rmap->bs;
24 
25   if (ov < 0)  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26 
27   ierr = PetscBTCreate(m,table);CHKERRQ(ierr);
28   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
29   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
30 
31   for (i=0; i<is_max; i++) {
32     /* Initialise the two local arrays */
33     isz  = 0;
34     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
38     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
39 
40     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
41     for (j=0; j<n ; ++j){
42       ival = idx[j]/bs; /* convert the indices into block indices */
43       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
45     }
46     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
47     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
48 
49     k = 0;
50     for (j=0; j<ov; j++){ /* for each overlap*/
51       n = isz;
52       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
53         row   = nidx[k];
54         start = ai[row];
55         end   = ai[row+1];
56         for (l = start; l<end ; l++){
57           val = aj[l];
58           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
59         }
60       }
61     }
62     /* expand the Index Set */
63     for (j=0; j<isz; j++) {
64       for (k=0; k<bs; k++)
65         nidx2[j*bs+k] = nidx[j]*bs+k;
66     }
67     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
68   }
69   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
70   ierr = PetscFree(nidx);CHKERRQ(ierr);
71   ierr = PetscFree(nidx2);CHKERRQ(ierr);
72   PetscFunctionReturn(0);
73 }
74 
75 #undef __FUNCT__
76 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
77 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
78 {
79   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
80   PetscErrorCode ierr;
81   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
82   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
83   const PetscInt *irow,*icol;
84   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
85   PetscInt       *aj = a->j,*ai = a->i;
86   MatScalar      *mat_a;
87   Mat            C;
88   PetscTruth     flag,sorted;
89 
90   PetscFunctionBegin;
91   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
92   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
93 
94   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
95   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
96   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
97   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
98 
99   ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
100   ssmap = smap;
101   ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
102   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
103   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104   /* determine lens of each row */
105   for (i=0; i<nrows; i++) {
106     kstart  = ai[irow[i]];
107     kend    = kstart + a->ilen[irow[i]];
108     lens[i] = 0;
109       for (k=kstart; k<kend; k++) {
110         if (ssmap[aj[k]]) {
111           lens[i]++;
112         }
113       }
114     }
115   /* Create and fill new matrix */
116   if (scall == MAT_REUSE_MATRIX) {
117     c = (Mat_SeqBAIJ *)((*B)->data);
118 
119     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
120     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
121     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
122     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
123     C = *B;
124   } else {
125     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
126     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
127     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
128     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
129   }
130   c = (Mat_SeqBAIJ *)(C->data);
131   for (i=0; i<nrows; i++) {
132     row    = irow[i];
133     kstart = ai[row];
134     kend   = kstart + a->ilen[row];
135     mat_i  = c->i[i];
136     mat_j  = c->j + mat_i;
137     mat_a  = c->a + mat_i*bs2;
138     mat_ilen = c->ilen + i;
139     for (k=kstart; k<kend; k++) {
140       if ((tcol=ssmap[a->j[k]])) {
141         *mat_j++ = tcol - 1;
142         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
143         mat_a   += bs2;
144         (*mat_ilen)++;
145       }
146     }
147   }
148 
149   /* Free work space */
150   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
151   ierr = PetscFree(smap);CHKERRQ(ierr);
152   ierr = PetscFree(lens);CHKERRQ(ierr);
153   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
154   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
155 
156   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
157   *B = C;
158   PetscFunctionReturn(0);
159 }
160 
161 #undef __FUNCT__
162 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
163 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164 {
165   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166   IS             is1,is2;
167   PetscErrorCode ierr;
168   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
169   const PetscInt *irow,*icol;
170 
171   PetscFunctionBegin;
172   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
173   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
174   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
175   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
176 
177   /* Verify if the indices corespond to each element in a block
178    and form the IS with compressed IS */
179   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
180   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
181   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
182   count = 0;
183   for (i=0; i<a->mbs; i++) {
184     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185     if (vary[i]==bs) iary[count++] = i;
186   }
187   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
188 
189   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
190   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
191   count = 0;
192   for (i=0; i<a->mbs; i++) {
193     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
194     if (vary[i]==bs) iary[count++] = i;
195   }
196   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);CHKERRQ(ierr);
197   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
198   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
199   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
200 
201   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
202   ISDestroy(is1);
203   ISDestroy(is2);
204   PetscFunctionReturn(0);
205 }
206 
207 #undef __FUNCT__
208 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
209 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
210 {
211   PetscErrorCode ierr;
212   PetscInt       i;
213 
214   PetscFunctionBegin;
215   if (scall == MAT_INITIAL_MATRIX) {
216     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
217   }
218 
219   for (i=0; i<n; i++) {
220     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
221   }
222   PetscFunctionReturn(0);
223 }
224 
225 
226 /* -------------------------------------------------------*/
227 /* Should check that shapes of vectors and matrices match */
228 /* -------------------------------------------------------*/
229 
230 #undef __FUNCT__
231 #define __FUNCT__ "MatMult_SeqBAIJ_1"
232 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
233 {
234   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
235   PetscScalar       *z,sum;
236   const PetscScalar *x;
237   const MatScalar   *v;
238   PetscErrorCode    ierr;
239   PetscInt          mbs,i,n,nonzerorow=0;
240   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
241   PetscTruth        usecprow=a->compressedrow.use;
242 
243   PetscFunctionBegin;
244   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
245   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
246 
247   if (usecprow){
248     mbs  = a->compressedrow.nrows;
249     ii   = a->compressedrow.i;
250     ridx = a->compressedrow.rindex;
251     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
252   } else {
253     mbs = a->mbs;
254     ii  = a->i;
255   }
256 
257   for (i=0; i<mbs; i++) {
258     n    = ii[1] - ii[0];
259     v    = a->a + ii[0];
260     idx  = a->j + ii[0];
261     ii++;
262     sum  = 0.0;
263     PetscSparseDensePlusDot(sum,x,v,idx,n);
264     if (usecprow){
265       z[ridx[i]] = sum;
266     } else {
267       nonzerorow += (n>0);
268       z[i] = sum;
269     }
270   }
271   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
272   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
273   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
274   PetscFunctionReturn(0);
275 }
276 
277 #undef __FUNCT__
278 #define __FUNCT__ "MatMult_SeqBAIJ_2"
279 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
280 {
281   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
282   PetscScalar       *z = 0,sum1,sum2,*zarray;
283   const PetscScalar *x,*xb;
284   PetscScalar       x1,x2;
285   const MatScalar   *v;
286   PetscErrorCode    ierr;
287   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
288   PetscTruth        usecprow=a->compressedrow.use;
289 
290   PetscFunctionBegin;
291   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
292   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
293 
294   idx = a->j;
295   v   = a->a;
296   if (usecprow){
297     mbs  = a->compressedrow.nrows;
298     ii   = a->compressedrow.i;
299     ridx = a->compressedrow.rindex;
300   } else {
301     mbs = a->mbs;
302     ii  = a->i;
303     z   = zarray;
304   }
305 
306   for (i=0; i<mbs; i++) {
307     n  = ii[1] - ii[0]; ii++;
308     sum1 = 0.0; sum2 = 0.0;
309     nonzerorow += (n>0);
310     for (j=0; j<n; j++) {
311       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
312       sum1 += v[0]*x1 + v[2]*x2;
313       sum2 += v[1]*x1 + v[3]*x2;
314       v += 4;
315     }
316     if (usecprow) z = zarray + 2*ridx[i];
317     z[0] = sum1; z[1] = sum2;
318     if (!usecprow) z += 2;
319   }
320   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
321   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
322   ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr);
323   PetscFunctionReturn(0);
324 }
325 
326 #undef __FUNCT__
327 #define __FUNCT__ "MatMult_SeqBAIJ_3"
328 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
329 {
330   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
331   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
332   const PetscScalar *x,*xb;
333   const MatScalar   *v;
334   PetscErrorCode    ierr;
335   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
336   PetscTruth        usecprow=a->compressedrow.use;
337 
338 
339 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
340 #pragma disjoint(*v,*z,*xb)
341 #endif
342 
343   PetscFunctionBegin;
344   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
345   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
346 
347   idx = a->j;
348   v   = a->a;
349   if (usecprow){
350     mbs  = a->compressedrow.nrows;
351     ii   = a->compressedrow.i;
352     ridx = a->compressedrow.rindex;
353   } else {
354     mbs = a->mbs;
355     ii  = a->i;
356     z   = zarray;
357   }
358 
359   for (i=0; i<mbs; i++) {
360     n  = ii[1] - ii[0]; ii++;
361     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
362     nonzerorow += (n>0);
363     for (j=0; j<n; j++) {
364       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
365       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
366       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
367       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
368       v += 9;
369     }
370     if (usecprow) z = zarray + 3*ridx[i];
371     z[0] = sum1; z[1] = sum2; z[2] = sum3;
372     if (!usecprow) z += 3;
373   }
374   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
375   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
376   ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr);
377   PetscFunctionReturn(0);
378 }
379 
380 #undef __FUNCT__
381 #define __FUNCT__ "MatMult_SeqBAIJ_4"
382 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
383 {
384   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
385   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
386   const PetscScalar *x,*xb;
387   const MatScalar   *v;
388   PetscErrorCode    ierr;
389   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
390   PetscTruth        usecprow=a->compressedrow.use;
391 
392   PetscFunctionBegin;
393   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
394   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
395 
396   idx = a->j;
397   v   = a->a;
398   if (usecprow){
399     mbs  = a->compressedrow.nrows;
400     ii   = a->compressedrow.i;
401     ridx = a->compressedrow.rindex;
402   } else {
403     mbs = a->mbs;
404     ii  = a->i;
405     z   = zarray;
406   }
407 
408   for (i=0; i<mbs; i++) {
409     n  = ii[1] - ii[0]; ii++;
410     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
411     nonzerorow += (n>0);
412     for (j=0; j<n; j++) {
413       xb = x + 4*(*idx++);
414       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
415       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
416       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
417       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
418       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
419       v += 16;
420     }
421     if (usecprow) z = zarray + 4*ridx[i];
422     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
423     if (!usecprow) z += 4;
424   }
425   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
426   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
427   ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 #undef __FUNCT__
432 #define __FUNCT__ "MatMult_SeqBAIJ_5"
433 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
434 {
435   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
436   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
437   const PetscScalar *xb,*x;
438   const MatScalar   *v;
439   PetscErrorCode    ierr;
440   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
441   PetscInt          mbs,i,j,n,nonzerorow=0;
442   PetscTruth        usecprow=a->compressedrow.use;
443 
444   PetscFunctionBegin;
445   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
446   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
447 
448   idx = a->j;
449   v   = a->a;
450   if (usecprow){
451     mbs  = a->compressedrow.nrows;
452     ii   = a->compressedrow.i;
453     ridx = a->compressedrow.rindex;
454   } else {
455     mbs = a->mbs;
456     ii  = a->i;
457     z   = zarray;
458   }
459 
460   for (i=0; i<mbs; i++) {
461     n  = ii[1] - ii[0]; ii++;
462     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
463     nonzerorow += (n>0);
464     for (j=0; j<n; j++) {
465       xb = x + 5*(*idx++);
466       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
467       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
468       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
469       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
470       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
471       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
472       v += 25;
473     }
474     if (usecprow) z = zarray + 5*ridx[i];
475     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
476     if (!usecprow) z += 5;
477   }
478   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
479   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
480   ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr);
481   PetscFunctionReturn(0);
482 }
483 
484 
485 #undef __FUNCT__
486 #define __FUNCT__ "MatMult_SeqBAIJ_6"
487 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
488 {
489   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
490   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
491   const PetscScalar *x,*xb;
492   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
493   const MatScalar   *v;
494   PetscErrorCode    ierr;
495   PetscInt          mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
496   PetscTruth        usecprow=a->compressedrow.use;
497 
498   PetscFunctionBegin;
499   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
500   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
501 
502   idx = a->j;
503   v   = a->a;
504   if (usecprow){
505     mbs  = a->compressedrow.nrows;
506     ii   = a->compressedrow.i;
507     ridx = a->compressedrow.rindex;
508   } else {
509     mbs = a->mbs;
510     ii  = a->i;
511     z   = zarray;
512   }
513 
514   for (i=0; i<mbs; i++) {
515     n  = ii[1] - ii[0]; ii++;
516     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
517     nonzerorow += (n>0);
518     for (j=0; j<n; j++) {
519       xb = x + 6*(*idx++);
520       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
521       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
522       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
523       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
524       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
525       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
526       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
527       v += 36;
528     }
529     if (usecprow) z = zarray + 6*ridx[i];
530     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
531     if (!usecprow) z += 6;
532   }
533 
534   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
535   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
536   ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr);
537   PetscFunctionReturn(0);
538 }
539 
540 #undef __FUNCT__
541 #define __FUNCT__ "MatMult_SeqBAIJ_7"
542 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
543 {
544   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
545   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
546   const PetscScalar *x,*xb;
547   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
548   const MatScalar   *v;
549   PetscErrorCode    ierr;
550   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
551   PetscTruth        usecprow=a->compressedrow.use;
552 
553   PetscFunctionBegin;
554   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
555   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
556 
557   idx = a->j;
558   v   = a->a;
559   if (usecprow){
560     mbs    = a->compressedrow.nrows;
561     ii     = a->compressedrow.i;
562     ridx = a->compressedrow.rindex;
563   } else {
564     mbs = a->mbs;
565     ii  = a->i;
566     z   = zarray;
567   }
568 
569   for (i=0; i<mbs; i++) {
570     n  = ii[1] - ii[0]; ii++;
571     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
572     nonzerorow += (n>0);
573     for (j=0; j<n; j++) {
574       xb = x + 7*(*idx++);
575       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
576       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
577       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
578       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
579       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
580       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
581       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
582       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
583       v += 49;
584     }
585     if (usecprow) z = zarray + 7*ridx[i];
586     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
587     if (!usecprow) z += 7;
588   }
589 
590   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
591   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
592   ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr);
593   PetscFunctionReturn(0);
594 }
595 
596 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
597 /* Default MatMult for block size 15 */
598 
599 #undef __FUNCT__
600 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
601 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
602 {
603   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
604   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
605   const PetscScalar *x,*xb;
606   PetscScalar       *zarray,xv;
607   const MatScalar   *v;
608   PetscErrorCode    ierr;
609   const PetscInt    *ii,*ij=a->j,*idx;
610   PetscInt          mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0;
611   PetscTruth        usecprow=a->compressedrow.use;
612 
613   PetscFunctionBegin;
614   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
615   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
616 
617   v   = a->a;
618   if (usecprow){
619     mbs    = a->compressedrow.nrows;
620     ii     = a->compressedrow.i;
621     ridx = a->compressedrow.rindex;
622   } else {
623     mbs = a->mbs;
624     ii  = a->i;
625     z   = zarray;
626   }
627 
628   for (i=0; i<mbs; i++) {
629     n  = ii[i+1] - ii[i];
630     idx = ij + ii[i];
631     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
632     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
633 
634     nonzerorow += (n>0);
635     for (j=0; j<n; j++) {
636       xb = x + 15*(idx[j]);
637 
638       for(k=0;k<15;k++){
639         xv    =  xb[k];
640 	sum1  += v[0]*xv;
641         sum2  += v[1]*xv;
642 	sum3  += v[2]*xv;
643 	sum4  += v[3]*xv;
644 	sum5  += v[4]*xv;
645         sum6  += v[5]*xv;
646 	sum7  += v[6]*xv;
647 	sum8  += v[7]*xv;
648         sum9  += v[8]*xv;
649         sum10 += v[9]*xv;
650 	sum11 += v[10]*xv;
651 	sum12 += v[11]*xv;
652 	sum13 += v[12]*xv;
653         sum14 += v[13]*xv;
654 	sum15 += v[14]*xv;
655 	v += 15;
656       }
657     }
658     if (usecprow) z = zarray + 15*ridx[i];
659     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
660     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
661 
662     if (!usecprow) z += 15;
663   }
664 
665   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
666   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
667   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
668   PetscFunctionReturn(0);
669 }
670 
671 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
672 #undef __FUNCT__
673 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
674 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
675 {
676   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
677   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
678   const PetscScalar *x,*xb;
679   PetscScalar        x1,x2,x3,x4,*zarray;
680   const MatScalar   *v;
681   PetscErrorCode    ierr;
682   const PetscInt    *ii,*ij=a->j,*idx;
683   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
684   PetscTruth        usecprow=a->compressedrow.use;
685 
686   PetscFunctionBegin;
687   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
688   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
689 
690   v   = a->a;
691   if (usecprow){
692     mbs    = a->compressedrow.nrows;
693     ii     = a->compressedrow.i;
694     ridx = a->compressedrow.rindex;
695   } else {
696     mbs = a->mbs;
697     ii  = a->i;
698     z   = zarray;
699   }
700 
701   for (i=0; i<mbs; i++) {
702     n  = ii[i+1] - ii[i];
703     idx = ij + ii[i];
704     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
705     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
706 
707     nonzerorow += (n>0);
708     for (j=0; j<n; j++) {
709       xb = x + 15*(idx[j]);
710       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
711 
712       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
713       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
714       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
715       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
716       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
717       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
718       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
719       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
720       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
721       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
722       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
723       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
724       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
725       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
726       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
727 
728       v += 60;
729 
730       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
731 
732       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
733       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
734       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
735       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
736       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
737       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
738       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
739       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
740       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
741       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
742       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
743       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
744       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
745       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
746       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
747       v += 60;
748 
749       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
750       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
751       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
752       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
753       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
754       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
755       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
756       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
757       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
758       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
759       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
760       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
761       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
762       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
763       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
764       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
765       v  += 60;
766 
767       x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
768       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
769       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
770       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
771       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
772       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
773       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
774       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
775       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
776       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
777       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
778       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
779       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
780       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
781       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
782       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
783       v += 45;
784     }
785     if (usecprow) z = zarray + 15*ridx[i];
786     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
787     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
788 
789     if (!usecprow) z += 15;
790   }
791 
792   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
793   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
794   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
795   PetscFunctionReturn(0);
796 }
797 
798 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
799 #undef __FUNCT__
800 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
801 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
802 {
803   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
804   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
805   const PetscScalar *x,*xb;
806   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
807   const MatScalar   *v;
808   PetscErrorCode    ierr;
809   const PetscInt    *ii,*ij=a->j,*idx;
810   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
811   PetscTruth        usecprow=a->compressedrow.use;
812 
813   PetscFunctionBegin;
814   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
815   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
816 
817   v   = a->a;
818   if (usecprow){
819     mbs    = a->compressedrow.nrows;
820     ii     = a->compressedrow.i;
821     ridx = a->compressedrow.rindex;
822   } else {
823     mbs = a->mbs;
824     ii  = a->i;
825     z   = zarray;
826   }
827 
828   for (i=0; i<mbs; i++) {
829     n  = ii[i+1] - ii[i];
830     idx = ij + ii[i];
831     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
832     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
833 
834     nonzerorow += (n>0);
835     for (j=0; j<n; j++) {
836       xb = x + 15*(idx[j]);
837       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
838       x8 = xb[7];
839 
840       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
841       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
842       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
843       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
844       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
845       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
846       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
847       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
848       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
849       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
850       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
851       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
852       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
853       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
854       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
855       v += 120;
856 
857       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
858 
859       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
860       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
861       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
862       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
863       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
864       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
865       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
866       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
867       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
868       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
869       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
870       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
871       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
872       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
873       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
874       v += 105;
875     }
876     if (usecprow) z = zarray + 15*ridx[i];
877     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
878     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
879 
880     if (!usecprow) z += 15;
881   }
882 
883   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
884   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
885   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
886   PetscFunctionReturn(0);
887 }
888 
889 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
890 
891 #undef __FUNCT__
892 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
893 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
894 {
895   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
896   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
897   const PetscScalar *x,*xb;
898   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
899   const MatScalar   *v;
900   PetscErrorCode    ierr;
901   const PetscInt    *ii,*ij=a->j,*idx;
902   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
903   PetscTruth        usecprow=a->compressedrow.use;
904 
905   PetscFunctionBegin;
906   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
907   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
908 
909   v   = a->a;
910   if (usecprow){
911     mbs    = a->compressedrow.nrows;
912     ii     = a->compressedrow.i;
913     ridx = a->compressedrow.rindex;
914   } else {
915     mbs = a->mbs;
916     ii  = a->i;
917     z   = zarray;
918   }
919 
920   for (i=0; i<mbs; i++) {
921     n  = ii[i+1] - ii[i];
922     idx = ij + ii[i];
923     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
924     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
925 
926     nonzerorow += (n>0);
927     for (j=0; j<n; j++) {
928       xb = x + 15*(idx[j]);
929       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
930       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
931 
932       sum1 +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
933       sum2 +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
934       sum3 +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
935       sum4 +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
936       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
937       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
938       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
939       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
940       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
941       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
942       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
943       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
944       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
945       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
946       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
947       v += 225;
948     }
949     if (usecprow) z = zarray + 15*ridx[i];
950     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
951     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
952 
953     if (!usecprow) z += 15;
954   }
955 
956   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
957   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
958   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
959   PetscFunctionReturn(0);
960 }
961 
962 
963 /*
964     This will not work with MatScalar == float because it calls the BLAS
965 */
966 #undef __FUNCT__
967 #define __FUNCT__ "MatMult_SeqBAIJ_N"
968 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
969 {
970   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
971   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
972   MatScalar      *v;
973   PetscErrorCode ierr;
974   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
975   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
976   PetscTruth     usecprow=a->compressedrow.use;
977 
978   PetscFunctionBegin;
979   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
980   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
981 
982   idx = a->j;
983   v   = a->a;
984   if (usecprow){
985     mbs  = a->compressedrow.nrows;
986     ii   = a->compressedrow.i;
987     ridx = a->compressedrow.rindex;
988   } else {
989     mbs = a->mbs;
990     ii  = a->i;
991     z   = zarray;
992   }
993 
994   if (!a->mult_work) {
995     k    = PetscMax(A->rmap->n,A->cmap->n);
996     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
997   }
998   work = a->mult_work;
999   for (i=0; i<mbs; i++) {
1000     n     = ii[1] - ii[0]; ii++;
1001     ncols = n*bs;
1002     workt = work;
1003     nonzerorow += (n>0);
1004     for (j=0; j<n; j++) {
1005       xb = x + bs*(*idx++);
1006       for (k=0; k<bs; k++) workt[k] = xb[k];
1007       workt += bs;
1008     }
1009     if (usecprow) z = zarray + bs*ridx[i];
1010     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1011     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1012     v += n*bs2;
1013     if (!usecprow) z += bs;
1014   }
1015   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1016   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1017   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr);
1018   PetscFunctionReturn(0);
1019 }
1020 
1021 extern PetscErrorCode VecCopy_Seq(Vec,Vec);
1022 #undef __FUNCT__
1023 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1024 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1025 {
1026   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1027   const PetscScalar  *x;
1028   PetscScalar        *y,*z,sum;
1029   const MatScalar    *v;
1030   PetscErrorCode     ierr;
1031   PetscInt           mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
1032   const PetscInt     *idx,*ii;
1033   PetscTruth         usecprow=a->compressedrow.use;
1034 
1035   PetscFunctionBegin;
1036   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1037   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1038   if (zz != yy) {
1039     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1040   } else {
1041     z = y;
1042   }
1043 
1044   idx = a->j;
1045   v   = a->a;
1046   if (usecprow){
1047     if (zz != yy){
1048       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1049     }
1050     mbs  = a->compressedrow.nrows;
1051     ii   = a->compressedrow.i;
1052     ridx = a->compressedrow.rindex;
1053   } else {
1054     ii  = a->i;
1055   }
1056 
1057   for (i=0; i<mbs; i++) {
1058     n    = ii[1] - ii[0];
1059     ii++;
1060     if (!usecprow){
1061       nonzerorow += (n>0);
1062       sum = y[i];
1063     } else {
1064       sum = y[ridx[i]];
1065     }
1066     PetscSparseDensePlusDot(sum,x,v,idx,n);
1067     v += n;
1068     idx += n;
1069     if (usecprow){
1070       z[ridx[i]] = sum;
1071     } else {
1072       z[i] = sum;
1073     }
1074   }
1075   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1076   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1077   if (zz != yy) {
1078     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1079   }
1080   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
1081   PetscFunctionReturn(0);
1082 }
1083 
1084 #undef __FUNCT__
1085 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1086 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1087 {
1088   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1089   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1090   PetscScalar    x1,x2,*yarray,*zarray;
1091   MatScalar      *v;
1092   PetscErrorCode ierr;
1093   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1094   PetscTruth     usecprow=a->compressedrow.use;
1095 
1096   PetscFunctionBegin;
1097   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1098   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1099   if (zz != yy) {
1100     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1101   } else {
1102     zarray = yarray;
1103   }
1104 
1105   idx = a->j;
1106   v   = a->a;
1107   if (usecprow){
1108     if (zz != yy){
1109       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1110     }
1111     mbs  = a->compressedrow.nrows;
1112     ii   = a->compressedrow.i;
1113     ridx = a->compressedrow.rindex;
1114     if (zz != yy){
1115       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1116     }
1117   } else {
1118     ii  = a->i;
1119     y   = yarray;
1120     z   = zarray;
1121   }
1122 
1123   for (i=0; i<mbs; i++) {
1124     n  = ii[1] - ii[0]; ii++;
1125     if (usecprow){
1126       z = zarray + 2*ridx[i];
1127       y = yarray + 2*ridx[i];
1128     }
1129     sum1 = y[0]; sum2 = y[1];
1130     for (j=0; j<n; j++) {
1131       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1132       sum1 += v[0]*x1 + v[2]*x2;
1133       sum2 += v[1]*x1 + v[3]*x2;
1134       v += 4;
1135     }
1136     z[0] = sum1; z[1] = sum2;
1137     if (!usecprow){
1138       z += 2; y += 2;
1139     }
1140   }
1141   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1142   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1143   if (zz != yy) {
1144     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1145   }
1146   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1147   PetscFunctionReturn(0);
1148 }
1149 
1150 #undef __FUNCT__
1151 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1152 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1153 {
1154   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1155   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1156   MatScalar      *v;
1157   PetscErrorCode ierr;
1158   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1159   PetscTruth     usecprow=a->compressedrow.use;
1160 
1161   PetscFunctionBegin;
1162   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1163   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1164   if (zz != yy) {
1165     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1166   } else {
1167     zarray = yarray;
1168   }
1169 
1170   idx = a->j;
1171   v   = a->a;
1172   if (usecprow){
1173     if (zz != yy){
1174       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1175     }
1176     mbs  = a->compressedrow.nrows;
1177     ii   = a->compressedrow.i;
1178     ridx = a->compressedrow.rindex;
1179   } else {
1180     ii  = a->i;
1181     y   = yarray;
1182     z   = zarray;
1183   }
1184 
1185   for (i=0; i<mbs; i++) {
1186     n  = ii[1] - ii[0]; ii++;
1187     if (usecprow){
1188       z = zarray + 3*ridx[i];
1189       y = yarray + 3*ridx[i];
1190     }
1191     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1192     for (j=0; j<n; j++) {
1193       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1194       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1195       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1196       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1197       v += 9;
1198     }
1199     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1200     if (!usecprow){
1201       z += 3; y += 3;
1202     }
1203   }
1204   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1205   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1206   if (zz != yy) {
1207     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1208   }
1209   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1210   PetscFunctionReturn(0);
1211 }
1212 
1213 #undef __FUNCT__
1214 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1215 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1216 {
1217   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1218   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1219   MatScalar      *v;
1220   PetscErrorCode ierr;
1221   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1222   PetscTruth     usecprow=a->compressedrow.use;
1223 
1224   PetscFunctionBegin;
1225   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1226   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1227   if (zz != yy) {
1228     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1229   } else {
1230     zarray = yarray;
1231   }
1232 
1233   idx   = a->j;
1234   v     = a->a;
1235   if (usecprow){
1236     if (zz != yy){
1237       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1238     }
1239     mbs  = a->compressedrow.nrows;
1240     ii   = a->compressedrow.i;
1241     ridx = a->compressedrow.rindex;
1242   } else {
1243     ii  = a->i;
1244     y   = yarray;
1245     z   = zarray;
1246   }
1247 
1248   for (i=0; i<mbs; i++) {
1249     n  = ii[1] - ii[0]; ii++;
1250     if (usecprow){
1251       z = zarray + 4*ridx[i];
1252       y = yarray + 4*ridx[i];
1253     }
1254     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1255     for (j=0; j<n; j++) {
1256       xb = x + 4*(*idx++);
1257       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1258       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1259       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1260       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1261       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1262       v += 16;
1263     }
1264     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1265     if (!usecprow){
1266       z += 4; y += 4;
1267     }
1268   }
1269   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1270   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1271   if (zz != yy) {
1272     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1273   }
1274   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1275   PetscFunctionReturn(0);
1276 }
1277 
1278 #undef __FUNCT__
1279 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1280 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1281 {
1282   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1283   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1284   PetscScalar    *yarray,*zarray;
1285   MatScalar      *v;
1286   PetscErrorCode ierr;
1287   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1288   PetscTruth     usecprow=a->compressedrow.use;
1289 
1290   PetscFunctionBegin;
1291   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1292   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1293   if (zz != yy) {
1294     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1295   } else {
1296     zarray = yarray;
1297   }
1298 
1299   idx = a->j;
1300   v   = a->a;
1301   if (usecprow){
1302     if (zz != yy){
1303       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1304     }
1305     mbs  = a->compressedrow.nrows;
1306     ii   = a->compressedrow.i;
1307     ridx = a->compressedrow.rindex;
1308   } else {
1309     ii  = a->i;
1310     y   = yarray;
1311     z   = zarray;
1312   }
1313 
1314   for (i=0; i<mbs; i++) {
1315     n  = ii[1] - ii[0]; ii++;
1316     if (usecprow){
1317       z = zarray + 5*ridx[i];
1318       y = yarray + 5*ridx[i];
1319     }
1320     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1321     for (j=0; j<n; j++) {
1322       xb = x + 5*(*idx++);
1323       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1324       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1325       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1326       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1327       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1328       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1329       v += 25;
1330     }
1331     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1332     if (!usecprow){
1333       z += 5; y += 5;
1334     }
1335   }
1336   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1337   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1338   if (zz != yy) {
1339     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1340   }
1341   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1342   PetscFunctionReturn(0);
1343 }
1344 #undef __FUNCT__
1345 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1346 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1347 {
1348   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1349   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1350   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1351   MatScalar      *v;
1352   PetscErrorCode ierr;
1353   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1354   PetscTruth     usecprow=a->compressedrow.use;
1355 
1356   PetscFunctionBegin;
1357   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1358   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1359   if (zz != yy) {
1360     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1361   } else {
1362     zarray = yarray;
1363   }
1364 
1365   idx = a->j;
1366   v   = a->a;
1367   if (usecprow){
1368     if (zz != yy){
1369       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1370     }
1371     mbs  = a->compressedrow.nrows;
1372     ii   = a->compressedrow.i;
1373     ridx = a->compressedrow.rindex;
1374   } else {
1375     ii  = a->i;
1376     y   = yarray;
1377     z   = zarray;
1378   }
1379 
1380   for (i=0; i<mbs; i++) {
1381     n  = ii[1] - ii[0]; ii++;
1382     if (usecprow){
1383       z = zarray + 6*ridx[i];
1384       y = yarray + 6*ridx[i];
1385     }
1386     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1387     for (j=0; j<n; j++) {
1388       xb = x + 6*(*idx++);
1389       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1390       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1391       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1392       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1393       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1394       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1395       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1396       v += 36;
1397     }
1398     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1399     if (!usecprow){
1400       z += 6; y += 6;
1401     }
1402   }
1403   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1404   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1405   if (zz != yy) {
1406     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1407   }
1408   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1409   PetscFunctionReturn(0);
1410 }
1411 
1412 #undef __FUNCT__
1413 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1414 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1415 {
1416   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1417   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1418   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1419   MatScalar      *v;
1420   PetscErrorCode ierr;
1421   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1422   PetscTruth     usecprow=a->compressedrow.use;
1423 
1424   PetscFunctionBegin;
1425   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1426   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1427   if (zz != yy) {
1428     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1429   } else {
1430     zarray = yarray;
1431   }
1432 
1433   idx = a->j;
1434   v   = a->a;
1435   if (usecprow){
1436     if (zz != yy){
1437       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1438     }
1439     mbs  = a->compressedrow.nrows;
1440     ii   = a->compressedrow.i;
1441     ridx = a->compressedrow.rindex;
1442   } else {
1443     ii  = a->i;
1444     y   = yarray;
1445     z   = zarray;
1446   }
1447 
1448   for (i=0; i<mbs; i++) {
1449     n  = ii[1] - ii[0]; ii++;
1450     if (usecprow){
1451       z = zarray + 7*ridx[i];
1452       y = yarray + 7*ridx[i];
1453     }
1454     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1455     for (j=0; j<n; j++) {
1456       xb = x + 7*(*idx++);
1457       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1458       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1459       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1460       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1461       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1462       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1463       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1464       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1465       v += 49;
1466     }
1467     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1468     if (!usecprow){
1469       z += 7; y += 7;
1470     }
1471   }
1472   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1473   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1474   if (zz != yy) {
1475     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1476   }
1477   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1478   PetscFunctionReturn(0);
1479 }
1480 
1481 #undef __FUNCT__
1482 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1483 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1484 {
1485   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1486   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1487   MatScalar      *v;
1488   PetscErrorCode ierr;
1489   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1490   PetscInt       ncols,k,*ridx=PETSC_NULL;
1491   PetscTruth     usecprow=a->compressedrow.use;
1492 
1493   PetscFunctionBegin;
1494   ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr);
1495   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1496   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1497 
1498   idx = a->j;
1499   v   = a->a;
1500   if (usecprow){
1501     mbs    = a->compressedrow.nrows;
1502     ii     = a->compressedrow.i;
1503     ridx = a->compressedrow.rindex;
1504   } else {
1505     mbs = a->mbs;
1506     ii  = a->i;
1507     z   = zarray;
1508   }
1509 
1510   if (!a->mult_work) {
1511     k    = PetscMax(A->rmap->n,A->cmap->n);
1512     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1513   }
1514   work = a->mult_work;
1515   for (i=0; i<mbs; i++) {
1516     n     = ii[1] - ii[0]; ii++;
1517     ncols = n*bs;
1518     workt = work;
1519     for (j=0; j<n; j++) {
1520       xb = x + bs*(*idx++);
1521       for (k=0; k<bs; k++) workt[k] = xb[k];
1522       workt += bs;
1523     }
1524     if (usecprow) z = zarray + bs*ridx[i];
1525     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1526     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1527     v += n*bs2;
1528     if (!usecprow){
1529       z += bs;
1530     }
1531   }
1532   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1533   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1534   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1535   PetscFunctionReturn(0);
1536 }
1537 
1538 #undef __FUNCT__
1539 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1540 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1541 {
1542   PetscScalar    zero = 0.0;
1543   PetscErrorCode ierr;
1544 
1545   PetscFunctionBegin;
1546   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1547   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1548   PetscFunctionReturn(0);
1549 }
1550 
1551 #undef __FUNCT__
1552 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1553 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1554 {
1555   PetscScalar    zero = 0.0;
1556   PetscErrorCode ierr;
1557 
1558   PetscFunctionBegin;
1559   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1560   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1561   PetscFunctionReturn(0);
1562 }
1563 
1564 #undef __FUNCT__
1565 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1566 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1567 
1568 {
1569   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1570   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1571   MatScalar         *v;
1572   PetscErrorCode    ierr;
1573   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1574   Mat_CompressedRow cprow = a->compressedrow;
1575   PetscTruth        usecprow=cprow.use;
1576 
1577   PetscFunctionBegin;
1578   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1579   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1580   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1581 
1582   idx = a->j;
1583   v   = a->a;
1584   if (usecprow){
1585     mbs  = cprow.nrows;
1586     ii   = cprow.i;
1587     ridx = cprow.rindex;
1588   } else {
1589     mbs=a->mbs;
1590     ii = a->i;
1591     xb = x;
1592   }
1593 
1594   switch (bs) {
1595   case 1:
1596     for (i=0; i<mbs; i++) {
1597       if (usecprow) xb = x + ridx[i];
1598       x1 = xb[0];
1599       ib = idx + ii[0];
1600       n  = ii[1] - ii[0]; ii++;
1601       for (j=0; j<n; j++) {
1602         rval    = ib[j];
1603         z[rval] += PetscConj(*v) * x1;
1604         v++;
1605       }
1606       if (!usecprow) xb++;
1607     }
1608     break;
1609   case 2:
1610     for (i=0; i<mbs; i++) {
1611       if (usecprow) xb = x + 2*ridx[i];
1612       x1 = xb[0]; x2 = xb[1];
1613       ib = idx + ii[0];
1614       n  = ii[1] - ii[0]; ii++;
1615       for (j=0; j<n; j++) {
1616         rval      = ib[j]*2;
1617         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1618         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1619         v  += 4;
1620       }
1621       if (!usecprow) xb += 2;
1622     }
1623     break;
1624   case 3:
1625     for (i=0; i<mbs; i++) {
1626       if (usecprow) xb = x + 3*ridx[i];
1627       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1628       ib = idx + ii[0];
1629       n  = ii[1] - ii[0]; ii++;
1630       for (j=0; j<n; j++) {
1631         rval      = ib[j]*3;
1632         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1633         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1634         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1635         v  += 9;
1636       }
1637       if (!usecprow) xb += 3;
1638     }
1639     break;
1640   case 4:
1641     for (i=0; i<mbs; i++) {
1642       if (usecprow) xb = x + 4*ridx[i];
1643       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1644       ib = idx + ii[0];
1645       n  = ii[1] - ii[0]; ii++;
1646       for (j=0; j<n; j++) {
1647         rval      = ib[j]*4;
1648         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1649         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1650         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1651         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1652         v  += 16;
1653       }
1654       if (!usecprow) xb += 4;
1655     }
1656     break;
1657   case 5:
1658     for (i=0; i<mbs; i++) {
1659       if (usecprow) xb = x + 5*ridx[i];
1660       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1661       x4 = xb[3]; x5 = xb[4];
1662       ib = idx + ii[0];
1663       n  = ii[1] - ii[0]; ii++;
1664       for (j=0; j<n; j++) {
1665         rval      = ib[j]*5;
1666         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1667         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1668         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1669         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1670         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1671         v  += 25;
1672       }
1673       if (!usecprow) xb += 5;
1674     }
1675     break;
1676   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1677       PetscInt     ncols,k;
1678       PetscScalar  *work,*workt,*xtmp;
1679 
1680       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1681       if (!a->mult_work) {
1682         k = PetscMax(A->rmap->n,A->cmap->n);
1683         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1684       }
1685       work = a->mult_work;
1686       xtmp = x;
1687       for (i=0; i<mbs; i++) {
1688         n     = ii[1] - ii[0]; ii++;
1689         ncols = n*bs;
1690         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1691         if (usecprow) {
1692           xtmp = x + bs*ridx[i];
1693         }
1694         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1695         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1696         v += n*bs2;
1697         if (!usecprow) xtmp += bs;
1698         workt = work;
1699         for (j=0; j<n; j++) {
1700           zb = z + bs*(*idx++);
1701           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1702           workt += bs;
1703         }
1704       }
1705     }
1706   }
1707   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1708   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1709   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1710   PetscFunctionReturn(0);
1711 }
1712 
1713 #undef __FUNCT__
1714 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1715 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1716 {
1717   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1718   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1719   MatScalar         *v;
1720   PetscErrorCode    ierr;
1721   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1722   Mat_CompressedRow cprow = a->compressedrow;
1723   PetscTruth        usecprow=cprow.use;
1724 
1725   PetscFunctionBegin;
1726   if (yy != zz) { ierr = VecCopy_Seq(yy,zz);CHKERRQ(ierr); }
1727   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1728   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1729 
1730   idx = a->j;
1731   v   = a->a;
1732   if (usecprow){
1733     mbs  = cprow.nrows;
1734     ii   = cprow.i;
1735     ridx = cprow.rindex;
1736   } else {
1737     mbs=a->mbs;
1738     ii = a->i;
1739     xb = x;
1740   }
1741 
1742   switch (bs) {
1743   case 1:
1744     for (i=0; i<mbs; i++) {
1745       if (usecprow) xb = x + ridx[i];
1746       x1 = xb[0];
1747       ib = idx + ii[0];
1748       n  = ii[1] - ii[0]; ii++;
1749       for (j=0; j<n; j++) {
1750         rval    = ib[j];
1751         z[rval] += *v * x1;
1752         v++;
1753       }
1754       if (!usecprow) xb++;
1755     }
1756     break;
1757   case 2:
1758     for (i=0; i<mbs; i++) {
1759       if (usecprow) xb = x + 2*ridx[i];
1760       x1 = xb[0]; x2 = xb[1];
1761       ib = idx + ii[0];
1762       n  = ii[1] - ii[0]; ii++;
1763       for (j=0; j<n; j++) {
1764         rval      = ib[j]*2;
1765         z[rval++] += v[0]*x1 + v[1]*x2;
1766         z[rval++] += v[2]*x1 + v[3]*x2;
1767         v  += 4;
1768       }
1769       if (!usecprow) xb += 2;
1770     }
1771     break;
1772   case 3:
1773     for (i=0; i<mbs; i++) {
1774       if (usecprow) xb = x + 3*ridx[i];
1775       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1776       ib = idx + ii[0];
1777       n  = ii[1] - ii[0]; ii++;
1778       for (j=0; j<n; j++) {
1779         rval      = ib[j]*3;
1780         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1781         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1782         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1783         v  += 9;
1784       }
1785       if (!usecprow) xb += 3;
1786     }
1787     break;
1788   case 4:
1789     for (i=0; i<mbs; i++) {
1790       if (usecprow) xb = x + 4*ridx[i];
1791       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1792       ib = idx + ii[0];
1793       n  = ii[1] - ii[0]; ii++;
1794       for (j=0; j<n; j++) {
1795         rval      = ib[j]*4;
1796         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1797         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1798         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1799         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1800         v  += 16;
1801       }
1802       if (!usecprow) xb += 4;
1803     }
1804     break;
1805   case 5:
1806     for (i=0; i<mbs; i++) {
1807       if (usecprow) xb = x + 5*ridx[i];
1808       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1809       x4 = xb[3]; x5 = xb[4];
1810       ib = idx + ii[0];
1811       n  = ii[1] - ii[0]; ii++;
1812       for (j=0; j<n; j++) {
1813         rval      = ib[j]*5;
1814         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1815         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1816         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1817         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1818         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1819         v  += 25;
1820       }
1821       if (!usecprow) xb += 5;
1822     }
1823     break;
1824   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1825       PetscInt     ncols,k;
1826       PetscScalar  *work,*workt,*xtmp;
1827 
1828       if (!a->mult_work) {
1829         k = PetscMax(A->rmap->n,A->cmap->n);
1830         ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1831       }
1832       work = a->mult_work;
1833       xtmp = x;
1834       for (i=0; i<mbs; i++) {
1835         n     = ii[1] - ii[0]; ii++;
1836         ncols = n*bs;
1837         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1838         if (usecprow) {
1839           xtmp = x + bs*ridx[i];
1840         }
1841         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1842         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1843         v += n*bs2;
1844         if (!usecprow) xtmp += bs;
1845         workt = work;
1846         for (j=0; j<n; j++) {
1847           zb = z + bs*(*idx++);
1848           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1849           workt += bs;
1850         }
1851       }
1852     }
1853   }
1854   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1855   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1856   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1857   PetscFunctionReturn(0);
1858 }
1859 
1860 #undef __FUNCT__
1861 #define __FUNCT__ "MatScale_SeqBAIJ"
1862 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1863 {
1864   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)inA->data;
1865   PetscInt       totalnz = a->bs2*a->nz;
1866   PetscScalar    oalpha = alpha;
1867   PetscErrorCode ierr;
1868   PetscBLASInt   one = 1,tnz = PetscBLASIntCast(totalnz);
1869 
1870   PetscFunctionBegin;
1871   BLASscal_(&tnz,&oalpha,a->a,&one);
1872   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1873   PetscFunctionReturn(0);
1874 }
1875 
1876 #undef __FUNCT__
1877 #define __FUNCT__ "MatNorm_SeqBAIJ"
1878 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1879 {
1880   PetscErrorCode ierr;
1881   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1882   MatScalar      *v = a->a;
1883   PetscReal      sum = 0.0;
1884   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1885 
1886   PetscFunctionBegin;
1887   if (type == NORM_FROBENIUS) {
1888     for (i=0; i< bs2*nz; i++) {
1889 #if defined(PETSC_USE_COMPLEX)
1890       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1891 #else
1892       sum += (*v)*(*v); v++;
1893 #endif
1894     }
1895     *norm = sqrt(sum);
1896   } else if (type == NORM_1) { /* maximum column sum */
1897     PetscReal *tmp;
1898     PetscInt  *bcol = a->j;
1899     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
1900     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
1901     for (i=0; i<nz; i++){
1902       for (j=0; j<bs; j++){
1903         k1 = bs*(*bcol) + j; /* column index */
1904         for (k=0; k<bs; k++){
1905           tmp[k1] += PetscAbsScalar(*v); v++;
1906         }
1907       }
1908       bcol++;
1909     }
1910     *norm = 0.0;
1911     for (j=0; j<A->cmap->n; j++) {
1912       if (tmp[j] > *norm) *norm = tmp[j];
1913     }
1914     ierr = PetscFree(tmp);CHKERRQ(ierr);
1915   } else if (type == NORM_INFINITY) { /* maximum row sum */
1916     *norm = 0.0;
1917     for (k=0; k<bs; k++) {
1918       for (j=0; j<a->mbs; j++) {
1919         v = a->a + bs2*a->i[j] + k;
1920         sum = 0.0;
1921         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1922           for (k1=0; k1<bs; k1++){
1923             sum += PetscAbsScalar(*v);
1924             v   += bs;
1925           }
1926         }
1927         if (sum > *norm) *norm = sum;
1928       }
1929     }
1930   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1931   PetscFunctionReturn(0);
1932 }
1933 
1934 
1935 #undef __FUNCT__
1936 #define __FUNCT__ "MatEqual_SeqBAIJ"
1937 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1938 {
1939   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1940   PetscErrorCode ierr;
1941 
1942   PetscFunctionBegin;
1943   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1944   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1945     *flg = PETSC_FALSE;
1946     PetscFunctionReturn(0);
1947   }
1948 
1949   /* if the a->i are the same */
1950   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1951   if (!*flg) {
1952     PetscFunctionReturn(0);
1953   }
1954 
1955   /* if a->j are the same */
1956   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1957   if (!*flg) {
1958     PetscFunctionReturn(0);
1959   }
1960   /* if a->a are the same */
1961   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1962   PetscFunctionReturn(0);
1963 
1964 }
1965 
1966 #undef __FUNCT__
1967 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1968 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1969 {
1970   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1971   PetscErrorCode ierr;
1972   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1973   PetscScalar    *x,zero = 0.0;
1974   MatScalar      *aa,*aa_j;
1975 
1976   PetscFunctionBegin;
1977   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1978   bs   = A->rmap->bs;
1979   aa   = a->a;
1980   ai   = a->i;
1981   aj   = a->j;
1982   ambs = a->mbs;
1983   bs2  = a->bs2;
1984 
1985   ierr = VecSet(v,zero);CHKERRQ(ierr);
1986   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1987   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1988   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1989   for (i=0; i<ambs; i++) {
1990     for (j=ai[i]; j<ai[i+1]; j++) {
1991       if (aj[j] == i) {
1992         row  = i*bs;
1993         aa_j = aa+j*bs2;
1994         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1995         break;
1996       }
1997     }
1998   }
1999   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2000   PetscFunctionReturn(0);
2001 }
2002 
2003 #undef __FUNCT__
2004 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2005 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2006 {
2007   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2008   const PetscScalar *l,*r,*li,*ri;
2009   PetscScalar       x;
2010   MatScalar         *aa, *v;
2011   PetscErrorCode    ierr;
2012   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2013   const PetscInt    *ai,*aj;
2014 
2015   PetscFunctionBegin;
2016   ai  = a->i;
2017   aj  = a->j;
2018   aa  = a->a;
2019   m   = A->rmap->n;
2020   n   = A->cmap->n;
2021   bs  = A->rmap->bs;
2022   mbs = a->mbs;
2023   bs2 = a->bs2;
2024   if (ll) {
2025     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2026     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2027     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2028     for (i=0; i<mbs; i++) { /* for each block row */
2029       M  = ai[i+1] - ai[i];
2030       li = l + i*bs;
2031       v  = aa + bs2*ai[i];
2032       for (j=0; j<M; j++) { /* for each block */
2033         for (k=0; k<bs2; k++) {
2034           (*v++) *= li[k%bs];
2035         }
2036       }
2037     }
2038     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2039     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2040   }
2041 
2042   if (rr) {
2043     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2044     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2045     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2046     for (i=0; i<mbs; i++) { /* for each block row */
2047       iai = ai[i];
2048       M   = ai[i+1] - iai;
2049       v   = aa + bs2*iai;
2050       for (j=0; j<M; j++) { /* for each block */
2051         ri = r + bs*aj[iai+j];
2052         for (k=0; k<bs; k++) {
2053           x = ri[k];
2054           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2055           v += bs;
2056         }
2057       }
2058     }
2059     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2060     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2061   }
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 
2066 #undef __FUNCT__
2067 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2068 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2069 {
2070   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2071 
2072   PetscFunctionBegin;
2073   info->block_size     = a->bs2;
2074   info->nz_allocated   = a->bs2*a->maxnz;
2075   info->nz_used        = a->bs2*a->nz;
2076   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
2077   info->assemblies   = A->num_ass;
2078   info->mallocs      = A->info.mallocs;
2079   info->memory       = ((PetscObject)A)->mem;
2080   if (A->factortype) {
2081     info->fill_ratio_given  = A->info.fill_ratio_given;
2082     info->fill_ratio_needed = A->info.fill_ratio_needed;
2083     info->factor_mallocs    = A->info.factor_mallocs;
2084   } else {
2085     info->fill_ratio_given  = 0;
2086     info->fill_ratio_needed = 0;
2087     info->factor_mallocs    = 0;
2088   }
2089   PetscFunctionReturn(0);
2090 }
2091 
2092 
2093 #undef __FUNCT__
2094 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2095 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2096 {
2097   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2098   PetscErrorCode ierr;
2099 
2100   PetscFunctionBegin;
2101   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2102   PetscFunctionReturn(0);
2103 }
2104