xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 047240e14af00aad1ef65e96f6fface8924f7f7e)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
14   const PetscInt *idx;
15   PetscInt       start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table;
17 
18   PetscFunctionBegin;
19   m  = a->mbs;
20   ai = a->i;
21   aj = a->j;
22   bs = A->rmap->bs;
23 
24   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
27   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
28   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) {
31     /* Initialise the two local arrays */
32     isz  = 0;
33     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
37     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j=0; j<n; ++j) {
41       ival = idx[j]/bs; /* convert the indices into block indices */
42       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
44     }
45     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
46     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
47 
48     k = 0;
49     for (j=0; j<ov; j++) { /* for each overlap*/
50       n = isz;
51       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row+1];
55         for (l = start; l<end; l++) {
56           val = aj[l];
57           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
58         }
59       }
60     }
61     /* expand the Index Set */
62     for (j=0; j<isz; j++) {
63       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
64     }
65     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
66   }
67   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
68   ierr = PetscFree(nidx);CHKERRQ(ierr);
69   ierr = PetscFree(nidx2);CHKERRQ(ierr);
70   PetscFunctionReturn(0);
71 }
72 
73 #undef __FUNCT__
74 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
75 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76 {
77   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
78   PetscErrorCode ierr;
79   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
81   const PetscInt *irow,*icol;
82   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83   PetscInt       *aj = a->j,*ai = a->i;
84   MatScalar      *mat_a;
85   Mat            C;
86   PetscBool      flag,sorted;
87 
88   PetscFunctionBegin;
89   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
90   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
91 
92   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
93   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
94   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
95   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
96 
97   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
98   ssmap = smap;
99   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
100   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
101   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102   /* determine lens of each row */
103   for (i=0; i<nrows; i++) {
104     kstart  = ai[irow[i]];
105     kend    = kstart + a->ilen[irow[i]];
106     lens[i] = 0;
107     for (k=kstart; k<kend; k++) {
108       if (ssmap[aj[k]]) lens[i]++;
109     }
110   }
111   /* Create and fill new matrix */
112   if (scall == MAT_REUSE_MATRIX) {
113     c = (Mat_SeqBAIJ*)((*B)->data);
114 
115     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
116     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
117     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
118     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
119     C    = *B;
120   } else {
121     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
122     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
123     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
124     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
125   }
126   c = (Mat_SeqBAIJ*)(C->data);
127   for (i=0; i<nrows; i++) {
128     row      = irow[i];
129     kstart   = ai[row];
130     kend     = kstart + a->ilen[row];
131     mat_i    = c->i[i];
132     mat_j    = c->j + mat_i;
133     mat_a    = c->a + mat_i*bs2;
134     mat_ilen = c->ilen + i;
135     for (k=kstart; k<kend; k++) {
136       if ((tcol=ssmap[a->j[k]])) {
137         *mat_j++ = tcol - 1;
138         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
139         mat_a   += bs2;
140         (*mat_ilen)++;
141       }
142     }
143   }
144 
145   /* Free work space */
146   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
147   ierr = PetscFree(smap);CHKERRQ(ierr);
148   ierr = PetscFree(lens);CHKERRQ(ierr);
149   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
150   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
151 
152   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
153   *B   = C;
154   PetscFunctionReturn(0);
155 }
156 
157 #undef __FUNCT__
158 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
159 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
160 {
161   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
162   IS             is1,is2;
163   PetscErrorCode ierr;
164   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
165   const PetscInt *irow,*icol;
166 
167   PetscFunctionBegin;
168   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
169   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
170   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
171   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
172 
173   /* Verify if the indices corespond to each element in a block
174    and form the IS with compressed IS */
175   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
176   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
177   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
178   count = 0;
179   for (i=0; i<a->mbs; i++) {
180     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
181     if (vary[i]==bs) iary[count++] = i;
182   }
183   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
184 
185   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
186   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
187   count = 0;
188   for (i=0; i<a->mbs; i++) {
189     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
190     if (vary[i]==bs) iary[count++] = i;
191   }
192   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
193   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
194   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
195   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
196 
197   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
198   ierr = ISDestroy(&is1);CHKERRQ(ierr);
199   ierr = ISDestroy(&is2);CHKERRQ(ierr);
200   PetscFunctionReturn(0);
201 }
202 
203 #undef __FUNCT__
204 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
205 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
206 {
207   PetscErrorCode ierr;
208   PetscInt       i;
209 
210   PetscFunctionBegin;
211   if (scall == MAT_INITIAL_MATRIX) {
212     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
213   }
214 
215   for (i=0; i<n; i++) {
216     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
217   }
218   PetscFunctionReturn(0);
219 }
220 
221 
222 /* -------------------------------------------------------*/
223 /* Should check that shapes of vectors and matrices match */
224 /* -------------------------------------------------------*/
225 
226 #undef __FUNCT__
227 #define __FUNCT__ "MatMult_SeqBAIJ_1"
228 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
229 {
230   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
231   PetscScalar       *z,sum;
232   const PetscScalar *x;
233   const MatScalar   *v;
234   PetscErrorCode    ierr;
235   PetscInt          mbs,i,n,nonzerorow=0;
236   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
237   PetscBool         usecprow=a->compressedrow.use;
238 
239   PetscFunctionBegin;
240   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
241   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
242 
243   if (usecprow) {
244     mbs  = a->compressedrow.nrows;
245     ii   = a->compressedrow.i;
246     ridx = a->compressedrow.rindex;
247     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
248   } else {
249     mbs = a->mbs;
250     ii  = a->i;
251   }
252 
253   for (i=0; i<mbs; i++) {
254     n   = ii[1] - ii[0];
255     v   = a->a + ii[0];
256     idx = a->j + ii[0];
257     ii++;
258     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
259     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
260     sum = 0.0;
261     PetscSparseDensePlusDot(sum,x,v,idx,n);
262     if (usecprow) {
263       z[ridx[i]] = sum;
264     } else {
265       nonzerorow += (n>0);
266       z[i]        = sum;
267     }
268   }
269   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
270   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
271   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
272   PetscFunctionReturn(0);
273 }
274 
275 #undef __FUNCT__
276 #define __FUNCT__ "MatMult_SeqBAIJ_2"
277 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
278 {
279   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
280   PetscScalar       *z = 0,sum1,sum2,*zarray;
281   const PetscScalar *x,*xb;
282   PetscScalar       x1,x2;
283   const MatScalar   *v;
284   PetscErrorCode    ierr;
285   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
286   PetscBool         usecprow=a->compressedrow.use;
287 
288   PetscFunctionBegin;
289   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
290   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
291 
292   idx = a->j;
293   v   = a->a;
294   if (usecprow) {
295     mbs  = a->compressedrow.nrows;
296     ii   = a->compressedrow.i;
297     ridx = a->compressedrow.rindex;
298   } else {
299     mbs = a->mbs;
300     ii  = a->i;
301     z   = zarray;
302   }
303 
304   for (i=0; i<mbs; i++) {
305     n           = ii[1] - ii[0]; ii++;
306     sum1        = 0.0; sum2 = 0.0;
307     nonzerorow += (n>0);
308     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
309     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
310     for (j=0; j<n; j++) {
311       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
312       sum1 += v[0]*x1 + v[2]*x2;
313       sum2 += v[1]*x1 + v[3]*x2;
314       v    += 4;
315     }
316     if (usecprow) z = zarray + 2*ridx[i];
317     z[0] = sum1; z[1] = sum2;
318     if (!usecprow) z += 2;
319   }
320   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
321   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
322   ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);CHKERRQ(ierr);
323   PetscFunctionReturn(0);
324 }
325 
326 #undef __FUNCT__
327 #define __FUNCT__ "MatMult_SeqBAIJ_3"
328 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
329 {
330   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
331   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
332   const PetscScalar *x,*xb;
333   const MatScalar   *v;
334   PetscErrorCode    ierr;
335   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
336   PetscBool         usecprow=a->compressedrow.use;
337 
338 
339 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
340 #pragma disjoint(*v,*z,*xb)
341 #endif
342 
343   PetscFunctionBegin;
344   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
345   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
346 
347   idx = a->j;
348   v   = a->a;
349   if (usecprow) {
350     mbs  = a->compressedrow.nrows;
351     ii   = a->compressedrow.i;
352     ridx = a->compressedrow.rindex;
353   } else {
354     mbs = a->mbs;
355     ii  = a->i;
356     z   = zarray;
357   }
358 
359   for (i=0; i<mbs; i++) {
360     n           = ii[1] - ii[0]; ii++;
361     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
362     nonzerorow += (n>0);
363     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
364     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
365     for (j=0; j<n; j++) {
366       xb = x + 3*(*idx++);
367       x1 = xb[0];
368       x2 = xb[1];
369       x3 = xb[2];
370 
371       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
372       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
373       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
374       v    += 9;
375     }
376     if (usecprow) z = zarray + 3*ridx[i];
377     z[0] = sum1; z[1] = sum2; z[2] = sum3;
378     if (!usecprow) z += 3;
379   }
380   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
381   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
382   ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatMult_SeqBAIJ_4"
388 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
389 {
390   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
391   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
392   const PetscScalar *x,*xb;
393   const MatScalar   *v;
394   PetscErrorCode    ierr;
395   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
396   PetscBool         usecprow=a->compressedrow.use;
397 
398   PetscFunctionBegin;
399   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
400   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
401 
402   idx = a->j;
403   v   = a->a;
404   if (usecprow) {
405     mbs  = a->compressedrow.nrows;
406     ii   = a->compressedrow.i;
407     ridx = a->compressedrow.rindex;
408   } else {
409     mbs = a->mbs;
410     ii  = a->i;
411     z   = zarray;
412   }
413 
414   for (i=0; i<mbs; i++) {
415     n = ii[1] - ii[0];
416     ii++;
417     sum1 = 0.0;
418     sum2 = 0.0;
419     sum3 = 0.0;
420     sum4 = 0.0;
421 
422     nonzerorow += (n>0);
423     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
424     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
425     for (j=0; j<n; j++) {
426       xb    = x + 4*(*idx++);
427       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
428       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
429       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
430       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
431       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
432       v    += 16;
433     }
434     if (usecprow) z = zarray + 4*ridx[i];
435     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
436     if (!usecprow) z += 4;
437   }
438   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
439   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
440   ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr);
441   PetscFunctionReturn(0);
442 }
443 
444 #undef __FUNCT__
445 #define __FUNCT__ "MatMult_SeqBAIJ_5"
446 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
447 {
448   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
449   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
450   const PetscScalar *xb,*x;
451   const MatScalar   *v;
452   PetscErrorCode    ierr;
453   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
454   PetscInt          mbs,i,j,n,nonzerorow=0;
455   PetscBool         usecprow=a->compressedrow.use;
456 
457   PetscFunctionBegin;
458   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
459   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
460 
461   idx = a->j;
462   v   = a->a;
463   if (usecprow) {
464     mbs  = a->compressedrow.nrows;
465     ii   = a->compressedrow.i;
466     ridx = a->compressedrow.rindex;
467   } else {
468     mbs = a->mbs;
469     ii  = a->i;
470     z   = zarray;
471   }
472 
473   for (i=0; i<mbs; i++) {
474     n           = ii[1] - ii[0]; ii++;
475     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
476     nonzerorow += (n>0);
477     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
478     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
479     for (j=0; j<n; j++) {
480       xb    = x + 5*(*idx++);
481       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
482       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
483       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
484       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
485       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
486       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
487       v    += 25;
488     }
489     if (usecprow) z = zarray + 5*ridx[i];
490     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
491     if (!usecprow) z += 5;
492   }
493   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
494   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
495   ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);CHKERRQ(ierr);
496   PetscFunctionReturn(0);
497 }
498 
499 
500 #undef __FUNCT__
501 #define __FUNCT__ "MatMult_SeqBAIJ_6"
502 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
503 {
504   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
505   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
506   const PetscScalar *x,*xb;
507   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
508   const MatScalar   *v;
509   PetscErrorCode    ierr;
510   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
511   PetscBool         usecprow=a->compressedrow.use;
512 
513   PetscFunctionBegin;
514   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
515   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
516 
517   idx = a->j;
518   v   = a->a;
519   if (usecprow) {
520     mbs  = a->compressedrow.nrows;
521     ii   = a->compressedrow.i;
522     ridx = a->compressedrow.rindex;
523   } else {
524     mbs = a->mbs;
525     ii  = a->i;
526     z   = zarray;
527   }
528 
529   for (i=0; i<mbs; i++) {
530     n  = ii[1] - ii[0];
531     ii++;
532     sum1 = 0.0;
533     sum2 = 0.0;
534     sum3 = 0.0;
535     sum4 = 0.0;
536     sum5 = 0.0;
537     sum6 = 0.0;
538 
539     nonzerorow += (n>0);
540     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
541     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
542     for (j=0; j<n; j++) {
543       xb    = x + 6*(*idx++);
544       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
545       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
546       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
547       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
548       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
549       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
550       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
551       v    += 36;
552     }
553     if (usecprow) z = zarray + 6*ridx[i];
554     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
555     if (!usecprow) z += 6;
556   }
557 
558   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
559   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
560   ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);CHKERRQ(ierr);
561   PetscFunctionReturn(0);
562 }
563 
564 #undef __FUNCT__
565 #define __FUNCT__ "MatMult_SeqBAIJ_7"
566 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
567 {
568   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
569   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
570   const PetscScalar *x,*xb;
571   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
572   const MatScalar   *v;
573   PetscErrorCode    ierr;
574   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
575   PetscBool         usecprow=a->compressedrow.use;
576 
577   PetscFunctionBegin;
578   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
579   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
580 
581   idx = a->j;
582   v   = a->a;
583   if (usecprow) {
584     mbs  = a->compressedrow.nrows;
585     ii   = a->compressedrow.i;
586     ridx = a->compressedrow.rindex;
587   } else {
588     mbs = a->mbs;
589     ii  = a->i;
590     z   = zarray;
591   }
592 
593   for (i=0; i<mbs; i++) {
594     n  = ii[1] - ii[0];
595     ii++;
596     sum1 = 0.0;
597     sum2 = 0.0;
598     sum3 = 0.0;
599     sum4 = 0.0;
600     sum5 = 0.0;
601     sum6 = 0.0;
602     sum7 = 0.0;
603 
604     nonzerorow += (n>0);
605     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
606     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
607     for (j=0; j<n; j++) {
608       xb    = x + 7*(*idx++);
609       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
610       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
611       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
612       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
613       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
614       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
615       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
616       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
617       v    += 49;
618     }
619     if (usecprow) z = zarray + 7*ridx[i];
620     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
621     if (!usecprow) z += 7;
622   }
623 
624   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
625   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
626   ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);CHKERRQ(ierr);
627   PetscFunctionReturn(0);
628 }
629 
630 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
631 /* Default MatMult for block size 15 */
632 
633 #undef __FUNCT__
634 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
635 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
636 {
637   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
638   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
639   const PetscScalar *x,*xb;
640   PetscScalar       *zarray,xv;
641   const MatScalar   *v;
642   PetscErrorCode    ierr;
643   const PetscInt    *ii,*ij=a->j,*idx;
644   PetscInt          mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0;
645   PetscBool         usecprow=a->compressedrow.use;
646 
647   PetscFunctionBegin;
648   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
649   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
650 
651   v = a->a;
652   if (usecprow) {
653     mbs  = a->compressedrow.nrows;
654     ii   = a->compressedrow.i;
655     ridx = a->compressedrow.rindex;
656   } else {
657     mbs = a->mbs;
658     ii  = a->i;
659     z   = zarray;
660   }
661 
662   for (i=0; i<mbs; i++) {
663     n    = ii[i+1] - ii[i];
664     idx  = ij + ii[i];
665     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
666     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
667 
668     nonzerorow += (n>0);
669     for (j=0; j<n; j++) {
670       xb = x + 15*(idx[j]);
671 
672       for (k=0; k<15; k++) {
673         xv     =  xb[k];
674         sum1  += v[0]*xv;
675         sum2  += v[1]*xv;
676         sum3  += v[2]*xv;
677         sum4  += v[3]*xv;
678         sum5  += v[4]*xv;
679         sum6  += v[5]*xv;
680         sum7  += v[6]*xv;
681         sum8  += v[7]*xv;
682         sum9  += v[8]*xv;
683         sum10 += v[9]*xv;
684         sum11 += v[10]*xv;
685         sum12 += v[11]*xv;
686         sum13 += v[12]*xv;
687         sum14 += v[13]*xv;
688         sum15 += v[14]*xv;
689         v     += 15;
690       }
691     }
692     if (usecprow) z = zarray + 15*ridx[i];
693     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
694     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
695 
696     if (!usecprow) z += 15;
697   }
698 
699   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
700   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
701   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
702   PetscFunctionReturn(0);
703 }
704 
705 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
706 #undef __FUNCT__
707 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
708 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
709 {
710   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
711   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
712   const PetscScalar *x,*xb;
713   PetscScalar       x1,x2,x3,x4,*zarray;
714   const MatScalar   *v;
715   PetscErrorCode    ierr;
716   const PetscInt    *ii,*ij=a->j,*idx;
717   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
718   PetscBool         usecprow=a->compressedrow.use;
719 
720   PetscFunctionBegin;
721   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
722   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
723 
724   v = a->a;
725   if (usecprow) {
726     mbs  = a->compressedrow.nrows;
727     ii   = a->compressedrow.i;
728     ridx = a->compressedrow.rindex;
729   } else {
730     mbs = a->mbs;
731     ii  = a->i;
732     z   = zarray;
733   }
734 
735   for (i=0; i<mbs; i++) {
736     n    = ii[i+1] - ii[i];
737     idx  = ij + ii[i];
738     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
739     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
740 
741     nonzerorow += (n>0);
742     for (j=0; j<n; j++) {
743       xb = x + 15*(idx[j]);
744       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
745 
746       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
747       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
748       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
749       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
750       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
751       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
752       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
753       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
754       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
755       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
756       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
757       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
758       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
759       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
760       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
761 
762       v += 60;
763 
764       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
765 
766       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
767       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
768       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
769       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
770       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
771       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
772       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
773       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
774       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
775       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
776       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
777       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
778       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
779       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
780       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
781       v     += 60;
782 
783       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
784       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
785       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
786       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
787       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
788       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
789       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
790       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
791       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
792       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
793       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
794       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
795       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
796       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
797       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
798       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
799       v     += 60;
800 
801       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
802       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
803       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
804       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
805       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
806       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
807       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
808       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
809       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
810       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
811       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
812       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
813       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
814       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
815       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
816       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
817       v     += 45;
818     }
819     if (usecprow) z = zarray + 15*ridx[i];
820     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
821     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
822 
823     if (!usecprow) z += 15;
824   }
825 
826   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
827   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
828   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
829   PetscFunctionReturn(0);
830 }
831 
832 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
833 #undef __FUNCT__
834 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
835 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
836 {
837   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
838   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
839   const PetscScalar *x,*xb;
840   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
841   const MatScalar   *v;
842   PetscErrorCode    ierr;
843   const PetscInt    *ii,*ij=a->j,*idx;
844   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
845   PetscBool         usecprow=a->compressedrow.use;
846 
847   PetscFunctionBegin;
848   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
849   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
850 
851   v = a->a;
852   if (usecprow) {
853     mbs  = a->compressedrow.nrows;
854     ii   = a->compressedrow.i;
855     ridx = a->compressedrow.rindex;
856   } else {
857     mbs = a->mbs;
858     ii  = a->i;
859     z   = zarray;
860   }
861 
862   for (i=0; i<mbs; i++) {
863     n    = ii[i+1] - ii[i];
864     idx  = ij + ii[i];
865     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
866     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
867 
868     nonzerorow += (n>0);
869     for (j=0; j<n; j++) {
870       xb = x + 15*(idx[j]);
871       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
872       x8 = xb[7];
873 
874       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
875       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
876       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
877       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
878       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
879       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
880       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
881       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
882       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
883       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
884       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
885       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
886       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
887       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
888       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
889       v     += 120;
890 
891       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
892 
893       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
894       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
895       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
896       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
897       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
898       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
899       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
900       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
901       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
902       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
903       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
904       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
905       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
906       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
907       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
908       v     += 105;
909     }
910     if (usecprow) z = zarray + 15*ridx[i];
911     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
912     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
913 
914     if (!usecprow) z += 15;
915   }
916 
917   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
918   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
919   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
920   PetscFunctionReturn(0);
921 }
922 
923 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
924 
925 #undef __FUNCT__
926 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
927 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
928 {
929   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
930   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
931   const PetscScalar *x,*xb;
932   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
933   const MatScalar   *v;
934   PetscErrorCode    ierr;
935   const PetscInt    *ii,*ij=a->j,*idx;
936   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
937   PetscBool         usecprow=a->compressedrow.use;
938 
939   PetscFunctionBegin;
940   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
941   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
942 
943   v = a->a;
944   if (usecprow) {
945     mbs  = a->compressedrow.nrows;
946     ii   = a->compressedrow.i;
947     ridx = a->compressedrow.rindex;
948   } else {
949     mbs = a->mbs;
950     ii  = a->i;
951     z   = zarray;
952   }
953 
954   for (i=0; i<mbs; i++) {
955     n    = ii[i+1] - ii[i];
956     idx  = ij + ii[i];
957     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
958     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
959 
960     nonzerorow += (n>0);
961     for (j=0; j<n; j++) {
962       xb = x + 15*(idx[j]);
963       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
964       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
965 
966       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
967       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
968       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
969       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
970       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
971       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
972       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
973       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
974       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
975       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
976       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
977       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
978       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
979       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
980       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
981       v     += 225;
982     }
983     if (usecprow) z = zarray + 15*ridx[i];
984     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
985     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
986 
987     if (!usecprow) z += 15;
988   }
989 
990   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
991   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
992   ierr = PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);CHKERRQ(ierr);
993   PetscFunctionReturn(0);
994 }
995 
996 
997 /*
998     This will not work with MatScalar == float because it calls the BLAS
999 */
1000 #undef __FUNCT__
1001 #define __FUNCT__ "MatMult_SeqBAIJ_N"
1002 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1003 {
1004   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1005   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1006   MatScalar      *v;
1007   PetscErrorCode ierr;
1008   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1009   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
1010   PetscBool      usecprow=a->compressedrow.use;
1011 
1012   PetscFunctionBegin;
1013   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1014   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1015 
1016   idx = a->j;
1017   v   = a->a;
1018   if (usecprow) {
1019     mbs  = a->compressedrow.nrows;
1020     ii   = a->compressedrow.i;
1021     ridx = a->compressedrow.rindex;
1022   } else {
1023     mbs = a->mbs;
1024     ii  = a->i;
1025     z   = zarray;
1026   }
1027 
1028   if (!a->mult_work) {
1029     k    = PetscMax(A->rmap->n,A->cmap->n);
1030     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1031   }
1032   work = a->mult_work;
1033   for (i=0; i<mbs; i++) {
1034     n           = ii[1] - ii[0]; ii++;
1035     ncols       = n*bs;
1036     workt       = work;
1037     nonzerorow += (n>0);
1038     for (j=0; j<n; j++) {
1039       xb = x + bs*(*idx++);
1040       for (k=0; k<bs; k++) workt[k] = xb[k];
1041       workt += bs;
1042     }
1043     if (usecprow) z = zarray + bs*ridx[i];
1044     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1045     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1046     v += n*bs2;
1047     if (!usecprow) z += bs;
1048   }
1049   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1050   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1051   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr);
1052   PetscFunctionReturn(0);
1053 }
1054 
1055 #undef __FUNCT__
1056 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1057 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1058 {
1059   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1060   const PetscScalar *x;
1061   PetscScalar       *y,*z,sum;
1062   const MatScalar   *v;
1063   PetscErrorCode    ierr;
1064   PetscInt          mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
1065   const PetscInt    *idx,*ii;
1066   PetscBool         usecprow=a->compressedrow.use;
1067 
1068   PetscFunctionBegin;
1069   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1070   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1071   if (zz != yy) {
1072     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1073   } else {
1074     z = y;
1075   }
1076 
1077   idx = a->j;
1078   v   = a->a;
1079   if (usecprow) {
1080     if (zz != yy) {
1081       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1082     }
1083     mbs  = a->compressedrow.nrows;
1084     ii   = a->compressedrow.i;
1085     ridx = a->compressedrow.rindex;
1086   } else {
1087     ii = a->i;
1088   }
1089 
1090   for (i=0; i<mbs; i++) {
1091     n = ii[1] - ii[0];
1092     ii++;
1093     if (!usecprow) {
1094       nonzerorow += (n>0);
1095       sum         = y[i];
1096     } else {
1097       sum = y[ridx[i]];
1098     }
1099     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1100     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1101     PetscSparseDensePlusDot(sum,x,v,idx,n);
1102     v   += n;
1103     idx += n;
1104     if (usecprow) {
1105       z[ridx[i]] = sum;
1106     } else {
1107       z[i] = sum;
1108     }
1109   }
1110   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1111   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1112   if (zz != yy) {
1113     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1114   }
1115   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
1116   PetscFunctionReturn(0);
1117 }
1118 
1119 #undef __FUNCT__
1120 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1121 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1122 {
1123   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1124   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1125   PetscScalar    x1,x2,*yarray,*zarray;
1126   MatScalar      *v;
1127   PetscErrorCode ierr;
1128   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1129   PetscBool      usecprow=a->compressedrow.use;
1130 
1131   PetscFunctionBegin;
1132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1133   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1134   if (zz != yy) {
1135     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1136   } else {
1137     zarray = yarray;
1138   }
1139 
1140   idx = a->j;
1141   v   = a->a;
1142   if (usecprow) {
1143     if (zz != yy) {
1144       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1145     }
1146     mbs  = a->compressedrow.nrows;
1147     ii   = a->compressedrow.i;
1148     ridx = a->compressedrow.rindex;
1149     if (zz != yy) {
1150       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1151     }
1152   } else {
1153     ii = a->i;
1154     y  = yarray;
1155     z  = zarray;
1156   }
1157 
1158   for (i=0; i<mbs; i++) {
1159     n = ii[1] - ii[0]; ii++;
1160     if (usecprow) {
1161       z = zarray + 2*ridx[i];
1162       y = yarray + 2*ridx[i];
1163     }
1164     sum1 = y[0]; sum2 = y[1];
1165     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1166     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1167     for (j=0; j<n; j++) {
1168       xb = x + 2*(*idx++);
1169       x1 = xb[0];
1170       x2 = xb[1];
1171 
1172       sum1 += v[0]*x1 + v[2]*x2;
1173       sum2 += v[1]*x1 + v[3]*x2;
1174       v    += 4;
1175     }
1176     z[0] = sum1; z[1] = sum2;
1177     if (!usecprow) {
1178       z += 2; y += 2;
1179     }
1180   }
1181   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1182   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1183   if (zz != yy) {
1184     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1185   }
1186   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1187   PetscFunctionReturn(0);
1188 }
1189 
1190 #undef __FUNCT__
1191 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1192 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1193 {
1194   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1195   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1196   MatScalar      *v;
1197   PetscErrorCode ierr;
1198   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1199   PetscBool      usecprow=a->compressedrow.use;
1200 
1201   PetscFunctionBegin;
1202   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1203   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1204   if (zz != yy) {
1205     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1206   } else {
1207     zarray = yarray;
1208   }
1209 
1210   idx = a->j;
1211   v   = a->a;
1212   if (usecprow) {
1213     if (zz != yy) {
1214       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1215     }
1216     mbs  = a->compressedrow.nrows;
1217     ii   = a->compressedrow.i;
1218     ridx = a->compressedrow.rindex;
1219   } else {
1220     ii = a->i;
1221     y  = yarray;
1222     z  = zarray;
1223   }
1224 
1225   for (i=0; i<mbs; i++) {
1226     n = ii[1] - ii[0]; ii++;
1227     if (usecprow) {
1228       z = zarray + 3*ridx[i];
1229       y = yarray + 3*ridx[i];
1230     }
1231     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1232     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1233     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1234     for (j=0; j<n; j++) {
1235       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1236       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1237       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1238       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1239       v    += 9;
1240     }
1241     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1242     if (!usecprow) {
1243       z += 3; y += 3;
1244     }
1245   }
1246   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1247   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1248   if (zz != yy) {
1249     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1250   }
1251   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1252   PetscFunctionReturn(0);
1253 }
1254 
1255 #undef __FUNCT__
1256 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1257 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1258 {
1259   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1260   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1261   MatScalar      *v;
1262   PetscErrorCode ierr;
1263   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1264   PetscBool      usecprow=a->compressedrow.use;
1265 
1266   PetscFunctionBegin;
1267   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1268   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1269   if (zz != yy) {
1270     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1271   } else {
1272     zarray = yarray;
1273   }
1274 
1275   idx = a->j;
1276   v   = a->a;
1277   if (usecprow) {
1278     if (zz != yy) {
1279       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1280     }
1281     mbs  = a->compressedrow.nrows;
1282     ii   = a->compressedrow.i;
1283     ridx = a->compressedrow.rindex;
1284   } else {
1285     ii = a->i;
1286     y  = yarray;
1287     z  = zarray;
1288   }
1289 
1290   for (i=0; i<mbs; i++) {
1291     n = ii[1] - ii[0]; ii++;
1292     if (usecprow) {
1293       z = zarray + 4*ridx[i];
1294       y = yarray + 4*ridx[i];
1295     }
1296     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1297     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1298     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1299     for (j=0; j<n; j++) {
1300       xb    = x + 4*(*idx++);
1301       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1302       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1303       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1304       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1305       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1306       v    += 16;
1307     }
1308     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1309     if (!usecprow) {
1310       z += 4; y += 4;
1311     }
1312   }
1313   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1314   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1315   if (zz != yy) {
1316     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1317   }
1318   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1319   PetscFunctionReturn(0);
1320 }
1321 
1322 #undef __FUNCT__
1323 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1324 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1325 {
1326   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1327   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1328   PetscScalar    *yarray,*zarray;
1329   MatScalar      *v;
1330   PetscErrorCode ierr;
1331   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1332   PetscBool      usecprow=a->compressedrow.use;
1333 
1334   PetscFunctionBegin;
1335   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1336   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1337   if (zz != yy) {
1338     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1339   } else {
1340     zarray = yarray;
1341   }
1342 
1343   idx = a->j;
1344   v   = a->a;
1345   if (usecprow) {
1346     if (zz != yy) {
1347       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1348     }
1349     mbs  = a->compressedrow.nrows;
1350     ii   = a->compressedrow.i;
1351     ridx = a->compressedrow.rindex;
1352   } else {
1353     ii = a->i;
1354     y  = yarray;
1355     z  = zarray;
1356   }
1357 
1358   for (i=0; i<mbs; i++) {
1359     n = ii[1] - ii[0]; ii++;
1360     if (usecprow) {
1361       z = zarray + 5*ridx[i];
1362       y = yarray + 5*ridx[i];
1363     }
1364     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1365     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1366     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1367     for (j=0; j<n; j++) {
1368       xb    = x + 5*(*idx++);
1369       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1370       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1371       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1372       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1373       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1374       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1375       v    += 25;
1376     }
1377     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1378     if (!usecprow) {
1379       z += 5; y += 5;
1380     }
1381   }
1382   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1383   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1384   if (zz != yy) {
1385     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1386   }
1387   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1388   PetscFunctionReturn(0);
1389 }
1390 #undef __FUNCT__
1391 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1392 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1393 {
1394   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1395   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1396   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1397   MatScalar      *v;
1398   PetscErrorCode ierr;
1399   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1400   PetscBool      usecprow=a->compressedrow.use;
1401 
1402   PetscFunctionBegin;
1403   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1404   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1405   if (zz != yy) {
1406     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1407   } else {
1408     zarray = yarray;
1409   }
1410 
1411   idx = a->j;
1412   v   = a->a;
1413   if (usecprow) {
1414     if (zz != yy) {
1415       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1416     }
1417     mbs  = a->compressedrow.nrows;
1418     ii   = a->compressedrow.i;
1419     ridx = a->compressedrow.rindex;
1420   } else {
1421     ii = a->i;
1422     y  = yarray;
1423     z  = zarray;
1424   }
1425 
1426   for (i=0; i<mbs; i++) {
1427     n = ii[1] - ii[0]; ii++;
1428     if (usecprow) {
1429       z = zarray + 6*ridx[i];
1430       y = yarray + 6*ridx[i];
1431     }
1432     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1433     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1434     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1435     for (j=0; j<n; j++) {
1436       xb    = x + 6*(*idx++);
1437       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1438       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1439       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1440       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1441       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1442       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1443       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1444       v    += 36;
1445     }
1446     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1447     if (!usecprow) {
1448       z += 6; y += 6;
1449     }
1450   }
1451   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1452   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1453   if (zz != yy) {
1454     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1455   }
1456   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1457   PetscFunctionReturn(0);
1458 }
1459 
1460 #undef __FUNCT__
1461 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1462 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1463 {
1464   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1465   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1466   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1467   MatScalar      *v;
1468   PetscErrorCode ierr;
1469   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1470   PetscBool      usecprow=a->compressedrow.use;
1471 
1472   PetscFunctionBegin;
1473   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1474   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1475   if (zz != yy) {
1476     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1477   } else {
1478     zarray = yarray;
1479   }
1480 
1481   idx = a->j;
1482   v   = a->a;
1483   if (usecprow) {
1484     if (zz != yy) {
1485       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1486     }
1487     mbs  = a->compressedrow.nrows;
1488     ii   = a->compressedrow.i;
1489     ridx = a->compressedrow.rindex;
1490   } else {
1491     ii = a->i;
1492     y  = yarray;
1493     z  = zarray;
1494   }
1495 
1496   for (i=0; i<mbs; i++) {
1497     n = ii[1] - ii[0]; ii++;
1498     if (usecprow) {
1499       z = zarray + 7*ridx[i];
1500       y = yarray + 7*ridx[i];
1501     }
1502     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1503     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1504     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1505     for (j=0; j<n; j++) {
1506       xb    = x + 7*(*idx++);
1507       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1508       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1509       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1510       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1511       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1512       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1513       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1514       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1515       v    += 49;
1516     }
1517     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1518     if (!usecprow) {
1519       z += 7; y += 7;
1520     }
1521   }
1522   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1523   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1524   if (zz != yy) {
1525     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1526   }
1527   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1528   PetscFunctionReturn(0);
1529 }
1530 
1531 #undef __FUNCT__
1532 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1533 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1534 {
1535   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1536   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1537   MatScalar      *v;
1538   PetscErrorCode ierr;
1539   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1540   PetscInt       ncols,k,*ridx=PETSC_NULL;
1541   PetscBool      usecprow=a->compressedrow.use;
1542 
1543   PetscFunctionBegin;
1544   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1545   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1546   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1547 
1548   idx = a->j;
1549   v   = a->a;
1550   if (usecprow) {
1551     mbs  = a->compressedrow.nrows;
1552     ii   = a->compressedrow.i;
1553     ridx = a->compressedrow.rindex;
1554   } else {
1555     mbs = a->mbs;
1556     ii  = a->i;
1557     z   = zarray;
1558   }
1559 
1560   if (!a->mult_work) {
1561     k    = PetscMax(A->rmap->n,A->cmap->n);
1562     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1563   }
1564   work = a->mult_work;
1565   for (i=0; i<mbs; i++) {
1566     n     = ii[1] - ii[0]; ii++;
1567     ncols = n*bs;
1568     workt = work;
1569     for (j=0; j<n; j++) {
1570       xb = x + bs*(*idx++);
1571       for (k=0; k<bs; k++) workt[k] = xb[k];
1572       workt += bs;
1573     }
1574     if (usecprow) z = zarray + bs*ridx[i];
1575     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1576     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1577     v += n*bs2;
1578     if (!usecprow) z += bs;
1579   }
1580   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1581   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1582   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1583   PetscFunctionReturn(0);
1584 }
1585 
1586 #undef __FUNCT__
1587 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1588 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1589 {
1590   PetscScalar    zero = 0.0;
1591   PetscErrorCode ierr;
1592 
1593   PetscFunctionBegin;
1594   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1595   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 #undef __FUNCT__
1600 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1601 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1602 {
1603   PetscScalar    zero = 0.0;
1604   PetscErrorCode ierr;
1605 
1606   PetscFunctionBegin;
1607   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1608   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1609   PetscFunctionReturn(0);
1610 }
1611 
1612 #undef __FUNCT__
1613 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1614 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1615 {
1616   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1617   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1618   MatScalar         *v;
1619   PetscErrorCode    ierr;
1620   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1621   Mat_CompressedRow cprow   = a->compressedrow;
1622   PetscBool         usecprow=cprow.use;
1623 
1624   PetscFunctionBegin;
1625   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1626   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1627   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1628 
1629   idx = a->j;
1630   v   = a->a;
1631   if (usecprow) {
1632     mbs  = cprow.nrows;
1633     ii   = cprow.i;
1634     ridx = cprow.rindex;
1635   } else {
1636     mbs=a->mbs;
1637     ii = a->i;
1638     xb = x;
1639   }
1640 
1641   switch (bs) {
1642   case 1:
1643     for (i=0; i<mbs; i++) {
1644       if (usecprow) xb = x + ridx[i];
1645       x1 = xb[0];
1646       ib = idx + ii[0];
1647       n  = ii[1] - ii[0]; ii++;
1648       for (j=0; j<n; j++) {
1649         rval     = ib[j];
1650         z[rval] += PetscConj(*v) * x1;
1651         v++;
1652       }
1653       if (!usecprow) xb++;
1654     }
1655     break;
1656   case 2:
1657     for (i=0; i<mbs; i++) {
1658       if (usecprow) xb = x + 2*ridx[i];
1659       x1 = xb[0]; x2 = xb[1];
1660       ib = idx + ii[0];
1661       n  = ii[1] - ii[0]; ii++;
1662       for (j=0; j<n; j++) {
1663         rval       = ib[j]*2;
1664         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1665         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1666         v         += 4;
1667       }
1668       if (!usecprow) xb += 2;
1669     }
1670     break;
1671   case 3:
1672     for (i=0; i<mbs; i++) {
1673       if (usecprow) xb = x + 3*ridx[i];
1674       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1675       ib = idx + ii[0];
1676       n  = ii[1] - ii[0]; ii++;
1677       for (j=0; j<n; j++) {
1678         rval       = ib[j]*3;
1679         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1680         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1681         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1682         v         += 9;
1683       }
1684       if (!usecprow) xb += 3;
1685     }
1686     break;
1687   case 4:
1688     for (i=0; i<mbs; i++) {
1689       if (usecprow) xb = x + 4*ridx[i];
1690       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1691       ib = idx + ii[0];
1692       n  = ii[1] - ii[0]; ii++;
1693       for (j=0; j<n; j++) {
1694         rval       = ib[j]*4;
1695         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1696         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1697         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1698         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1699         v         += 16;
1700       }
1701       if (!usecprow) xb += 4;
1702     }
1703     break;
1704   case 5:
1705     for (i=0; i<mbs; i++) {
1706       if (usecprow) xb = x + 5*ridx[i];
1707       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1708       x4 = xb[3]; x5 = xb[4];
1709       ib = idx + ii[0];
1710       n  = ii[1] - ii[0]; ii++;
1711       for (j=0; j<n; j++) {
1712         rval       = ib[j]*5;
1713         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1714         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1715         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1716         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1717         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1718         v         += 25;
1719       }
1720       if (!usecprow) xb += 5;
1721     }
1722     break;
1723   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1724     PetscInt    ncols,k;
1725     PetscScalar *work,*workt,*xtmp;
1726 
1727     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1728     if (!a->mult_work) {
1729       k    = PetscMax(A->rmap->n,A->cmap->n);
1730       ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1731     }
1732     work = a->mult_work;
1733     xtmp = x;
1734     for (i=0; i<mbs; i++) {
1735       n     = ii[1] - ii[0]; ii++;
1736       ncols = n*bs;
1737       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1738       if (usecprow) xtmp = x + bs*ridx[i];
1739       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1740       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1741       v += n*bs2;
1742       if (!usecprow) xtmp += bs;
1743       workt = work;
1744       for (j=0; j<n; j++) {
1745         zb = z + bs*(*idx++);
1746         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1747         workt += bs;
1748       }
1749     }
1750     }
1751   }
1752   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1753   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1754   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1755   PetscFunctionReturn(0);
1756 }
1757 
1758 #undef __FUNCT__
1759 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1760 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1761 {
1762   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1763   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1764   MatScalar         *v;
1765   PetscErrorCode    ierr;
1766   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1767   Mat_CompressedRow cprow   = a->compressedrow;
1768   PetscBool         usecprow=cprow.use;
1769 
1770   PetscFunctionBegin;
1771   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1772   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1773   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1774 
1775   idx = a->j;
1776   v   = a->a;
1777   if (usecprow) {
1778     mbs  = cprow.nrows;
1779     ii   = cprow.i;
1780     ridx = cprow.rindex;
1781   } else {
1782     mbs=a->mbs;
1783     ii = a->i;
1784     xb = x;
1785   }
1786 
1787   switch (bs) {
1788   case 1:
1789     for (i=0; i<mbs; i++) {
1790       if (usecprow) xb = x + ridx[i];
1791       x1 = xb[0];
1792       ib = idx + ii[0];
1793       n  = ii[1] - ii[0]; ii++;
1794       for (j=0; j<n; j++) {
1795         rval     = ib[j];
1796         z[rval] += *v * x1;
1797         v++;
1798       }
1799       if (!usecprow) xb++;
1800     }
1801     break;
1802   case 2:
1803     for (i=0; i<mbs; i++) {
1804       if (usecprow) xb = x + 2*ridx[i];
1805       x1 = xb[0]; x2 = xb[1];
1806       ib = idx + ii[0];
1807       n  = ii[1] - ii[0]; ii++;
1808       for (j=0; j<n; j++) {
1809         rval       = ib[j]*2;
1810         z[rval++] += v[0]*x1 + v[1]*x2;
1811         z[rval++] += v[2]*x1 + v[3]*x2;
1812         v         += 4;
1813       }
1814       if (!usecprow) xb += 2;
1815     }
1816     break;
1817   case 3:
1818     for (i=0; i<mbs; i++) {
1819       if (usecprow) xb = x + 3*ridx[i];
1820       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1821       ib = idx + ii[0];
1822       n  = ii[1] - ii[0]; ii++;
1823       for (j=0; j<n; j++) {
1824         rval       = ib[j]*3;
1825         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1826         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1827         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1828         v         += 9;
1829       }
1830       if (!usecprow) xb += 3;
1831     }
1832     break;
1833   case 4:
1834     for (i=0; i<mbs; i++) {
1835       if (usecprow) xb = x + 4*ridx[i];
1836       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1837       ib = idx + ii[0];
1838       n  = ii[1] - ii[0]; ii++;
1839       for (j=0; j<n; j++) {
1840         rval       = ib[j]*4;
1841         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1842         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1843         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1844         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1845         v         += 16;
1846       }
1847       if (!usecprow) xb += 4;
1848     }
1849     break;
1850   case 5:
1851     for (i=0; i<mbs; i++) {
1852       if (usecprow) xb = x + 5*ridx[i];
1853       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1854       x4 = xb[3]; x5 = xb[4];
1855       ib = idx + ii[0];
1856       n  = ii[1] - ii[0]; ii++;
1857       for (j=0; j<n; j++) {
1858         rval       = ib[j]*5;
1859         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1860         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1861         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1862         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1863         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1864         v         += 25;
1865       }
1866       if (!usecprow) xb += 5;
1867     }
1868     break;
1869   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1870     PetscInt    ncols,k;
1871     PetscScalar *work,*workt,*xtmp;
1872 
1873     if (!a->mult_work) {
1874       k    = PetscMax(A->rmap->n,A->cmap->n);
1875       ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1876     }
1877     work = a->mult_work;
1878     xtmp = x;
1879     for (i=0; i<mbs; i++) {
1880       n     = ii[1] - ii[0]; ii++;
1881       ncols = n*bs;
1882       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1883       if (usecprow) xtmp = x + bs*ridx[i];
1884       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1885       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1886       v += n*bs2;
1887       if (!usecprow) xtmp += bs;
1888       workt = work;
1889       for (j=0; j<n; j++) {
1890         zb = z + bs*(*idx++);
1891         for (k=0; k<bs; k++) zb[k] += workt[k];
1892         workt += bs;
1893       }
1894     }
1895     }
1896   }
1897   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1898   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1899   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1900   PetscFunctionReturn(0);
1901 }
1902 
1903 #undef __FUNCT__
1904 #define __FUNCT__ "MatScale_SeqBAIJ"
1905 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1906 {
1907   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
1908   PetscInt       totalnz = a->bs2*a->nz;
1909   PetscScalar    oalpha  = alpha;
1910   PetscErrorCode ierr;
1911   PetscBLASInt   one = 1,tnz;
1912 
1913   PetscFunctionBegin;
1914   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
1915   BLASscal_(&tnz,&oalpha,a->a,&one);
1916   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1917   PetscFunctionReturn(0);
1918 }
1919 
1920 #undef __FUNCT__
1921 #define __FUNCT__ "MatNorm_SeqBAIJ"
1922 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1923 {
1924   PetscErrorCode ierr;
1925   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
1926   MatScalar      *v  = a->a;
1927   PetscReal      sum = 0.0;
1928   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1929 
1930   PetscFunctionBegin;
1931   if (type == NORM_FROBENIUS) {
1932     for (i=0; i< bs2*nz; i++) {
1933       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1934     }
1935     *norm = PetscSqrtReal(sum);
1936   } else if (type == NORM_1) { /* maximum column sum */
1937     PetscReal *tmp;
1938     PetscInt  *bcol = a->j;
1939     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
1940     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
1941     for (i=0; i<nz; i++) {
1942       for (j=0; j<bs; j++) {
1943         k1 = bs*(*bcol) + j; /* column index */
1944         for (k=0; k<bs; k++) {
1945           tmp[k1] += PetscAbsScalar(*v); v++;
1946         }
1947       }
1948       bcol++;
1949     }
1950     *norm = 0.0;
1951     for (j=0; j<A->cmap->n; j++) {
1952       if (tmp[j] > *norm) *norm = tmp[j];
1953     }
1954     ierr = PetscFree(tmp);CHKERRQ(ierr);
1955   } else if (type == NORM_INFINITY) { /* maximum row sum */
1956     *norm = 0.0;
1957     for (k=0; k<bs; k++) {
1958       for (j=0; j<a->mbs; j++) {
1959         v   = a->a + bs2*a->i[j] + k;
1960         sum = 0.0;
1961         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1962           for (k1=0; k1<bs; k1++) {
1963             sum += PetscAbsScalar(*v);
1964             v   += bs;
1965           }
1966         }
1967         if (sum > *norm) *norm = sum;
1968       }
1969     }
1970   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1971   PetscFunctionReturn(0);
1972 }
1973 
1974 
1975 #undef __FUNCT__
1976 #define __FUNCT__ "MatEqual_SeqBAIJ"
1977 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1978 {
1979   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1980   PetscErrorCode ierr;
1981 
1982   PetscFunctionBegin;
1983   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1984   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1985     *flg = PETSC_FALSE;
1986     PetscFunctionReturn(0);
1987   }
1988 
1989   /* if the a->i are the same */
1990   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1991   if (!*flg) PetscFunctionReturn(0);
1992 
1993   /* if a->j are the same */
1994   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1995   if (!*flg) PetscFunctionReturn(0);
1996 
1997   /* if a->a are the same */
1998   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1999   PetscFunctionReturn(0);
2000 
2001 }
2002 
2003 #undef __FUNCT__
2004 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
2005 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2006 {
2007   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2008   PetscErrorCode ierr;
2009   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2010   PetscScalar    *x,zero = 0.0;
2011   MatScalar      *aa,*aa_j;
2012 
2013   PetscFunctionBegin;
2014   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2015   bs   = A->rmap->bs;
2016   aa   = a->a;
2017   ai   = a->i;
2018   aj   = a->j;
2019   ambs = a->mbs;
2020   bs2  = a->bs2;
2021 
2022   ierr = VecSet(v,zero);CHKERRQ(ierr);
2023   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2024   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2025   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2026   for (i=0; i<ambs; i++) {
2027     for (j=ai[i]; j<ai[i+1]; j++) {
2028       if (aj[j] == i) {
2029         row  = i*bs;
2030         aa_j = aa+j*bs2;
2031         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2032         break;
2033       }
2034     }
2035   }
2036   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2037   PetscFunctionReturn(0);
2038 }
2039 
2040 #undef __FUNCT__
2041 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2042 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2043 {
2044   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2045   const PetscScalar *l,*r,*li,*ri;
2046   PetscScalar       x;
2047   MatScalar         *aa, *v;
2048   PetscErrorCode    ierr;
2049   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2050   const PetscInt    *ai,*aj;
2051 
2052   PetscFunctionBegin;
2053   ai  = a->i;
2054   aj  = a->j;
2055   aa  = a->a;
2056   m   = A->rmap->n;
2057   n   = A->cmap->n;
2058   bs  = A->rmap->bs;
2059   mbs = a->mbs;
2060   bs2 = a->bs2;
2061   if (ll) {
2062     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2063     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2064     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2065     for (i=0; i<mbs; i++) { /* for each block row */
2066       M  = ai[i+1] - ai[i];
2067       li = l + i*bs;
2068       v  = aa + bs2*ai[i];
2069       for (j=0; j<M; j++) { /* for each block */
2070         for (k=0; k<bs2; k++) {
2071           (*v++) *= li[k%bs];
2072         }
2073       }
2074     }
2075     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2076     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2077   }
2078 
2079   if (rr) {
2080     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2081     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2082     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2083     for (i=0; i<mbs; i++) { /* for each block row */
2084       iai = ai[i];
2085       M   = ai[i+1] - iai;
2086       v   = aa + bs2*iai;
2087       for (j=0; j<M; j++) { /* for each block */
2088         ri = r + bs*aj[iai+j];
2089         for (k=0; k<bs; k++) {
2090           x = ri[k];
2091           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2092           v += bs;
2093         }
2094       }
2095     }
2096     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2097     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2098   }
2099   PetscFunctionReturn(0);
2100 }
2101 
2102 
2103 #undef __FUNCT__
2104 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2105 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2106 {
2107   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2108 
2109   PetscFunctionBegin;
2110   info->block_size   = a->bs2;
2111   info->nz_allocated = a->bs2*a->maxnz;
2112   info->nz_used      = a->bs2*a->nz;
2113   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2114   info->assemblies   = A->num_ass;
2115   info->mallocs      = A->info.mallocs;
2116   info->memory       = ((PetscObject)A)->mem;
2117   if (A->factortype) {
2118     info->fill_ratio_given  = A->info.fill_ratio_given;
2119     info->fill_ratio_needed = A->info.fill_ratio_needed;
2120     info->factor_mallocs    = A->info.factor_mallocs;
2121   } else {
2122     info->fill_ratio_given  = 0;
2123     info->fill_ratio_needed = 0;
2124     info->factor_mallocs    = 0;
2125   }
2126   PetscFunctionReturn(0);
2127 }
2128 
2129 
2130 #undef __FUNCT__
2131 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2132 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2133 {
2134   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2135   PetscErrorCode ierr;
2136 
2137   PetscFunctionBegin;
2138   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2139   PetscFunctionReturn(0);
2140 }
2141