xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 8895d3f2aaf765dc4e8d62d73f37241d00bd6bbd)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
14   const PetscInt *idx;
15   PetscInt       start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table;
17 
18   PetscFunctionBegin;
19   m  = a->mbs;
20   ai = a->i;
21   aj = a->j;
22   bs = A->rmap->bs;
23 
24   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
27   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
28   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) {
31     /* Initialise the two local arrays */
32     isz  = 0;
33     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
37     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j=0; j<n; ++j) {
41       ival = idx[j]/bs; /* convert the indices into block indices */
42       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
44     }
45     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
46     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
47 
48     k = 0;
49     for (j=0; j<ov; j++) { /* for each overlap*/
50       n = isz;
51       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row+1];
55         for (l = start; l<end; l++) {
56           val = aj[l];
57           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
58         }
59       }
60     }
61     /* expand the Index Set */
62     for (j=0; j<isz; j++) {
63       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
64     }
65     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
66   }
67   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
68   ierr = PetscFree(nidx);CHKERRQ(ierr);
69   ierr = PetscFree(nidx2);CHKERRQ(ierr);
70   PetscFunctionReturn(0);
71 }
72 
73 #undef __FUNCT__
74 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
75 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76 {
77   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
78   PetscErrorCode ierr;
79   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
81   const PetscInt *irow,*icol;
82   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83   PetscInt       *aj = a->j,*ai = a->i;
84   MatScalar      *mat_a;
85   Mat            C;
86   PetscBool      flag;
87 
88   PetscFunctionBegin;
89 
90   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
91   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
92   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
93   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
94 
95   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
96   ssmap = smap;
97   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
98   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99   /* determine lens of each row */
100   for (i=0; i<nrows; i++) {
101     kstart  = ai[irow[i]];
102     kend    = kstart + a->ilen[irow[i]];
103     lens[i] = 0;
104     for (k=kstart; k<kend; k++) {
105       if (ssmap[aj[k]]) lens[i]++;
106     }
107   }
108   /* Create and fill new matrix */
109   if (scall == MAT_REUSE_MATRIX) {
110     c = (Mat_SeqBAIJ*)((*B)->data);
111 
112     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
114     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
116     C    = *B;
117   } else {
118     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
119     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
120     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
121     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
122   }
123   c = (Mat_SeqBAIJ*)(C->data);
124   for (i=0; i<nrows; i++) {
125     row      = irow[i];
126     kstart   = ai[row];
127     kend     = kstart + a->ilen[row];
128     mat_i    = c->i[i];
129     mat_j    = c->j + mat_i;
130     mat_a    = c->a + mat_i*bs2;
131     mat_ilen = c->ilen + i;
132     for (k=kstart; k<kend; k++) {
133       if ((tcol=ssmap[a->j[k]])) {
134         *mat_j++ = tcol - 1;
135         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
136         mat_a   += bs2;
137         (*mat_ilen)++;
138       }
139     }
140   }
141   /* sort */
142   {
143     MatScalar *work;
144 
145     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
146     for (i=0; i<nrows; i++) {
147       PetscInt ilen;
148       mat_i = c->i[i];
149       mat_j = c->j + mat_i;
150       mat_a = c->a + mat_i*bs2;
151       ilen  = c->ilen[i];
152       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
153     }
154     ierr = PetscFree(work);CHKERRQ(ierr);
155   }
156 
157   /* Free work space */
158   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
159   ierr = PetscFree(smap);CHKERRQ(ierr);
160   ierr = PetscFree(lens);CHKERRQ(ierr);
161   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
162   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
163 
164   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
165   *B   = C;
166   PetscFunctionReturn(0);
167 }
168 
169 #undef __FUNCT__
170 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
171 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
172 {
173   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
174   IS             is1,is2;
175   PetscErrorCode ierr;
176   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
177   const PetscInt *irow,*icol;
178 
179   PetscFunctionBegin;
180   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
181   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
182   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
183   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
184 
185   /* Verify if the indices corespond to each element in a block
186    and form the IS with compressed IS */
187   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
188   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
189   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
190   for (i=0; i<a->mbs; i++) {
191     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
192   }
193   count = 0;
194   for (i=0; i<nrows; i++) {
195     PetscInt j = irow[i] / bs;
196     if ((vary[j]--)==bs) iary[count++] = j;
197   }
198   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
199 
200   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
201   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
202   for (i=0; i<a->mbs; i++) {
203     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
204   }
205   count = 0;
206   for (i=0; i<ncols; i++) {
207     PetscInt j = icol[i] / bs;
208     if ((vary[j]--)==bs) iary[count++] = j;
209   }
210   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
211   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
212   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
213   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
214 
215   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
216   ierr = ISDestroy(&is1);CHKERRQ(ierr);
217   ierr = ISDestroy(&is2);CHKERRQ(ierr);
218   PetscFunctionReturn(0);
219 }
220 
221 #undef __FUNCT__
222 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
223 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
224 {
225   PetscErrorCode ierr;
226   PetscInt       i;
227 
228   PetscFunctionBegin;
229   if (scall == MAT_INITIAL_MATRIX) {
230     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
231   }
232 
233   for (i=0; i<n; i++) {
234     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
235   }
236   PetscFunctionReturn(0);
237 }
238 
239 
240 /* -------------------------------------------------------*/
241 /* Should check that shapes of vectors and matrices match */
242 /* -------------------------------------------------------*/
243 
244 #undef __FUNCT__
245 #define __FUNCT__ "MatMult_SeqBAIJ_1"
246 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
247 {
248   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
249   PetscScalar       *z,sum;
250   const PetscScalar *x;
251   const MatScalar   *v;
252   PetscErrorCode    ierr;
253   PetscInt          mbs,i,n;
254   const PetscInt    *idx,*ii,*ridx=NULL;
255   PetscBool         usecprow=a->compressedrow.use;
256 
257   PetscFunctionBegin;
258   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
259   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
260 
261   if (usecprow) {
262     mbs  = a->compressedrow.nrows;
263     ii   = a->compressedrow.i;
264     ridx = a->compressedrow.rindex;
265     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
266   } else {
267     mbs = a->mbs;
268     ii  = a->i;
269   }
270 
271   for (i=0; i<mbs; i++) {
272     n   = ii[1] - ii[0];
273     v   = a->a + ii[0];
274     idx = a->j + ii[0];
275     ii++;
276     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
277     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
278     sum = 0.0;
279     PetscSparseDensePlusDot(sum,x,v,idx,n);
280     if (usecprow) {
281       z[ridx[i]] = sum;
282     } else {
283       z[i]        = sum;
284     }
285   }
286   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
287   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
288   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
289   PetscFunctionReturn(0);
290 }
291 
292 #undef __FUNCT__
293 #define __FUNCT__ "MatMult_SeqBAIJ_2"
294 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
295 {
296   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
297   PetscScalar       *z = 0,sum1,sum2,*zarray;
298   const PetscScalar *x,*xb;
299   PetscScalar       x1,x2;
300   const MatScalar   *v;
301   PetscErrorCode    ierr;
302   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
303   PetscBool         usecprow=a->compressedrow.use;
304 
305   PetscFunctionBegin;
306   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
307   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
308 
309   idx = a->j;
310   v   = a->a;
311   if (usecprow) {
312     mbs  = a->compressedrow.nrows;
313     ii   = a->compressedrow.i;
314     ridx = a->compressedrow.rindex;
315   } else {
316     mbs = a->mbs;
317     ii  = a->i;
318     z   = zarray;
319   }
320 
321   for (i=0; i<mbs; i++) {
322     n           = ii[1] - ii[0]; ii++;
323     sum1        = 0.0; sum2 = 0.0;
324     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
325     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
326     for (j=0; j<n; j++) {
327       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
328       sum1 += v[0]*x1 + v[2]*x2;
329       sum2 += v[1]*x1 + v[3]*x2;
330       v    += 4;
331     }
332     if (usecprow) z = zarray + 2*ridx[i];
333     z[0] = sum1; z[1] = sum2;
334     if (!usecprow) z += 2;
335   }
336   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
337   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
338   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
339   PetscFunctionReturn(0);
340 }
341 
342 #undef __FUNCT__
343 #define __FUNCT__ "MatMult_SeqBAIJ_3"
344 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
345 {
346   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
347   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
348   const PetscScalar *x,*xb;
349   const MatScalar   *v;
350   PetscErrorCode    ierr;
351   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
352   PetscBool         usecprow=a->compressedrow.use;
353 
354 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
355 #pragma disjoint(*v,*z,*xb)
356 #endif
357 
358   PetscFunctionBegin;
359   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
360   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
361 
362   idx = a->j;
363   v   = a->a;
364   if (usecprow) {
365     mbs  = a->compressedrow.nrows;
366     ii   = a->compressedrow.i;
367     ridx = a->compressedrow.rindex;
368   } else {
369     mbs = a->mbs;
370     ii  = a->i;
371     z   = zarray;
372   }
373 
374   for (i=0; i<mbs; i++) {
375     n           = ii[1] - ii[0]; ii++;
376     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
377     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
378     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
379     for (j=0; j<n; j++) {
380       xb = x + 3*(*idx++);
381       x1 = xb[0];
382       x2 = xb[1];
383       x3 = xb[2];
384 
385       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
386       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
387       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
388       v    += 9;
389     }
390     if (usecprow) z = zarray + 3*ridx[i];
391     z[0] = sum1; z[1] = sum2; z[2] = sum3;
392     if (!usecprow) z += 3;
393   }
394   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
395   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
396   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
397   PetscFunctionReturn(0);
398 }
399 
400 #undef __FUNCT__
401 #define __FUNCT__ "MatMult_SeqBAIJ_4"
402 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
403 {
404   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
405   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
406   const PetscScalar *x,*xb;
407   const MatScalar   *v;
408   PetscErrorCode    ierr;
409   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
410   PetscBool         usecprow=a->compressedrow.use;
411 
412   PetscFunctionBegin;
413   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
414   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
415 
416   idx = a->j;
417   v   = a->a;
418   if (usecprow) {
419     mbs  = a->compressedrow.nrows;
420     ii   = a->compressedrow.i;
421     ridx = a->compressedrow.rindex;
422   } else {
423     mbs = a->mbs;
424     ii  = a->i;
425     z   = zarray;
426   }
427 
428   for (i=0; i<mbs; i++) {
429     n = ii[1] - ii[0];
430     ii++;
431     sum1 = 0.0;
432     sum2 = 0.0;
433     sum3 = 0.0;
434     sum4 = 0.0;
435 
436     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
437     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
438     for (j=0; j<n; j++) {
439       xb    = x + 4*(*idx++);
440       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
441       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
442       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
443       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
444       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
445       v    += 16;
446     }
447     if (usecprow) z = zarray + 4*ridx[i];
448     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
449     if (!usecprow) z += 4;
450   }
451   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
452   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
453   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
454   PetscFunctionReturn(0);
455 }
456 
457 #undef __FUNCT__
458 #define __FUNCT__ "MatMult_SeqBAIJ_5"
459 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
460 {
461   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
462   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
463   const PetscScalar *xb,*x;
464   const MatScalar   *v;
465   PetscErrorCode    ierr;
466   const PetscInt    *idx,*ii,*ridx=NULL;
467   PetscInt          mbs,i,j,n;
468   PetscBool         usecprow=a->compressedrow.use;
469 
470   PetscFunctionBegin;
471   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
472   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
473 
474   idx = a->j;
475   v   = a->a;
476   if (usecprow) {
477     mbs  = a->compressedrow.nrows;
478     ii   = a->compressedrow.i;
479     ridx = a->compressedrow.rindex;
480   } else {
481     mbs = a->mbs;
482     ii  = a->i;
483     z   = zarray;
484   }
485 
486   for (i=0; i<mbs; i++) {
487     n           = ii[1] - ii[0]; ii++;
488     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
489     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
490     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
491     for (j=0; j<n; j++) {
492       xb    = x + 5*(*idx++);
493       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
494       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
495       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
496       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
497       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
498       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
499       v    += 25;
500     }
501     if (usecprow) z = zarray + 5*ridx[i];
502     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
503     if (!usecprow) z += 5;
504   }
505   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
506   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
507   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
508   PetscFunctionReturn(0);
509 }
510 
511 
512 
513 #undef __FUNCT__
514 #define __FUNCT__ "MatMult_SeqBAIJ_6"
515 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
516 {
517   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
518   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
519   const PetscScalar *x,*xb;
520   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
521   const MatScalar   *v;
522   PetscErrorCode    ierr;
523   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
524   PetscBool         usecprow=a->compressedrow.use;
525 
526   PetscFunctionBegin;
527   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
528   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
529 
530   idx = a->j;
531   v   = a->a;
532   if (usecprow) {
533     mbs  = a->compressedrow.nrows;
534     ii   = a->compressedrow.i;
535     ridx = a->compressedrow.rindex;
536   } else {
537     mbs = a->mbs;
538     ii  = a->i;
539     z   = zarray;
540   }
541 
542   for (i=0; i<mbs; i++) {
543     n  = ii[1] - ii[0];
544     ii++;
545     sum1 = 0.0;
546     sum2 = 0.0;
547     sum3 = 0.0;
548     sum4 = 0.0;
549     sum5 = 0.0;
550     sum6 = 0.0;
551 
552     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
553     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
554     for (j=0; j<n; j++) {
555       xb    = x + 6*(*idx++);
556       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
557       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
558       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
559       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
560       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
561       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
562       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
563       v    += 36;
564     }
565     if (usecprow) z = zarray + 6*ridx[i];
566     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
567     if (!usecprow) z += 6;
568   }
569 
570   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
571   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
572   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
573   PetscFunctionReturn(0);
574 }
575 
576 #undef __FUNCT__
577 #define __FUNCT__ "MatMult_SeqBAIJ_7"
578 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
579 {
580   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
581   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
582   const PetscScalar *x,*xb;
583   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
584   const MatScalar   *v;
585   PetscErrorCode    ierr;
586   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
587   PetscBool         usecprow=a->compressedrow.use;
588 
589   PetscFunctionBegin;
590   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
591   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
592 
593   idx = a->j;
594   v   = a->a;
595   if (usecprow) {
596     mbs  = a->compressedrow.nrows;
597     ii   = a->compressedrow.i;
598     ridx = a->compressedrow.rindex;
599   } else {
600     mbs = a->mbs;
601     ii  = a->i;
602     z   = zarray;
603   }
604 
605   for (i=0; i<mbs; i++) {
606     n  = ii[1] - ii[0];
607     ii++;
608     sum1 = 0.0;
609     sum2 = 0.0;
610     sum3 = 0.0;
611     sum4 = 0.0;
612     sum5 = 0.0;
613     sum6 = 0.0;
614     sum7 = 0.0;
615 
616     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
617     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
618     for (j=0; j<n; j++) {
619       xb    = x + 7*(*idx++);
620       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
621       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
622       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
623       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
624       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
625       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
626       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
627       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
628       v    += 49;
629     }
630     if (usecprow) z = zarray + 7*ridx[i];
631     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
632     if (!usecprow) z += 7;
633   }
634 
635   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
636   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
637   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
638   PetscFunctionReturn(0);
639 }
640 
641 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
642 /* Default MatMult for block size 15 */
643 
644 #undef __FUNCT__
645 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
646 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
647 {
648   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
649   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
650   const PetscScalar *x,*xb;
651   PetscScalar       *zarray,xv;
652   const MatScalar   *v;
653   PetscErrorCode    ierr;
654   const PetscInt    *ii,*ij=a->j,*idx;
655   PetscInt          mbs,i,j,k,n,*ridx=NULL;
656   PetscBool         usecprow=a->compressedrow.use;
657 
658   PetscFunctionBegin;
659   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
660   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
661 
662   v = a->a;
663   if (usecprow) {
664     mbs  = a->compressedrow.nrows;
665     ii   = a->compressedrow.i;
666     ridx = a->compressedrow.rindex;
667   } else {
668     mbs = a->mbs;
669     ii  = a->i;
670     z   = zarray;
671   }
672 
673   for (i=0; i<mbs; i++) {
674     n    = ii[i+1] - ii[i];
675     idx  = ij + ii[i];
676     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
677     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
678 
679     for (j=0; j<n; j++) {
680       xb = x + 15*(idx[j]);
681 
682       for (k=0; k<15; k++) {
683         xv     =  xb[k];
684         sum1  += v[0]*xv;
685         sum2  += v[1]*xv;
686         sum3  += v[2]*xv;
687         sum4  += v[3]*xv;
688         sum5  += v[4]*xv;
689         sum6  += v[5]*xv;
690         sum7  += v[6]*xv;
691         sum8  += v[7]*xv;
692         sum9  += v[8]*xv;
693         sum10 += v[9]*xv;
694         sum11 += v[10]*xv;
695         sum12 += v[11]*xv;
696         sum13 += v[12]*xv;
697         sum14 += v[13]*xv;
698         sum15 += v[14]*xv;
699         v     += 15;
700       }
701     }
702     if (usecprow) z = zarray + 15*ridx[i];
703     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
704     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
705 
706     if (!usecprow) z += 15;
707   }
708 
709   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
710   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
711   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
715 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
716 #undef __FUNCT__
717 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
718 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
719 {
720   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
721   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
722   const PetscScalar *x,*xb;
723   PetscScalar       x1,x2,x3,x4,*zarray;
724   const MatScalar   *v;
725   PetscErrorCode    ierr;
726   const PetscInt    *ii,*ij=a->j,*idx;
727   PetscInt          mbs,i,j,n,*ridx=NULL;
728   PetscBool         usecprow=a->compressedrow.use;
729 
730   PetscFunctionBegin;
731   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
732   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
733 
734   v = a->a;
735   if (usecprow) {
736     mbs  = a->compressedrow.nrows;
737     ii   = a->compressedrow.i;
738     ridx = a->compressedrow.rindex;
739   } else {
740     mbs = a->mbs;
741     ii  = a->i;
742     z   = zarray;
743   }
744 
745   for (i=0; i<mbs; i++) {
746     n    = ii[i+1] - ii[i];
747     idx  = ij + ii[i];
748     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
749     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
750 
751     for (j=0; j<n; j++) {
752       xb = x + 15*(idx[j]);
753       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
754 
755       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
756       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
757       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
758       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
759       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
760       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
761       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
762       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
763       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
764       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
765       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
766       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
767       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
768       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
769       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
770 
771       v += 60;
772 
773       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
774 
775       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
776       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
777       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
778       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
779       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
780       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
781       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
782       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
783       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
784       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
785       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
786       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
787       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
788       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
789       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
790       v     += 60;
791 
792       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
793       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
794       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
795       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
796       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
797       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
798       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
799       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
800       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
801       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
802       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
803       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
804       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
805       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
806       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
807       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
808       v     += 60;
809 
810       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
811       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
812       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
813       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
814       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
815       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
816       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
817       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
818       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
819       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
820       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
821       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
822       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
823       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
824       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
825       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
826       v     += 45;
827     }
828     if (usecprow) z = zarray + 15*ridx[i];
829     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
830     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
831 
832     if (!usecprow) z += 15;
833   }
834 
835   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
836   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
837   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
838   PetscFunctionReturn(0);
839 }
840 
841 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
842 #undef __FUNCT__
843 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
844 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
845 {
846   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
847   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
848   const PetscScalar *x,*xb;
849   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
850   const MatScalar   *v;
851   PetscErrorCode    ierr;
852   const PetscInt    *ii,*ij=a->j,*idx;
853   PetscInt          mbs,i,j,n,*ridx=NULL;
854   PetscBool         usecprow=a->compressedrow.use;
855 
856   PetscFunctionBegin;
857   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
858   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
859 
860   v = a->a;
861   if (usecprow) {
862     mbs  = a->compressedrow.nrows;
863     ii   = a->compressedrow.i;
864     ridx = a->compressedrow.rindex;
865   } else {
866     mbs = a->mbs;
867     ii  = a->i;
868     z   = zarray;
869   }
870 
871   for (i=0; i<mbs; i++) {
872     n    = ii[i+1] - ii[i];
873     idx  = ij + ii[i];
874     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
875     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
876 
877     for (j=0; j<n; j++) {
878       xb = x + 15*(idx[j]);
879       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
880       x8 = xb[7];
881 
882       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
883       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
884       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
885       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
886       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
887       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
888       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
889       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
890       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
891       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
892       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
893       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
894       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
895       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
896       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
897       v     += 120;
898 
899       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
900 
901       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
902       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
903       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
904       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
905       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
906       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
907       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
908       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
909       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
910       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
911       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
912       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
913       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
914       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
915       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
916       v     += 105;
917     }
918     if (usecprow) z = zarray + 15*ridx[i];
919     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
920     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
921 
922     if (!usecprow) z += 15;
923   }
924 
925   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
926   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
927   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
928   PetscFunctionReturn(0);
929 }
930 
931 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
932 
933 #undef __FUNCT__
934 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
935 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
936 {
937   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
938   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
939   const PetscScalar *x,*xb;
940   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
941   const MatScalar   *v;
942   PetscErrorCode    ierr;
943   const PetscInt    *ii,*ij=a->j,*idx;
944   PetscInt          mbs,i,j,n,*ridx=NULL;
945   PetscBool         usecprow=a->compressedrow.use;
946 
947   PetscFunctionBegin;
948   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
949   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
950 
951   v = a->a;
952   if (usecprow) {
953     mbs  = a->compressedrow.nrows;
954     ii   = a->compressedrow.i;
955     ridx = a->compressedrow.rindex;
956   } else {
957     mbs = a->mbs;
958     ii  = a->i;
959     z   = zarray;
960   }
961 
962   for (i=0; i<mbs; i++) {
963     n    = ii[i+1] - ii[i];
964     idx  = ij + ii[i];
965     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
966     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
967 
968     for (j=0; j<n; j++) {
969       xb = x + 15*(idx[j]);
970       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
971       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
972 
973       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
974       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
975       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
976       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
977       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
978       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
979       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
980       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
981       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
982       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
983       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
984       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
985       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
986       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
987       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
988       v     += 225;
989     }
990     if (usecprow) z = zarray + 15*ridx[i];
991     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
992     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
993 
994     if (!usecprow) z += 15;
995   }
996 
997   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
998   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
999   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1000   PetscFunctionReturn(0);
1001 }
1002 
1003 
1004 /*
1005     This will not work with MatScalar == float because it calls the BLAS
1006 */
1007 #undef __FUNCT__
1008 #define __FUNCT__ "MatMult_SeqBAIJ_N"
1009 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1010 {
1011   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1012   PetscScalar       *z = 0,*work,*workt,*zarray;
1013   const PetscScalar *x,*xb;
1014   const MatScalar   *v;
1015   PetscErrorCode    ierr;
1016   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1017   const PetscInt    *idx,*ii,*ridx=NULL;
1018   PetscInt          ncols,k;
1019   PetscBool         usecprow=a->compressedrow.use;
1020 
1021   PetscFunctionBegin;
1022   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1023   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1024 
1025   idx = a->j;
1026   v   = a->a;
1027   if (usecprow) {
1028     mbs  = a->compressedrow.nrows;
1029     ii   = a->compressedrow.i;
1030     ridx = a->compressedrow.rindex;
1031   } else {
1032     mbs = a->mbs;
1033     ii  = a->i;
1034     z   = zarray;
1035   }
1036 
1037   if (!a->mult_work) {
1038     k    = PetscMax(A->rmap->n,A->cmap->n);
1039     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1040   }
1041   work = a->mult_work;
1042   for (i=0; i<mbs; i++) {
1043     n           = ii[1] - ii[0]; ii++;
1044     ncols       = n*bs;
1045     workt       = work;
1046     for (j=0; j<n; j++) {
1047       xb = x + bs*(*idx++);
1048       for (k=0; k<bs; k++) workt[k] = xb[k];
1049       workt += bs;
1050     }
1051     if (usecprow) z = zarray + bs*ridx[i];
1052     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1053     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1054     v += n*bs2;
1055     if (!usecprow) z += bs;
1056   }
1057   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1058   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1059   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1060   PetscFunctionReturn(0);
1061 }
1062 
1063 #undef __FUNCT__
1064 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1065 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1066 {
1067   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1068   const PetscScalar *x;
1069   PetscScalar       *y,*z,sum;
1070   const MatScalar   *v;
1071   PetscErrorCode    ierr;
1072   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1073   const PetscInt    *idx,*ii;
1074   PetscBool         usecprow=a->compressedrow.use;
1075 
1076   PetscFunctionBegin;
1077   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1078   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1079 
1080   idx = a->j;
1081   v   = a->a;
1082   if (usecprow) {
1083     if (zz != yy) {
1084       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1085     }
1086     mbs  = a->compressedrow.nrows;
1087     ii   = a->compressedrow.i;
1088     ridx = a->compressedrow.rindex;
1089   } else {
1090     ii = a->i;
1091   }
1092 
1093   for (i=0; i<mbs; i++) {
1094     n = ii[1] - ii[0];
1095     ii++;
1096     if (!usecprow) {
1097       sum         = y[i];
1098     } else {
1099       sum = y[ridx[i]];
1100     }
1101     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1102     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1103     PetscSparseDensePlusDot(sum,x,v,idx,n);
1104     v   += n;
1105     idx += n;
1106     if (usecprow) {
1107       z[ridx[i]] = sum;
1108     } else {
1109       z[i] = sum;
1110     }
1111   }
1112   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1113   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1114   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1115   PetscFunctionReturn(0);
1116 }
1117 
1118 #undef __FUNCT__
1119 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1120 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1121 {
1122   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1123   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1124   const PetscScalar *x,*xb;
1125   PetscScalar       x1,x2,*yarray,*zarray;
1126   const MatScalar   *v;
1127   PetscErrorCode    ierr;
1128   PetscInt          mbs = a->mbs,i,n,j;
1129   const PetscInt    *idx,*ii,*ridx = NULL;
1130   PetscBool         usecprow = a->compressedrow.use;
1131 
1132   PetscFunctionBegin;
1133   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1134   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1135 
1136   idx = a->j;
1137   v   = a->a;
1138   if (usecprow) {
1139     if (zz != yy) {
1140       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1141     }
1142     mbs  = a->compressedrow.nrows;
1143     ii   = a->compressedrow.i;
1144     ridx = a->compressedrow.rindex;
1145     if (zz != yy) {
1146       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1147     }
1148   } else {
1149     ii = a->i;
1150     y  = yarray;
1151     z  = zarray;
1152   }
1153 
1154   for (i=0; i<mbs; i++) {
1155     n = ii[1] - ii[0]; ii++;
1156     if (usecprow) {
1157       z = zarray + 2*ridx[i];
1158       y = yarray + 2*ridx[i];
1159     }
1160     sum1 = y[0]; sum2 = y[1];
1161     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1162     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1163     for (j=0; j<n; j++) {
1164       xb = x + 2*(*idx++);
1165       x1 = xb[0];
1166       x2 = xb[1];
1167 
1168       sum1 += v[0]*x1 + v[2]*x2;
1169       sum2 += v[1]*x1 + v[3]*x2;
1170       v    += 4;
1171     }
1172     z[0] = sum1; z[1] = sum2;
1173     if (!usecprow) {
1174       z += 2; y += 2;
1175     }
1176   }
1177   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1178   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1179   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1180   PetscFunctionReturn(0);
1181 }
1182 
1183 #undef __FUNCT__
1184 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1185 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1186 {
1187   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1188   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1189   const PetscScalar *x,*xb;
1190   const MatScalar   *v;
1191   PetscErrorCode    ierr;
1192   PetscInt          mbs = a->mbs,i,j,n;
1193   const PetscInt    *idx,*ii,*ridx = NULL;
1194   PetscBool         usecprow = a->compressedrow.use;
1195 
1196   PetscFunctionBegin;
1197   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1198   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1199 
1200   idx = a->j;
1201   v   = a->a;
1202   if (usecprow) {
1203     if (zz != yy) {
1204       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1205     }
1206     mbs  = a->compressedrow.nrows;
1207     ii   = a->compressedrow.i;
1208     ridx = a->compressedrow.rindex;
1209   } else {
1210     ii = a->i;
1211     y  = yarray;
1212     z  = zarray;
1213   }
1214 
1215   for (i=0; i<mbs; i++) {
1216     n = ii[1] - ii[0]; ii++;
1217     if (usecprow) {
1218       z = zarray + 3*ridx[i];
1219       y = yarray + 3*ridx[i];
1220     }
1221     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1222     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1223     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1224     for (j=0; j<n; j++) {
1225       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1226       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1227       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1228       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1229       v    += 9;
1230     }
1231     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1232     if (!usecprow) {
1233       z += 3; y += 3;
1234     }
1235   }
1236   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1237   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1238   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1239   PetscFunctionReturn(0);
1240 }
1241 
1242 #undef __FUNCT__
1243 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1244 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1245 {
1246   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1247   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1248   const PetscScalar *x,*xb;
1249   const MatScalar   *v;
1250   PetscErrorCode    ierr;
1251   PetscInt          mbs = a->mbs,i,j,n;
1252   const PetscInt    *idx,*ii,*ridx=NULL;
1253   PetscBool         usecprow=a->compressedrow.use;
1254 
1255   PetscFunctionBegin;
1256   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1257   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1258 
1259   idx = a->j;
1260   v   = a->a;
1261   if (usecprow) {
1262     if (zz != yy) {
1263       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1264     }
1265     mbs  = a->compressedrow.nrows;
1266     ii   = a->compressedrow.i;
1267     ridx = a->compressedrow.rindex;
1268   } else {
1269     ii = a->i;
1270     y  = yarray;
1271     z  = zarray;
1272   }
1273 
1274   for (i=0; i<mbs; i++) {
1275     n = ii[1] - ii[0]; ii++;
1276     if (usecprow) {
1277       z = zarray + 4*ridx[i];
1278       y = yarray + 4*ridx[i];
1279     }
1280     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1281     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1282     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1283     for (j=0; j<n; j++) {
1284       xb    = x + 4*(*idx++);
1285       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1286       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1287       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1288       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1289       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1290       v    += 16;
1291     }
1292     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1293     if (!usecprow) {
1294       z += 4; y += 4;
1295     }
1296   }
1297   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1298   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1299   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1300   PetscFunctionReturn(0);
1301 }
1302 
1303 #undef __FUNCT__
1304 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1305 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1306 {
1307   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1308   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1309   const PetscScalar *x,*xb;
1310   PetscScalar       *yarray,*zarray;
1311   const MatScalar   *v;
1312   PetscErrorCode    ierr;
1313   PetscInt          mbs = a->mbs,i,j,n;
1314   const PetscInt    *idx,*ii,*ridx = NULL;
1315   PetscBool         usecprow=a->compressedrow.use;
1316 
1317   PetscFunctionBegin;
1318   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1319   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1320 
1321   idx = a->j;
1322   v   = a->a;
1323   if (usecprow) {
1324     if (zz != yy) {
1325       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1326     }
1327     mbs  = a->compressedrow.nrows;
1328     ii   = a->compressedrow.i;
1329     ridx = a->compressedrow.rindex;
1330   } else {
1331     ii = a->i;
1332     y  = yarray;
1333     z  = zarray;
1334   }
1335 
1336   for (i=0; i<mbs; i++) {
1337     n = ii[1] - ii[0]; ii++;
1338     if (usecprow) {
1339       z = zarray + 5*ridx[i];
1340       y = yarray + 5*ridx[i];
1341     }
1342     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1343     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1344     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1345     for (j=0; j<n; j++) {
1346       xb    = x + 5*(*idx++);
1347       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1348       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1349       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1350       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1351       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1352       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1353       v    += 25;
1354     }
1355     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1356     if (!usecprow) {
1357       z += 5; y += 5;
1358     }
1359   }
1360   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1361   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1362   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1363   PetscFunctionReturn(0);
1364 }
1365 #undef __FUNCT__
1366 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1367 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1368 {
1369   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1370   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1371   const PetscScalar *x,*xb;
1372   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1373   const MatScalar   *v;
1374   PetscErrorCode    ierr;
1375   PetscInt          mbs = a->mbs,i,j,n;
1376   const PetscInt    *idx,*ii,*ridx=NULL;
1377   PetscBool         usecprow=a->compressedrow.use;
1378 
1379   PetscFunctionBegin;
1380   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1381   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1382 
1383   idx = a->j;
1384   v   = a->a;
1385   if (usecprow) {
1386     if (zz != yy) {
1387       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1388     }
1389     mbs  = a->compressedrow.nrows;
1390     ii   = a->compressedrow.i;
1391     ridx = a->compressedrow.rindex;
1392   } else {
1393     ii = a->i;
1394     y  = yarray;
1395     z  = zarray;
1396   }
1397 
1398   for (i=0; i<mbs; i++) {
1399     n = ii[1] - ii[0]; ii++;
1400     if (usecprow) {
1401       z = zarray + 6*ridx[i];
1402       y = yarray + 6*ridx[i];
1403     }
1404     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1405     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1406     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1407     for (j=0; j<n; j++) {
1408       xb    = x + 6*(*idx++);
1409       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1410       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1411       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1412       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1413       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1414       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1415       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1416       v    += 36;
1417     }
1418     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1419     if (!usecprow) {
1420       z += 6; y += 6;
1421     }
1422   }
1423   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1424   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1425   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1426   PetscFunctionReturn(0);
1427 }
1428 
1429 #undef __FUNCT__
1430 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1431 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1432 {
1433   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1434   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1435   const PetscScalar *x,*xb;
1436   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1437   const MatScalar   *v;
1438   PetscErrorCode    ierr;
1439   PetscInt          mbs = a->mbs,i,j,n;
1440   const PetscInt    *idx,*ii,*ridx = NULL;
1441   PetscBool         usecprow=a->compressedrow.use;
1442 
1443   PetscFunctionBegin;
1444   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1445   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1446 
1447   idx = a->j;
1448   v   = a->a;
1449   if (usecprow) {
1450     if (zz != yy) {
1451       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1452     }
1453     mbs  = a->compressedrow.nrows;
1454     ii   = a->compressedrow.i;
1455     ridx = a->compressedrow.rindex;
1456   } else {
1457     ii = a->i;
1458     y  = yarray;
1459     z  = zarray;
1460   }
1461 
1462   for (i=0; i<mbs; i++) {
1463     n = ii[1] - ii[0]; ii++;
1464     if (usecprow) {
1465       z = zarray + 7*ridx[i];
1466       y = yarray + 7*ridx[i];
1467     }
1468     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1469     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1470     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1471     for (j=0; j<n; j++) {
1472       xb    = x + 7*(*idx++);
1473       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1474       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1475       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1476       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1477       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1478       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1479       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1480       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1481       v    += 49;
1482     }
1483     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1484     if (!usecprow) {
1485       z += 7; y += 7;
1486     }
1487   }
1488   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1489   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1490   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1491   PetscFunctionReturn(0);
1492 }
1493 
1494 #undef __FUNCT__
1495 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1496 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1497 {
1498   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1499   PetscScalar       *z = 0,*work,*workt,*zarray;
1500   const PetscScalar *x,*xb;
1501   const MatScalar   *v;
1502   PetscErrorCode    ierr;
1503   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1504   PetscInt          ncols,k;
1505   const PetscInt    *ridx = NULL,*idx,*ii;
1506   PetscBool         usecprow = a->compressedrow.use;
1507 
1508   PetscFunctionBegin;
1509   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1510   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1511   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1512 
1513   idx = a->j;
1514   v   = a->a;
1515   if (usecprow) {
1516     mbs  = a->compressedrow.nrows;
1517     ii   = a->compressedrow.i;
1518     ridx = a->compressedrow.rindex;
1519   } else {
1520     mbs = a->mbs;
1521     ii  = a->i;
1522     z   = zarray;
1523   }
1524 
1525   if (!a->mult_work) {
1526     k    = PetscMax(A->rmap->n,A->cmap->n);
1527     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1528   }
1529   work = a->mult_work;
1530   for (i=0; i<mbs; i++) {
1531     n     = ii[1] - ii[0]; ii++;
1532     ncols = n*bs;
1533     workt = work;
1534     for (j=0; j<n; j++) {
1535       xb = x + bs*(*idx++);
1536       for (k=0; k<bs; k++) workt[k] = xb[k];
1537       workt += bs;
1538     }
1539     if (usecprow) z = zarray + bs*ridx[i];
1540     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1541     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1542     v += n*bs2;
1543     if (!usecprow) z += bs;
1544   }
1545   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1546   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1547   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1548   PetscFunctionReturn(0);
1549 }
1550 
1551 #undef __FUNCT__
1552 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1553 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1554 {
1555   PetscScalar    zero = 0.0;
1556   PetscErrorCode ierr;
1557 
1558   PetscFunctionBegin;
1559   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1560   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1561   PetscFunctionReturn(0);
1562 }
1563 
1564 #undef __FUNCT__
1565 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1566 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1567 {
1568   PetscScalar    zero = 0.0;
1569   PetscErrorCode ierr;
1570 
1571   PetscFunctionBegin;
1572   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1573   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1574   PetscFunctionReturn(0);
1575 }
1576 
1577 #undef __FUNCT__
1578 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1579 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1580 {
1581   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1582   PetscScalar       *z,x1,x2,x3,x4,x5;
1583   const PetscScalar *x,*xb = NULL;
1584   const MatScalar   *v;
1585   PetscErrorCode    ierr;
1586   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1587   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1588   Mat_CompressedRow cprow = a->compressedrow;
1589   PetscBool         usecprow = cprow.use;
1590 
1591   PetscFunctionBegin;
1592   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1593   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1594   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1595 
1596   idx = a->j;
1597   v   = a->a;
1598   if (usecprow) {
1599     mbs  = cprow.nrows;
1600     ii   = cprow.i;
1601     ridx = cprow.rindex;
1602   } else {
1603     mbs=a->mbs;
1604     ii = a->i;
1605     xb = x;
1606   }
1607 
1608   switch (bs) {
1609   case 1:
1610     for (i=0; i<mbs; i++) {
1611       if (usecprow) xb = x + ridx[i];
1612       x1 = xb[0];
1613       ib = idx + ii[0];
1614       n  = ii[1] - ii[0]; ii++;
1615       for (j=0; j<n; j++) {
1616         rval     = ib[j];
1617         z[rval] += PetscConj(*v) * x1;
1618         v++;
1619       }
1620       if (!usecprow) xb++;
1621     }
1622     break;
1623   case 2:
1624     for (i=0; i<mbs; i++) {
1625       if (usecprow) xb = x + 2*ridx[i];
1626       x1 = xb[0]; x2 = xb[1];
1627       ib = idx + ii[0];
1628       n  = ii[1] - ii[0]; ii++;
1629       for (j=0; j<n; j++) {
1630         rval       = ib[j]*2;
1631         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1632         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1633         v         += 4;
1634       }
1635       if (!usecprow) xb += 2;
1636     }
1637     break;
1638   case 3:
1639     for (i=0; i<mbs; i++) {
1640       if (usecprow) xb = x + 3*ridx[i];
1641       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1642       ib = idx + ii[0];
1643       n  = ii[1] - ii[0]; ii++;
1644       for (j=0; j<n; j++) {
1645         rval       = ib[j]*3;
1646         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1647         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1648         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1649         v         += 9;
1650       }
1651       if (!usecprow) xb += 3;
1652     }
1653     break;
1654   case 4:
1655     for (i=0; i<mbs; i++) {
1656       if (usecprow) xb = x + 4*ridx[i];
1657       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1658       ib = idx + ii[0];
1659       n  = ii[1] - ii[0]; ii++;
1660       for (j=0; j<n; j++) {
1661         rval       = ib[j]*4;
1662         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1663         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1664         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1665         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1666         v         += 16;
1667       }
1668       if (!usecprow) xb += 4;
1669     }
1670     break;
1671   case 5:
1672     for (i=0; i<mbs; i++) {
1673       if (usecprow) xb = x + 5*ridx[i];
1674       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1675       x4 = xb[3]; x5 = xb[4];
1676       ib = idx + ii[0];
1677       n  = ii[1] - ii[0]; ii++;
1678       for (j=0; j<n; j++) {
1679         rval       = ib[j]*5;
1680         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1681         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1682         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1683         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1684         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1685         v         += 25;
1686       }
1687       if (!usecprow) xb += 5;
1688     }
1689     break;
1690   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1691     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1692 #if 0
1693     {
1694       PetscInt          ncols,k,bs2=a->bs2;
1695       PetscScalar       *work,*workt,zb;
1696       const PetscScalar *xtmp;
1697       if (!a->mult_work) {
1698         k    = PetscMax(A->rmap->n,A->cmap->n);
1699         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1700       }
1701       work = a->mult_work;
1702       xtmp = x;
1703       for (i=0; i<mbs; i++) {
1704         n     = ii[1] - ii[0]; ii++;
1705         ncols = n*bs;
1706         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1707         if (usecprow) xtmp = x + bs*ridx[i];
1708         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1709         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1710         v += n*bs2;
1711         if (!usecprow) xtmp += bs;
1712         workt = work;
1713         for (j=0; j<n; j++) {
1714           zb = z + bs*(*idx++);
1715           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1716           workt += bs;
1717         }
1718       }
1719     }
1720 #endif
1721   }
1722   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1723   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1724   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1725   PetscFunctionReturn(0);
1726 }
1727 
1728 #undef __FUNCT__
1729 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1730 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1731 {
1732   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1733   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
1734   const PetscScalar *x,*xb = 0;
1735   const MatScalar   *v;
1736   PetscErrorCode    ierr;
1737   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1738   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1739   Mat_CompressedRow cprow   = a->compressedrow;
1740   PetscBool         usecprow=cprow.use;
1741 
1742   PetscFunctionBegin;
1743   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1744   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1745   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1746 
1747   idx = a->j;
1748   v   = a->a;
1749   if (usecprow) {
1750     mbs  = cprow.nrows;
1751     ii   = cprow.i;
1752     ridx = cprow.rindex;
1753   } else {
1754     mbs=a->mbs;
1755     ii = a->i;
1756     xb = x;
1757   }
1758 
1759   switch (bs) {
1760   case 1:
1761     for (i=0; i<mbs; i++) {
1762       if (usecprow) xb = x + ridx[i];
1763       x1 = xb[0];
1764       ib = idx + ii[0];
1765       n  = ii[1] - ii[0]; ii++;
1766       for (j=0; j<n; j++) {
1767         rval     = ib[j];
1768         z[rval] += *v * x1;
1769         v++;
1770       }
1771       if (!usecprow) xb++;
1772     }
1773     break;
1774   case 2:
1775     for (i=0; i<mbs; i++) {
1776       if (usecprow) xb = x + 2*ridx[i];
1777       x1 = xb[0]; x2 = xb[1];
1778       ib = idx + ii[0];
1779       n  = ii[1] - ii[0]; ii++;
1780       for (j=0; j<n; j++) {
1781         rval       = ib[j]*2;
1782         z[rval++] += v[0]*x1 + v[1]*x2;
1783         z[rval++] += v[2]*x1 + v[3]*x2;
1784         v         += 4;
1785       }
1786       if (!usecprow) xb += 2;
1787     }
1788     break;
1789   case 3:
1790     for (i=0; i<mbs; i++) {
1791       if (usecprow) xb = x + 3*ridx[i];
1792       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1793       ib = idx + ii[0];
1794       n  = ii[1] - ii[0]; ii++;
1795       for (j=0; j<n; j++) {
1796         rval       = ib[j]*3;
1797         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1798         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1799         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1800         v         += 9;
1801       }
1802       if (!usecprow) xb += 3;
1803     }
1804     break;
1805   case 4:
1806     for (i=0; i<mbs; i++) {
1807       if (usecprow) xb = x + 4*ridx[i];
1808       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1809       ib = idx + ii[0];
1810       n  = ii[1] - ii[0]; ii++;
1811       for (j=0; j<n; j++) {
1812         rval       = ib[j]*4;
1813         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1814         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1815         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1816         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1817         v         += 16;
1818       }
1819       if (!usecprow) xb += 4;
1820     }
1821     break;
1822   case 5:
1823     for (i=0; i<mbs; i++) {
1824       if (usecprow) xb = x + 5*ridx[i];
1825       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1826       x4 = xb[3]; x5 = xb[4];
1827       ib = idx + ii[0];
1828       n  = ii[1] - ii[0]; ii++;
1829       for (j=0; j<n; j++) {
1830         rval       = ib[j]*5;
1831         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1832         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1833         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1834         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1835         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1836         v         += 25;
1837       }
1838       if (!usecprow) xb += 5;
1839     }
1840     break;
1841   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1842     PetscInt          ncols,k;
1843     PetscScalar       *work,*workt;
1844     const PetscScalar *xtmp;
1845     if (!a->mult_work) {
1846       k    = PetscMax(A->rmap->n,A->cmap->n);
1847       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1848     }
1849     work = a->mult_work;
1850     xtmp = x;
1851     for (i=0; i<mbs; i++) {
1852       n     = ii[1] - ii[0]; ii++;
1853       ncols = n*bs;
1854       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1855       if (usecprow) xtmp = x + bs*ridx[i];
1856       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1857       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1858       v += n*bs2;
1859       if (!usecprow) xtmp += bs;
1860       workt = work;
1861       for (j=0; j<n; j++) {
1862         zb = z + bs*(*idx++);
1863         for (k=0; k<bs; k++) zb[k] += workt[k];
1864         workt += bs;
1865       }
1866     }
1867     }
1868   }
1869   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1870   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1871   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1872   PetscFunctionReturn(0);
1873 }
1874 
1875 #undef __FUNCT__
1876 #define __FUNCT__ "MatScale_SeqBAIJ"
1877 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1878 {
1879   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
1880   PetscInt       totalnz = a->bs2*a->nz;
1881   PetscScalar    oalpha  = alpha;
1882   PetscErrorCode ierr;
1883   PetscBLASInt   one = 1,tnz;
1884 
1885   PetscFunctionBegin;
1886   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
1887   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1888   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1889   PetscFunctionReturn(0);
1890 }
1891 
1892 #undef __FUNCT__
1893 #define __FUNCT__ "MatNorm_SeqBAIJ"
1894 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1895 {
1896   PetscErrorCode ierr;
1897   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
1898   MatScalar      *v  = a->a;
1899   PetscReal      sum = 0.0;
1900   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1901 
1902   PetscFunctionBegin;
1903   if (type == NORM_FROBENIUS) {
1904     for (i=0; i< bs2*nz; i++) {
1905       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1906     }
1907     *norm = PetscSqrtReal(sum);
1908   } else if (type == NORM_1) { /* maximum column sum */
1909     PetscReal *tmp;
1910     PetscInt  *bcol = a->j;
1911     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
1912     for (i=0; i<nz; i++) {
1913       for (j=0; j<bs; j++) {
1914         k1 = bs*(*bcol) + j; /* column index */
1915         for (k=0; k<bs; k++) {
1916           tmp[k1] += PetscAbsScalar(*v); v++;
1917         }
1918       }
1919       bcol++;
1920     }
1921     *norm = 0.0;
1922     for (j=0; j<A->cmap->n; j++) {
1923       if (tmp[j] > *norm) *norm = tmp[j];
1924     }
1925     ierr = PetscFree(tmp);CHKERRQ(ierr);
1926   } else if (type == NORM_INFINITY) { /* maximum row sum */
1927     *norm = 0.0;
1928     for (k=0; k<bs; k++) {
1929       for (j=0; j<a->mbs; j++) {
1930         v   = a->a + bs2*a->i[j] + k;
1931         sum = 0.0;
1932         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1933           for (k1=0; k1<bs; k1++) {
1934             sum += PetscAbsScalar(*v);
1935             v   += bs;
1936           }
1937         }
1938         if (sum > *norm) *norm = sum;
1939       }
1940     }
1941   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1942   PetscFunctionReturn(0);
1943 }
1944 
1945 
1946 #undef __FUNCT__
1947 #define __FUNCT__ "MatEqual_SeqBAIJ"
1948 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1949 {
1950   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1951   PetscErrorCode ierr;
1952 
1953   PetscFunctionBegin;
1954   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1955   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1956     *flg = PETSC_FALSE;
1957     PetscFunctionReturn(0);
1958   }
1959 
1960   /* if the a->i are the same */
1961   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1962   if (!*flg) PetscFunctionReturn(0);
1963 
1964   /* if a->j are the same */
1965   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1966   if (!*flg) PetscFunctionReturn(0);
1967 
1968   /* if a->a are the same */
1969   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1970   PetscFunctionReturn(0);
1971 
1972 }
1973 
1974 #undef __FUNCT__
1975 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1976 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1977 {
1978   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1979   PetscErrorCode ierr;
1980   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1981   PetscScalar    *x,zero = 0.0;
1982   MatScalar      *aa,*aa_j;
1983 
1984   PetscFunctionBegin;
1985   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1986   bs   = A->rmap->bs;
1987   aa   = a->a;
1988   ai   = a->i;
1989   aj   = a->j;
1990   ambs = a->mbs;
1991   bs2  = a->bs2;
1992 
1993   ierr = VecSet(v,zero);CHKERRQ(ierr);
1994   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1995   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1996   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1997   for (i=0; i<ambs; i++) {
1998     for (j=ai[i]; j<ai[i+1]; j++) {
1999       if (aj[j] == i) {
2000         row  = i*bs;
2001         aa_j = aa+j*bs2;
2002         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2003         break;
2004       }
2005     }
2006   }
2007   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2008   PetscFunctionReturn(0);
2009 }
2010 
2011 #undef __FUNCT__
2012 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2013 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2014 {
2015   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2016   const PetscScalar *l,*r,*li,*ri;
2017   PetscScalar       x;
2018   MatScalar         *aa, *v;
2019   PetscErrorCode    ierr;
2020   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2021   const PetscInt    *ai,*aj;
2022 
2023   PetscFunctionBegin;
2024   ai  = a->i;
2025   aj  = a->j;
2026   aa  = a->a;
2027   m   = A->rmap->n;
2028   n   = A->cmap->n;
2029   bs  = A->rmap->bs;
2030   mbs = a->mbs;
2031   bs2 = a->bs2;
2032   if (ll) {
2033     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2034     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2035     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2036     for (i=0; i<mbs; i++) { /* for each block row */
2037       M  = ai[i+1] - ai[i];
2038       li = l + i*bs;
2039       v  = aa + bs2*ai[i];
2040       for (j=0; j<M; j++) { /* for each block */
2041         for (k=0; k<bs2; k++) {
2042           (*v++) *= li[k%bs];
2043         }
2044       }
2045     }
2046     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2047     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2048   }
2049 
2050   if (rr) {
2051     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2052     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2053     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2054     for (i=0; i<mbs; i++) { /* for each block row */
2055       iai = ai[i];
2056       M   = ai[i+1] - iai;
2057       v   = aa + bs2*iai;
2058       for (j=0; j<M; j++) { /* for each block */
2059         ri = r + bs*aj[iai+j];
2060         for (k=0; k<bs; k++) {
2061           x = ri[k];
2062           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2063           v += bs;
2064         }
2065       }
2066     }
2067     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2068     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2069   }
2070   PetscFunctionReturn(0);
2071 }
2072 
2073 
2074 #undef __FUNCT__
2075 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2076 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2077 {
2078   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2079 
2080   PetscFunctionBegin;
2081   info->block_size   = a->bs2;
2082   info->nz_allocated = a->bs2*a->maxnz;
2083   info->nz_used      = a->bs2*a->nz;
2084   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2085   info->assemblies   = A->num_ass;
2086   info->mallocs      = A->info.mallocs;
2087   info->memory       = ((PetscObject)A)->mem;
2088   if (A->factortype) {
2089     info->fill_ratio_given  = A->info.fill_ratio_given;
2090     info->fill_ratio_needed = A->info.fill_ratio_needed;
2091     info->factor_mallocs    = A->info.factor_mallocs;
2092   } else {
2093     info->fill_ratio_given  = 0;
2094     info->fill_ratio_needed = 0;
2095     info->factor_mallocs    = 0;
2096   }
2097   PetscFunctionReturn(0);
2098 }
2099 
2100 #undef __FUNCT__
2101 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2102 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2103 {
2104   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2105   PetscErrorCode ierr;
2106 
2107   PetscFunctionBegin;
2108   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2109   PetscFunctionReturn(0);
2110 }
2111