xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision bebe2cf65d55febe21a5af8db2bd2e168caaa2e7)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
14   const PetscInt *idx;
15   PetscInt       start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table;
17 
18   PetscFunctionBegin;
19   m  = a->mbs;
20   ai = a->i;
21   aj = a->j;
22   bs = A->rmap->bs;
23 
24   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
27   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
28   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) {
31     /* Initialise the two local arrays */
32     isz  = 0;
33     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
37     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j=0; j<n; ++j) {
41       ival = idx[j]/bs; /* convert the indices into block indices */
42       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
44     }
45     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
46     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
47 
48     k = 0;
49     for (j=0; j<ov; j++) { /* for each overlap*/
50       n = isz;
51       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row+1];
55         for (l = start; l<end; l++) {
56           val = aj[l];
57           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
58         }
59       }
60     }
61     /* expand the Index Set */
62     for (j=0; j<isz; j++) {
63       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
64     }
65     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
66   }
67   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
68   ierr = PetscFree(nidx);CHKERRQ(ierr);
69   ierr = PetscFree(nidx2);CHKERRQ(ierr);
70   PetscFunctionReturn(0);
71 }
72 
73 #undef __FUNCT__
74 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
75 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76 {
77   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
78   PetscErrorCode ierr;
79   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
81   const PetscInt *irow,*icol;
82   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83   PetscInt       *aj = a->j,*ai = a->i;
84   MatScalar      *mat_a;
85   Mat            C;
86   PetscBool      flag;
87 
88   PetscFunctionBegin;
89 
90   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
91   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
92   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
93   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
94 
95   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
96   ssmap = smap;
97   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
98   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99   /* determine lens of each row */
100   for (i=0; i<nrows; i++) {
101     kstart  = ai[irow[i]];
102     kend    = kstart + a->ilen[irow[i]];
103     lens[i] = 0;
104     for (k=kstart; k<kend; k++) {
105       if (ssmap[aj[k]]) lens[i]++;
106     }
107   }
108   /* Create and fill new matrix */
109   if (scall == MAT_REUSE_MATRIX) {
110     c = (Mat_SeqBAIJ*)((*B)->data);
111 
112     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
114     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
116     C    = *B;
117   } else {
118     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
119     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
120     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
121     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
122   }
123   c = (Mat_SeqBAIJ*)(C->data);
124   for (i=0; i<nrows; i++) {
125     row      = irow[i];
126     kstart   = ai[row];
127     kend     = kstart + a->ilen[row];
128     mat_i    = c->i[i];
129     mat_j    = c->j + mat_i;
130     mat_a    = c->a + mat_i*bs2;
131     mat_ilen = c->ilen + i;
132     for (k=kstart; k<kend; k++) {
133       if ((tcol=ssmap[a->j[k]])) {
134         *mat_j++ = tcol - 1;
135         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
136         mat_a   += bs2;
137         (*mat_ilen)++;
138       }
139     }
140   }
141   /* sort */
142   {
143     MatScalar *work;
144 
145     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
146     for (i=0; i<nrows; i++) {
147       PetscInt ilen;
148       mat_i = c->i[i];
149       mat_j = c->j + mat_i;
150       mat_a = c->a + mat_i*bs2;
151       ilen  = c->ilen[i];
152       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
153     }
154     ierr = PetscFree(work);CHKERRQ(ierr);
155   }
156 
157   /* Free work space */
158   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
159   ierr = PetscFree(smap);CHKERRQ(ierr);
160   ierr = PetscFree(lens);CHKERRQ(ierr);
161   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
162   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
163 
164   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
165   *B   = C;
166   PetscFunctionReturn(0);
167 }
168 
169 #undef __FUNCT__
170 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
171 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
172 {
173   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
174   IS             is1,is2;
175   PetscErrorCode ierr;
176   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
177   const PetscInt *irow,*icol;
178 
179   PetscFunctionBegin;
180   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
181   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
182   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
183   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
184 
185   /* Verify if the indices corespond to each element in a block
186    and form the IS with compressed IS */
187   maxmnbs = PetscMax(a->mbs,a->nbs);
188   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
189   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
190   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
191   for (i=0; i<a->mbs; i++) {
192     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
193   }
194   count = 0;
195   for (i=0; i<nrows; i++) {
196     PetscInt j = irow[i] / bs;
197     if ((vary[j]--)==bs) iary[count++] = j;
198   }
199   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
200 
201   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
202   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
203   for (i=0; i<a->nbs; i++) {
204     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
205   }
206   count = 0;
207   for (i=0; i<ncols; i++) {
208     PetscInt j = icol[i] / bs;
209     if ((vary[j]--)==bs) iary[count++] = j;
210   }
211   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
212   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
213   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
214   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
215 
216   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
217   ierr = ISDestroy(&is1);CHKERRQ(ierr);
218   ierr = ISDestroy(&is2);CHKERRQ(ierr);
219   PetscFunctionReturn(0);
220 }
221 
222 #undef __FUNCT__
223 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
224 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
225 {
226   PetscErrorCode ierr;
227   PetscInt       i;
228 
229   PetscFunctionBegin;
230   if (scall == MAT_INITIAL_MATRIX) {
231     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
232   }
233 
234   for (i=0; i<n; i++) {
235     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
236   }
237   PetscFunctionReturn(0);
238 }
239 
240 
241 /* -------------------------------------------------------*/
242 /* Should check that shapes of vectors and matrices match */
243 /* -------------------------------------------------------*/
244 
245 #undef __FUNCT__
246 #define __FUNCT__ "MatMult_SeqBAIJ_1"
247 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
248 {
249   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
250   PetscScalar       *z,sum;
251   const PetscScalar *x;
252   const MatScalar   *v;
253   PetscErrorCode    ierr;
254   PetscInt          mbs,i,n;
255   const PetscInt    *idx,*ii,*ridx=NULL;
256   PetscBool         usecprow=a->compressedrow.use;
257 
258   PetscFunctionBegin;
259   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
260   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
261 
262   if (usecprow) {
263     mbs  = a->compressedrow.nrows;
264     ii   = a->compressedrow.i;
265     ridx = a->compressedrow.rindex;
266     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
267   } else {
268     mbs = a->mbs;
269     ii  = a->i;
270   }
271 
272   for (i=0; i<mbs; i++) {
273     n   = ii[1] - ii[0];
274     v   = a->a + ii[0];
275     idx = a->j + ii[0];
276     ii++;
277     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
278     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
279     sum = 0.0;
280     PetscSparseDensePlusDot(sum,x,v,idx,n);
281     if (usecprow) {
282       z[ridx[i]] = sum;
283     } else {
284       z[i]        = sum;
285     }
286   }
287   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
288   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
289   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
290   PetscFunctionReturn(0);
291 }
292 
293 #undef __FUNCT__
294 #define __FUNCT__ "MatMult_SeqBAIJ_2"
295 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
296 {
297   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
298   PetscScalar       *z = 0,sum1,sum2,*zarray;
299   const PetscScalar *x,*xb;
300   PetscScalar       x1,x2;
301   const MatScalar   *v;
302   PetscErrorCode    ierr;
303   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
304   PetscBool         usecprow=a->compressedrow.use;
305 
306   PetscFunctionBegin;
307   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
308   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
309 
310   idx = a->j;
311   v   = a->a;
312   if (usecprow) {
313     mbs  = a->compressedrow.nrows;
314     ii   = a->compressedrow.i;
315     ridx = a->compressedrow.rindex;
316   } else {
317     mbs = a->mbs;
318     ii  = a->i;
319     z   = zarray;
320   }
321 
322   for (i=0; i<mbs; i++) {
323     n           = ii[1] - ii[0]; ii++;
324     sum1        = 0.0; sum2 = 0.0;
325     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
326     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
327     for (j=0; j<n; j++) {
328       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
329       sum1 += v[0]*x1 + v[2]*x2;
330       sum2 += v[1]*x1 + v[3]*x2;
331       v    += 4;
332     }
333     if (usecprow) z = zarray + 2*ridx[i];
334     z[0] = sum1; z[1] = sum2;
335     if (!usecprow) z += 2;
336   }
337   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
338   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
339   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
340   PetscFunctionReturn(0);
341 }
342 
343 #undef __FUNCT__
344 #define __FUNCT__ "MatMult_SeqBAIJ_3"
345 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
346 {
347   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
348   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
349   const PetscScalar *x,*xb;
350   const MatScalar   *v;
351   PetscErrorCode    ierr;
352   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
353   PetscBool         usecprow=a->compressedrow.use;
354 
355 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
356 #pragma disjoint(*v,*z,*xb)
357 #endif
358 
359   PetscFunctionBegin;
360   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
361   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
362 
363   idx = a->j;
364   v   = a->a;
365   if (usecprow) {
366     mbs  = a->compressedrow.nrows;
367     ii   = a->compressedrow.i;
368     ridx = a->compressedrow.rindex;
369   } else {
370     mbs = a->mbs;
371     ii  = a->i;
372     z   = zarray;
373   }
374 
375   for (i=0; i<mbs; i++) {
376     n           = ii[1] - ii[0]; ii++;
377     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
378     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
379     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
380     for (j=0; j<n; j++) {
381       xb = x + 3*(*idx++);
382       x1 = xb[0];
383       x2 = xb[1];
384       x3 = xb[2];
385 
386       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
387       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
388       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
389       v    += 9;
390     }
391     if (usecprow) z = zarray + 3*ridx[i];
392     z[0] = sum1; z[1] = sum2; z[2] = sum3;
393     if (!usecprow) z += 3;
394   }
395   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
396   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
397   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
398   PetscFunctionReturn(0);
399 }
400 
401 #undef __FUNCT__
402 #define __FUNCT__ "MatMult_SeqBAIJ_4"
403 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
404 {
405   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
406   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
407   const PetscScalar *x,*xb;
408   const MatScalar   *v;
409   PetscErrorCode    ierr;
410   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
411   PetscBool         usecprow=a->compressedrow.use;
412 
413   PetscFunctionBegin;
414   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
415   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
416 
417   idx = a->j;
418   v   = a->a;
419   if (usecprow) {
420     mbs  = a->compressedrow.nrows;
421     ii   = a->compressedrow.i;
422     ridx = a->compressedrow.rindex;
423   } else {
424     mbs = a->mbs;
425     ii  = a->i;
426     z   = zarray;
427   }
428 
429   for (i=0; i<mbs; i++) {
430     n = ii[1] - ii[0];
431     ii++;
432     sum1 = 0.0;
433     sum2 = 0.0;
434     sum3 = 0.0;
435     sum4 = 0.0;
436 
437     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
438     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
439     for (j=0; j<n; j++) {
440       xb    = x + 4*(*idx++);
441       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
442       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
443       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
444       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
445       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
446       v    += 16;
447     }
448     if (usecprow) z = zarray + 4*ridx[i];
449     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
450     if (!usecprow) z += 4;
451   }
452   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
453   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
454   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
455   PetscFunctionReturn(0);
456 }
457 
458 #undef __FUNCT__
459 #define __FUNCT__ "MatMult_SeqBAIJ_5"
460 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
461 {
462   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
463   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
464   const PetscScalar *xb,*x;
465   const MatScalar   *v;
466   PetscErrorCode    ierr;
467   const PetscInt    *idx,*ii,*ridx=NULL;
468   PetscInt          mbs,i,j,n;
469   PetscBool         usecprow=a->compressedrow.use;
470 
471   PetscFunctionBegin;
472   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
473   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
474 
475   idx = a->j;
476   v   = a->a;
477   if (usecprow) {
478     mbs  = a->compressedrow.nrows;
479     ii   = a->compressedrow.i;
480     ridx = a->compressedrow.rindex;
481   } else {
482     mbs = a->mbs;
483     ii  = a->i;
484     z   = zarray;
485   }
486 
487   for (i=0; i<mbs; i++) {
488     n           = ii[1] - ii[0]; ii++;
489     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
490     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
491     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
492     for (j=0; j<n; j++) {
493       xb    = x + 5*(*idx++);
494       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
495       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
496       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
497       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
498       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
499       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
500       v    += 25;
501     }
502     if (usecprow) z = zarray + 5*ridx[i];
503     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
504     if (!usecprow) z += 5;
505   }
506   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
507   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
508   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
509   PetscFunctionReturn(0);
510 }
511 
512 
513 
514 #undef __FUNCT__
515 #define __FUNCT__ "MatMult_SeqBAIJ_6"
516 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
517 {
518   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
519   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
520   const PetscScalar *x,*xb;
521   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
522   const MatScalar   *v;
523   PetscErrorCode    ierr;
524   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
525   PetscBool         usecprow=a->compressedrow.use;
526 
527   PetscFunctionBegin;
528   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
529   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
530 
531   idx = a->j;
532   v   = a->a;
533   if (usecprow) {
534     mbs  = a->compressedrow.nrows;
535     ii   = a->compressedrow.i;
536     ridx = a->compressedrow.rindex;
537   } else {
538     mbs = a->mbs;
539     ii  = a->i;
540     z   = zarray;
541   }
542 
543   for (i=0; i<mbs; i++) {
544     n  = ii[1] - ii[0];
545     ii++;
546     sum1 = 0.0;
547     sum2 = 0.0;
548     sum3 = 0.0;
549     sum4 = 0.0;
550     sum5 = 0.0;
551     sum6 = 0.0;
552 
553     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
554     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
555     for (j=0; j<n; j++) {
556       xb    = x + 6*(*idx++);
557       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
558       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
559       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
560       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
561       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
562       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
563       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
564       v    += 36;
565     }
566     if (usecprow) z = zarray + 6*ridx[i];
567     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
568     if (!usecprow) z += 6;
569   }
570 
571   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
572   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
573   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
574   PetscFunctionReturn(0);
575 }
576 
577 #undef __FUNCT__
578 #define __FUNCT__ "MatMult_SeqBAIJ_7"
579 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
580 {
581   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
582   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
583   const PetscScalar *x,*xb;
584   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
585   const MatScalar   *v;
586   PetscErrorCode    ierr;
587   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
588   PetscBool         usecprow=a->compressedrow.use;
589 
590   PetscFunctionBegin;
591   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
592   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
593 
594   idx = a->j;
595   v   = a->a;
596   if (usecprow) {
597     mbs  = a->compressedrow.nrows;
598     ii   = a->compressedrow.i;
599     ridx = a->compressedrow.rindex;
600   } else {
601     mbs = a->mbs;
602     ii  = a->i;
603     z   = zarray;
604   }
605 
606   for (i=0; i<mbs; i++) {
607     n  = ii[1] - ii[0];
608     ii++;
609     sum1 = 0.0;
610     sum2 = 0.0;
611     sum3 = 0.0;
612     sum4 = 0.0;
613     sum5 = 0.0;
614     sum6 = 0.0;
615     sum7 = 0.0;
616 
617     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
618     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
619     for (j=0; j<n; j++) {
620       xb    = x + 7*(*idx++);
621       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
622       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
623       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
624       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
625       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
626       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
627       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
628       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
629       v    += 49;
630     }
631     if (usecprow) z = zarray + 7*ridx[i];
632     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
633     if (!usecprow) z += 7;
634   }
635 
636   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
637   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
638   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
639   PetscFunctionReturn(0);
640 }
641 
642 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
643 /* Default MatMult for block size 15 */
644 
645 #undef __FUNCT__
646 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
647 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
648 {
649   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
650   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
651   const PetscScalar *x,*xb;
652   PetscScalar       *zarray,xv;
653   const MatScalar   *v;
654   PetscErrorCode    ierr;
655   const PetscInt    *ii,*ij=a->j,*idx;
656   PetscInt          mbs,i,j,k,n,*ridx=NULL;
657   PetscBool         usecprow=a->compressedrow.use;
658 
659   PetscFunctionBegin;
660   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
661   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
662 
663   v = a->a;
664   if (usecprow) {
665     mbs  = a->compressedrow.nrows;
666     ii   = a->compressedrow.i;
667     ridx = a->compressedrow.rindex;
668   } else {
669     mbs = a->mbs;
670     ii  = a->i;
671     z   = zarray;
672   }
673 
674   for (i=0; i<mbs; i++) {
675     n    = ii[i+1] - ii[i];
676     idx  = ij + ii[i];
677     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
678     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
679 
680     for (j=0; j<n; j++) {
681       xb = x + 15*(idx[j]);
682 
683       for (k=0; k<15; k++) {
684         xv     =  xb[k];
685         sum1  += v[0]*xv;
686         sum2  += v[1]*xv;
687         sum3  += v[2]*xv;
688         sum4  += v[3]*xv;
689         sum5  += v[4]*xv;
690         sum6  += v[5]*xv;
691         sum7  += v[6]*xv;
692         sum8  += v[7]*xv;
693         sum9  += v[8]*xv;
694         sum10 += v[9]*xv;
695         sum11 += v[10]*xv;
696         sum12 += v[11]*xv;
697         sum13 += v[12]*xv;
698         sum14 += v[13]*xv;
699         sum15 += v[14]*xv;
700         v     += 15;
701       }
702     }
703     if (usecprow) z = zarray + 15*ridx[i];
704     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
705     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
706 
707     if (!usecprow) z += 15;
708   }
709 
710   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
711   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
712   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
713   PetscFunctionReturn(0);
714 }
715 
716 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
717 #undef __FUNCT__
718 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
719 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
720 {
721   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
722   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
723   const PetscScalar *x,*xb;
724   PetscScalar       x1,x2,x3,x4,*zarray;
725   const MatScalar   *v;
726   PetscErrorCode    ierr;
727   const PetscInt    *ii,*ij=a->j,*idx;
728   PetscInt          mbs,i,j,n,*ridx=NULL;
729   PetscBool         usecprow=a->compressedrow.use;
730 
731   PetscFunctionBegin;
732   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
733   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
734 
735   v = a->a;
736   if (usecprow) {
737     mbs  = a->compressedrow.nrows;
738     ii   = a->compressedrow.i;
739     ridx = a->compressedrow.rindex;
740   } else {
741     mbs = a->mbs;
742     ii  = a->i;
743     z   = zarray;
744   }
745 
746   for (i=0; i<mbs; i++) {
747     n    = ii[i+1] - ii[i];
748     idx  = ij + ii[i];
749     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
750     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
751 
752     for (j=0; j<n; j++) {
753       xb = x + 15*(idx[j]);
754       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
755 
756       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
757       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
758       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
759       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
760       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
761       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
762       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
763       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
764       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
765       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
766       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
767       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
768       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
769       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
770       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
771 
772       v += 60;
773 
774       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
775 
776       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
777       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
778       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
779       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
780       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
781       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
782       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
783       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
784       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
785       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
786       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
787       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
788       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
789       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
790       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
791       v     += 60;
792 
793       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
794       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
795       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
796       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
797       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
798       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
799       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
800       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
801       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
802       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
803       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
804       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
805       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
806       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
807       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
808       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
809       v     += 60;
810 
811       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
812       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
813       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
814       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
815       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
816       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
817       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
818       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
819       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
820       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
821       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
822       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
823       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
824       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
825       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
826       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
827       v     += 45;
828     }
829     if (usecprow) z = zarray + 15*ridx[i];
830     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
831     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
832 
833     if (!usecprow) z += 15;
834   }
835 
836   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
837   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
838   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
839   PetscFunctionReturn(0);
840 }
841 
842 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
843 #undef __FUNCT__
844 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
845 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
846 {
847   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
848   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
849   const PetscScalar *x,*xb;
850   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
851   const MatScalar   *v;
852   PetscErrorCode    ierr;
853   const PetscInt    *ii,*ij=a->j,*idx;
854   PetscInt          mbs,i,j,n,*ridx=NULL;
855   PetscBool         usecprow=a->compressedrow.use;
856 
857   PetscFunctionBegin;
858   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
859   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
860 
861   v = a->a;
862   if (usecprow) {
863     mbs  = a->compressedrow.nrows;
864     ii   = a->compressedrow.i;
865     ridx = a->compressedrow.rindex;
866   } else {
867     mbs = a->mbs;
868     ii  = a->i;
869     z   = zarray;
870   }
871 
872   for (i=0; i<mbs; i++) {
873     n    = ii[i+1] - ii[i];
874     idx  = ij + ii[i];
875     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
876     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
877 
878     for (j=0; j<n; j++) {
879       xb = x + 15*(idx[j]);
880       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
881       x8 = xb[7];
882 
883       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
884       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
885       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
886       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
887       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
888       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
889       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
890       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
891       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
892       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
893       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
894       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
895       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
896       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
897       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
898       v     += 120;
899 
900       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
901 
902       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
903       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
904       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
905       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
906       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
907       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
908       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
909       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
910       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
911       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
912       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
913       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
914       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
915       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
916       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
917       v     += 105;
918     }
919     if (usecprow) z = zarray + 15*ridx[i];
920     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
921     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
922 
923     if (!usecprow) z += 15;
924   }
925 
926   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
927   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
928   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
929   PetscFunctionReturn(0);
930 }
931 
932 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
933 
934 #undef __FUNCT__
935 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
936 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
937 {
938   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
939   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
940   const PetscScalar *x,*xb;
941   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
942   const MatScalar   *v;
943   PetscErrorCode    ierr;
944   const PetscInt    *ii,*ij=a->j,*idx;
945   PetscInt          mbs,i,j,n,*ridx=NULL;
946   PetscBool         usecprow=a->compressedrow.use;
947 
948   PetscFunctionBegin;
949   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
950   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
951 
952   v = a->a;
953   if (usecprow) {
954     mbs  = a->compressedrow.nrows;
955     ii   = a->compressedrow.i;
956     ridx = a->compressedrow.rindex;
957   } else {
958     mbs = a->mbs;
959     ii  = a->i;
960     z   = zarray;
961   }
962 
963   for (i=0; i<mbs; i++) {
964     n    = ii[i+1] - ii[i];
965     idx  = ij + ii[i];
966     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
967     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
968 
969     for (j=0; j<n; j++) {
970       xb = x + 15*(idx[j]);
971       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
972       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
973 
974       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
975       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
976       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
977       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
978       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
979       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
980       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
981       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
982       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
983       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
984       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
985       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
986       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
987       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
988       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
989       v     += 225;
990     }
991     if (usecprow) z = zarray + 15*ridx[i];
992     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
993     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
994 
995     if (!usecprow) z += 15;
996   }
997 
998   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
999   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1000   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1001   PetscFunctionReturn(0);
1002 }
1003 
1004 
1005 /*
1006     This will not work with MatScalar == float because it calls the BLAS
1007 */
1008 #undef __FUNCT__
1009 #define __FUNCT__ "MatMult_SeqBAIJ_N"
1010 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1011 {
1012   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1013   PetscScalar       *z = 0,*work,*workt,*zarray;
1014   const PetscScalar *x,*xb;
1015   const MatScalar   *v;
1016   PetscErrorCode    ierr;
1017   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1018   const PetscInt    *idx,*ii,*ridx=NULL;
1019   PetscInt          ncols,k;
1020   PetscBool         usecprow=a->compressedrow.use;
1021 
1022   PetscFunctionBegin;
1023   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1024   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1025 
1026   idx = a->j;
1027   v   = a->a;
1028   if (usecprow) {
1029     mbs  = a->compressedrow.nrows;
1030     ii   = a->compressedrow.i;
1031     ridx = a->compressedrow.rindex;
1032   } else {
1033     mbs = a->mbs;
1034     ii  = a->i;
1035     z   = zarray;
1036   }
1037 
1038   if (!a->mult_work) {
1039     k    = PetscMax(A->rmap->n,A->cmap->n);
1040     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1041   }
1042   work = a->mult_work;
1043   for (i=0; i<mbs; i++) {
1044     n           = ii[1] - ii[0]; ii++;
1045     ncols       = n*bs;
1046     workt       = work;
1047     for (j=0; j<n; j++) {
1048       xb = x + bs*(*idx++);
1049       for (k=0; k<bs; k++) workt[k] = xb[k];
1050       workt += bs;
1051     }
1052     if (usecprow) z = zarray + bs*ridx[i];
1053     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1054     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1055     v += n*bs2;
1056     if (!usecprow) z += bs;
1057   }
1058   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1059   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1060   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1061   PetscFunctionReturn(0);
1062 }
1063 
1064 #undef __FUNCT__
1065 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1066 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1067 {
1068   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1069   const PetscScalar *x;
1070   PetscScalar       *y,*z,sum;
1071   const MatScalar   *v;
1072   PetscErrorCode    ierr;
1073   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1074   const PetscInt    *idx,*ii;
1075   PetscBool         usecprow=a->compressedrow.use;
1076 
1077   PetscFunctionBegin;
1078   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1079   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1080 
1081   idx = a->j;
1082   v   = a->a;
1083   if (usecprow) {
1084     if (zz != yy) {
1085       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1086     }
1087     mbs  = a->compressedrow.nrows;
1088     ii   = a->compressedrow.i;
1089     ridx = a->compressedrow.rindex;
1090   } else {
1091     ii = a->i;
1092   }
1093 
1094   for (i=0; i<mbs; i++) {
1095     n = ii[1] - ii[0];
1096     ii++;
1097     if (!usecprow) {
1098       sum         = y[i];
1099     } else {
1100       sum = y[ridx[i]];
1101     }
1102     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1103     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1104     PetscSparseDensePlusDot(sum,x,v,idx,n);
1105     v   += n;
1106     idx += n;
1107     if (usecprow) {
1108       z[ridx[i]] = sum;
1109     } else {
1110       z[i] = sum;
1111     }
1112   }
1113   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1114   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1115   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1116   PetscFunctionReturn(0);
1117 }
1118 
1119 #undef __FUNCT__
1120 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1121 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1122 {
1123   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1124   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1125   const PetscScalar *x,*xb;
1126   PetscScalar       x1,x2,*yarray,*zarray;
1127   const MatScalar   *v;
1128   PetscErrorCode    ierr;
1129   PetscInt          mbs = a->mbs,i,n,j;
1130   const PetscInt    *idx,*ii,*ridx = NULL;
1131   PetscBool         usecprow = a->compressedrow.use;
1132 
1133   PetscFunctionBegin;
1134   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1135   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1136 
1137   idx = a->j;
1138   v   = a->a;
1139   if (usecprow) {
1140     if (zz != yy) {
1141       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1142     }
1143     mbs  = a->compressedrow.nrows;
1144     ii   = a->compressedrow.i;
1145     ridx = a->compressedrow.rindex;
1146     if (zz != yy) {
1147       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1148     }
1149   } else {
1150     ii = a->i;
1151     y  = yarray;
1152     z  = zarray;
1153   }
1154 
1155   for (i=0; i<mbs; i++) {
1156     n = ii[1] - ii[0]; ii++;
1157     if (usecprow) {
1158       z = zarray + 2*ridx[i];
1159       y = yarray + 2*ridx[i];
1160     }
1161     sum1 = y[0]; sum2 = y[1];
1162     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1163     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1164     for (j=0; j<n; j++) {
1165       xb = x + 2*(*idx++);
1166       x1 = xb[0];
1167       x2 = xb[1];
1168 
1169       sum1 += v[0]*x1 + v[2]*x2;
1170       sum2 += v[1]*x1 + v[3]*x2;
1171       v    += 4;
1172     }
1173     z[0] = sum1; z[1] = sum2;
1174     if (!usecprow) {
1175       z += 2; y += 2;
1176     }
1177   }
1178   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1179   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1180   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1181   PetscFunctionReturn(0);
1182 }
1183 
1184 #undef __FUNCT__
1185 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1186 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1187 {
1188   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1189   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1190   const PetscScalar *x,*xb;
1191   const MatScalar   *v;
1192   PetscErrorCode    ierr;
1193   PetscInt          mbs = a->mbs,i,j,n;
1194   const PetscInt    *idx,*ii,*ridx = NULL;
1195   PetscBool         usecprow = a->compressedrow.use;
1196 
1197   PetscFunctionBegin;
1198   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1199   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1200 
1201   idx = a->j;
1202   v   = a->a;
1203   if (usecprow) {
1204     if (zz != yy) {
1205       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1206     }
1207     mbs  = a->compressedrow.nrows;
1208     ii   = a->compressedrow.i;
1209     ridx = a->compressedrow.rindex;
1210   } else {
1211     ii = a->i;
1212     y  = yarray;
1213     z  = zarray;
1214   }
1215 
1216   for (i=0; i<mbs; i++) {
1217     n = ii[1] - ii[0]; ii++;
1218     if (usecprow) {
1219       z = zarray + 3*ridx[i];
1220       y = yarray + 3*ridx[i];
1221     }
1222     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1223     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1224     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1225     for (j=0; j<n; j++) {
1226       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1227       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1228       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1229       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1230       v    += 9;
1231     }
1232     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1233     if (!usecprow) {
1234       z += 3; y += 3;
1235     }
1236   }
1237   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1238   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1239   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1240   PetscFunctionReturn(0);
1241 }
1242 
1243 #undef __FUNCT__
1244 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1245 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1246 {
1247   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1248   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1249   const PetscScalar *x,*xb;
1250   const MatScalar   *v;
1251   PetscErrorCode    ierr;
1252   PetscInt          mbs = a->mbs,i,j,n;
1253   const PetscInt    *idx,*ii,*ridx=NULL;
1254   PetscBool         usecprow=a->compressedrow.use;
1255 
1256   PetscFunctionBegin;
1257   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1258   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1259 
1260   idx = a->j;
1261   v   = a->a;
1262   if (usecprow) {
1263     if (zz != yy) {
1264       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1265     }
1266     mbs  = a->compressedrow.nrows;
1267     ii   = a->compressedrow.i;
1268     ridx = a->compressedrow.rindex;
1269   } else {
1270     ii = a->i;
1271     y  = yarray;
1272     z  = zarray;
1273   }
1274 
1275   for (i=0; i<mbs; i++) {
1276     n = ii[1] - ii[0]; ii++;
1277     if (usecprow) {
1278       z = zarray + 4*ridx[i];
1279       y = yarray + 4*ridx[i];
1280     }
1281     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1282     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1283     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1284     for (j=0; j<n; j++) {
1285       xb    = x + 4*(*idx++);
1286       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1287       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1288       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1289       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1290       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1291       v    += 16;
1292     }
1293     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1294     if (!usecprow) {
1295       z += 4; y += 4;
1296     }
1297   }
1298   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1299   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1300   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1301   PetscFunctionReturn(0);
1302 }
1303 
1304 #undef __FUNCT__
1305 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1306 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1307 {
1308   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1309   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1310   const PetscScalar *x,*xb;
1311   PetscScalar       *yarray,*zarray;
1312   const MatScalar   *v;
1313   PetscErrorCode    ierr;
1314   PetscInt          mbs = a->mbs,i,j,n;
1315   const PetscInt    *idx,*ii,*ridx = NULL;
1316   PetscBool         usecprow=a->compressedrow.use;
1317 
1318   PetscFunctionBegin;
1319   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1320   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1321 
1322   idx = a->j;
1323   v   = a->a;
1324   if (usecprow) {
1325     if (zz != yy) {
1326       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1327     }
1328     mbs  = a->compressedrow.nrows;
1329     ii   = a->compressedrow.i;
1330     ridx = a->compressedrow.rindex;
1331   } else {
1332     ii = a->i;
1333     y  = yarray;
1334     z  = zarray;
1335   }
1336 
1337   for (i=0; i<mbs; i++) {
1338     n = ii[1] - ii[0]; ii++;
1339     if (usecprow) {
1340       z = zarray + 5*ridx[i];
1341       y = yarray + 5*ridx[i];
1342     }
1343     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1344     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1345     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1346     for (j=0; j<n; j++) {
1347       xb    = x + 5*(*idx++);
1348       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1349       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1350       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1351       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1352       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1353       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1354       v    += 25;
1355     }
1356     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1357     if (!usecprow) {
1358       z += 5; y += 5;
1359     }
1360   }
1361   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1362   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1363   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1364   PetscFunctionReturn(0);
1365 }
1366 #undef __FUNCT__
1367 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1368 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1369 {
1370   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1371   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1372   const PetscScalar *x,*xb;
1373   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1374   const MatScalar   *v;
1375   PetscErrorCode    ierr;
1376   PetscInt          mbs = a->mbs,i,j,n;
1377   const PetscInt    *idx,*ii,*ridx=NULL;
1378   PetscBool         usecprow=a->compressedrow.use;
1379 
1380   PetscFunctionBegin;
1381   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1382   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1383 
1384   idx = a->j;
1385   v   = a->a;
1386   if (usecprow) {
1387     if (zz != yy) {
1388       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1389     }
1390     mbs  = a->compressedrow.nrows;
1391     ii   = a->compressedrow.i;
1392     ridx = a->compressedrow.rindex;
1393   } else {
1394     ii = a->i;
1395     y  = yarray;
1396     z  = zarray;
1397   }
1398 
1399   for (i=0; i<mbs; i++) {
1400     n = ii[1] - ii[0]; ii++;
1401     if (usecprow) {
1402       z = zarray + 6*ridx[i];
1403       y = yarray + 6*ridx[i];
1404     }
1405     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1406     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1407     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1408     for (j=0; j<n; j++) {
1409       xb    = x + 6*(*idx++);
1410       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1411       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1412       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1413       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1414       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1415       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1416       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1417       v    += 36;
1418     }
1419     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1420     if (!usecprow) {
1421       z += 6; y += 6;
1422     }
1423   }
1424   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1425   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1426   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1427   PetscFunctionReturn(0);
1428 }
1429 
1430 #undef __FUNCT__
1431 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1432 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1433 {
1434   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1435   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1436   const PetscScalar *x,*xb;
1437   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1438   const MatScalar   *v;
1439   PetscErrorCode    ierr;
1440   PetscInt          mbs = a->mbs,i,j,n;
1441   const PetscInt    *idx,*ii,*ridx = NULL;
1442   PetscBool         usecprow=a->compressedrow.use;
1443 
1444   PetscFunctionBegin;
1445   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1446   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1447 
1448   idx = a->j;
1449   v   = a->a;
1450   if (usecprow) {
1451     if (zz != yy) {
1452       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1453     }
1454     mbs  = a->compressedrow.nrows;
1455     ii   = a->compressedrow.i;
1456     ridx = a->compressedrow.rindex;
1457   } else {
1458     ii = a->i;
1459     y  = yarray;
1460     z  = zarray;
1461   }
1462 
1463   for (i=0; i<mbs; i++) {
1464     n = ii[1] - ii[0]; ii++;
1465     if (usecprow) {
1466       z = zarray + 7*ridx[i];
1467       y = yarray + 7*ridx[i];
1468     }
1469     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1470     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1471     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1472     for (j=0; j<n; j++) {
1473       xb    = x + 7*(*idx++);
1474       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1475       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1476       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1477       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1478       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1479       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1480       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1481       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1482       v    += 49;
1483     }
1484     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1485     if (!usecprow) {
1486       z += 7; y += 7;
1487     }
1488   }
1489   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1490   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1491   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1492   PetscFunctionReturn(0);
1493 }
1494 
1495 #undef __FUNCT__
1496 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1497 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1498 {
1499   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1500   PetscScalar       *z = 0,*work,*workt,*zarray;
1501   const PetscScalar *x,*xb;
1502   const MatScalar   *v;
1503   PetscErrorCode    ierr;
1504   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1505   PetscInt          ncols,k;
1506   const PetscInt    *ridx = NULL,*idx,*ii;
1507   PetscBool         usecprow = a->compressedrow.use;
1508 
1509   PetscFunctionBegin;
1510   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1511   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1512   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1513 
1514   idx = a->j;
1515   v   = a->a;
1516   if (usecprow) {
1517     mbs  = a->compressedrow.nrows;
1518     ii   = a->compressedrow.i;
1519     ridx = a->compressedrow.rindex;
1520   } else {
1521     mbs = a->mbs;
1522     ii  = a->i;
1523     z   = zarray;
1524   }
1525 
1526   if (!a->mult_work) {
1527     k    = PetscMax(A->rmap->n,A->cmap->n);
1528     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1529   }
1530   work = a->mult_work;
1531   for (i=0; i<mbs; i++) {
1532     n     = ii[1] - ii[0]; ii++;
1533     ncols = n*bs;
1534     workt = work;
1535     for (j=0; j<n; j++) {
1536       xb = x + bs*(*idx++);
1537       for (k=0; k<bs; k++) workt[k] = xb[k];
1538       workt += bs;
1539     }
1540     if (usecprow) z = zarray + bs*ridx[i];
1541     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1542     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1543     v += n*bs2;
1544     if (!usecprow) z += bs;
1545   }
1546   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1547   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1548   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 #undef __FUNCT__
1553 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1554 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1555 {
1556   PetscScalar    zero = 0.0;
1557   PetscErrorCode ierr;
1558 
1559   PetscFunctionBegin;
1560   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1561   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 #undef __FUNCT__
1566 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1567 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1568 {
1569   PetscScalar    zero = 0.0;
1570   PetscErrorCode ierr;
1571 
1572   PetscFunctionBegin;
1573   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1574   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1575   PetscFunctionReturn(0);
1576 }
1577 
1578 #undef __FUNCT__
1579 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1580 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1581 {
1582   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1583   PetscScalar       *z,x1,x2,x3,x4,x5;
1584   const PetscScalar *x,*xb = NULL;
1585   const MatScalar   *v;
1586   PetscErrorCode    ierr;
1587   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1588   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1589   Mat_CompressedRow cprow = a->compressedrow;
1590   PetscBool         usecprow = cprow.use;
1591 
1592   PetscFunctionBegin;
1593   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1594   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1595   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1596 
1597   idx = a->j;
1598   v   = a->a;
1599   if (usecprow) {
1600     mbs  = cprow.nrows;
1601     ii   = cprow.i;
1602     ridx = cprow.rindex;
1603   } else {
1604     mbs=a->mbs;
1605     ii = a->i;
1606     xb = x;
1607   }
1608 
1609   switch (bs) {
1610   case 1:
1611     for (i=0; i<mbs; i++) {
1612       if (usecprow) xb = x + ridx[i];
1613       x1 = xb[0];
1614       ib = idx + ii[0];
1615       n  = ii[1] - ii[0]; ii++;
1616       for (j=0; j<n; j++) {
1617         rval     = ib[j];
1618         z[rval] += PetscConj(*v) * x1;
1619         v++;
1620       }
1621       if (!usecprow) xb++;
1622     }
1623     break;
1624   case 2:
1625     for (i=0; i<mbs; i++) {
1626       if (usecprow) xb = x + 2*ridx[i];
1627       x1 = xb[0]; x2 = xb[1];
1628       ib = idx + ii[0];
1629       n  = ii[1] - ii[0]; ii++;
1630       for (j=0; j<n; j++) {
1631         rval       = ib[j]*2;
1632         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1633         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1634         v         += 4;
1635       }
1636       if (!usecprow) xb += 2;
1637     }
1638     break;
1639   case 3:
1640     for (i=0; i<mbs; i++) {
1641       if (usecprow) xb = x + 3*ridx[i];
1642       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1643       ib = idx + ii[0];
1644       n  = ii[1] - ii[0]; ii++;
1645       for (j=0; j<n; j++) {
1646         rval       = ib[j]*3;
1647         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1648         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1649         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1650         v         += 9;
1651       }
1652       if (!usecprow) xb += 3;
1653     }
1654     break;
1655   case 4:
1656     for (i=0; i<mbs; i++) {
1657       if (usecprow) xb = x + 4*ridx[i];
1658       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1659       ib = idx + ii[0];
1660       n  = ii[1] - ii[0]; ii++;
1661       for (j=0; j<n; j++) {
1662         rval       = ib[j]*4;
1663         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1664         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1665         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1666         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1667         v         += 16;
1668       }
1669       if (!usecprow) xb += 4;
1670     }
1671     break;
1672   case 5:
1673     for (i=0; i<mbs; i++) {
1674       if (usecprow) xb = x + 5*ridx[i];
1675       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1676       x4 = xb[3]; x5 = xb[4];
1677       ib = idx + ii[0];
1678       n  = ii[1] - ii[0]; ii++;
1679       for (j=0; j<n; j++) {
1680         rval       = ib[j]*5;
1681         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1682         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1683         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1684         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1685         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1686         v         += 25;
1687       }
1688       if (!usecprow) xb += 5;
1689     }
1690     break;
1691   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1692     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1693 #if 0
1694     {
1695       PetscInt          ncols,k,bs2=a->bs2;
1696       PetscScalar       *work,*workt,zb;
1697       const PetscScalar *xtmp;
1698       if (!a->mult_work) {
1699         k    = PetscMax(A->rmap->n,A->cmap->n);
1700         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1701       }
1702       work = a->mult_work;
1703       xtmp = x;
1704       for (i=0; i<mbs; i++) {
1705         n     = ii[1] - ii[0]; ii++;
1706         ncols = n*bs;
1707         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1708         if (usecprow) xtmp = x + bs*ridx[i];
1709         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1710         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1711         v += n*bs2;
1712         if (!usecprow) xtmp += bs;
1713         workt = work;
1714         for (j=0; j<n; j++) {
1715           zb = z + bs*(*idx++);
1716           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1717           workt += bs;
1718         }
1719       }
1720     }
1721 #endif
1722   }
1723   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1724   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1725   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1726   PetscFunctionReturn(0);
1727 }
1728 
1729 #undef __FUNCT__
1730 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1731 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1732 {
1733   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1734   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
1735   const PetscScalar *x,*xb = 0;
1736   const MatScalar   *v;
1737   PetscErrorCode    ierr;
1738   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1739   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1740   Mat_CompressedRow cprow   = a->compressedrow;
1741   PetscBool         usecprow=cprow.use;
1742 
1743   PetscFunctionBegin;
1744   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1745   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1746   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1747 
1748   idx = a->j;
1749   v   = a->a;
1750   if (usecprow) {
1751     mbs  = cprow.nrows;
1752     ii   = cprow.i;
1753     ridx = cprow.rindex;
1754   } else {
1755     mbs=a->mbs;
1756     ii = a->i;
1757     xb = x;
1758   }
1759 
1760   switch (bs) {
1761   case 1:
1762     for (i=0; i<mbs; i++) {
1763       if (usecprow) xb = x + ridx[i];
1764       x1 = xb[0];
1765       ib = idx + ii[0];
1766       n  = ii[1] - ii[0]; ii++;
1767       for (j=0; j<n; j++) {
1768         rval     = ib[j];
1769         z[rval] += *v * x1;
1770         v++;
1771       }
1772       if (!usecprow) xb++;
1773     }
1774     break;
1775   case 2:
1776     for (i=0; i<mbs; i++) {
1777       if (usecprow) xb = x + 2*ridx[i];
1778       x1 = xb[0]; x2 = xb[1];
1779       ib = idx + ii[0];
1780       n  = ii[1] - ii[0]; ii++;
1781       for (j=0; j<n; j++) {
1782         rval       = ib[j]*2;
1783         z[rval++] += v[0]*x1 + v[1]*x2;
1784         z[rval++] += v[2]*x1 + v[3]*x2;
1785         v         += 4;
1786       }
1787       if (!usecprow) xb += 2;
1788     }
1789     break;
1790   case 3:
1791     for (i=0; i<mbs; i++) {
1792       if (usecprow) xb = x + 3*ridx[i];
1793       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1794       ib = idx + ii[0];
1795       n  = ii[1] - ii[0]; ii++;
1796       for (j=0; j<n; j++) {
1797         rval       = ib[j]*3;
1798         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1799         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1800         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1801         v         += 9;
1802       }
1803       if (!usecprow) xb += 3;
1804     }
1805     break;
1806   case 4:
1807     for (i=0; i<mbs; i++) {
1808       if (usecprow) xb = x + 4*ridx[i];
1809       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1810       ib = idx + ii[0];
1811       n  = ii[1] - ii[0]; ii++;
1812       for (j=0; j<n; j++) {
1813         rval       = ib[j]*4;
1814         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1815         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1816         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1817         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1818         v         += 16;
1819       }
1820       if (!usecprow) xb += 4;
1821     }
1822     break;
1823   case 5:
1824     for (i=0; i<mbs; i++) {
1825       if (usecprow) xb = x + 5*ridx[i];
1826       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1827       x4 = xb[3]; x5 = xb[4];
1828       ib = idx + ii[0];
1829       n  = ii[1] - ii[0]; ii++;
1830       for (j=0; j<n; j++) {
1831         rval       = ib[j]*5;
1832         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1833         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1834         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1835         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1836         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1837         v         += 25;
1838       }
1839       if (!usecprow) xb += 5;
1840     }
1841     break;
1842   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1843     PetscInt          ncols,k;
1844     PetscScalar       *work,*workt;
1845     const PetscScalar *xtmp;
1846     if (!a->mult_work) {
1847       k    = PetscMax(A->rmap->n,A->cmap->n);
1848       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1849     }
1850     work = a->mult_work;
1851     xtmp = x;
1852     for (i=0; i<mbs; i++) {
1853       n     = ii[1] - ii[0]; ii++;
1854       ncols = n*bs;
1855       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1856       if (usecprow) xtmp = x + bs*ridx[i];
1857       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1858       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1859       v += n*bs2;
1860       if (!usecprow) xtmp += bs;
1861       workt = work;
1862       for (j=0; j<n; j++) {
1863         zb = z + bs*(*idx++);
1864         for (k=0; k<bs; k++) zb[k] += workt[k];
1865         workt += bs;
1866       }
1867     }
1868     }
1869   }
1870   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1871   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1872   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1873   PetscFunctionReturn(0);
1874 }
1875 
1876 #undef __FUNCT__
1877 #define __FUNCT__ "MatScale_SeqBAIJ"
1878 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1879 {
1880   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
1881   PetscInt       totalnz = a->bs2*a->nz;
1882   PetscScalar    oalpha  = alpha;
1883   PetscErrorCode ierr;
1884   PetscBLASInt   one = 1,tnz;
1885 
1886   PetscFunctionBegin;
1887   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
1888   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1889   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1890   PetscFunctionReturn(0);
1891 }
1892 
1893 #undef __FUNCT__
1894 #define __FUNCT__ "MatNorm_SeqBAIJ"
1895 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1896 {
1897   PetscErrorCode ierr;
1898   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
1899   MatScalar      *v  = a->a;
1900   PetscReal      sum = 0.0;
1901   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1902 
1903   PetscFunctionBegin;
1904   if (type == NORM_FROBENIUS) {
1905     for (i=0; i< bs2*nz; i++) {
1906       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1907     }
1908     *norm = PetscSqrtReal(sum);
1909   } else if (type == NORM_1) { /* maximum column sum */
1910     PetscReal *tmp;
1911     PetscInt  *bcol = a->j;
1912     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
1913     for (i=0; i<nz; i++) {
1914       for (j=0; j<bs; j++) {
1915         k1 = bs*(*bcol) + j; /* column index */
1916         for (k=0; k<bs; k++) {
1917           tmp[k1] += PetscAbsScalar(*v); v++;
1918         }
1919       }
1920       bcol++;
1921     }
1922     *norm = 0.0;
1923     for (j=0; j<A->cmap->n; j++) {
1924       if (tmp[j] > *norm) *norm = tmp[j];
1925     }
1926     ierr = PetscFree(tmp);CHKERRQ(ierr);
1927   } else if (type == NORM_INFINITY) { /* maximum row sum */
1928     *norm = 0.0;
1929     for (k=0; k<bs; k++) {
1930       for (j=0; j<a->mbs; j++) {
1931         v   = a->a + bs2*a->i[j] + k;
1932         sum = 0.0;
1933         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1934           for (k1=0; k1<bs; k1++) {
1935             sum += PetscAbsScalar(*v);
1936             v   += bs;
1937           }
1938         }
1939         if (sum > *norm) *norm = sum;
1940       }
1941     }
1942   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1943   PetscFunctionReturn(0);
1944 }
1945 
1946 
1947 #undef __FUNCT__
1948 #define __FUNCT__ "MatEqual_SeqBAIJ"
1949 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1950 {
1951   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1952   PetscErrorCode ierr;
1953 
1954   PetscFunctionBegin;
1955   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1956   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1957     *flg = PETSC_FALSE;
1958     PetscFunctionReturn(0);
1959   }
1960 
1961   /* if the a->i are the same */
1962   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1963   if (!*flg) PetscFunctionReturn(0);
1964 
1965   /* if a->j are the same */
1966   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1967   if (!*flg) PetscFunctionReturn(0);
1968 
1969   /* if a->a are the same */
1970   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1971   PetscFunctionReturn(0);
1972 
1973 }
1974 
1975 #undef __FUNCT__
1976 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1977 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1978 {
1979   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1980   PetscErrorCode ierr;
1981   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1982   PetscScalar    *x,zero = 0.0;
1983   MatScalar      *aa,*aa_j;
1984 
1985   PetscFunctionBegin;
1986   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1987   bs   = A->rmap->bs;
1988   aa   = a->a;
1989   ai   = a->i;
1990   aj   = a->j;
1991   ambs = a->mbs;
1992   bs2  = a->bs2;
1993 
1994   ierr = VecSet(v,zero);CHKERRQ(ierr);
1995   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1996   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1997   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1998   for (i=0; i<ambs; i++) {
1999     for (j=ai[i]; j<ai[i+1]; j++) {
2000       if (aj[j] == i) {
2001         row  = i*bs;
2002         aa_j = aa+j*bs2;
2003         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2004         break;
2005       }
2006     }
2007   }
2008   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2009   PetscFunctionReturn(0);
2010 }
2011 
2012 #undef __FUNCT__
2013 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2014 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2015 {
2016   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2017   const PetscScalar *l,*r,*li,*ri;
2018   PetscScalar       x;
2019   MatScalar         *aa, *v;
2020   PetscErrorCode    ierr;
2021   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2022   const PetscInt    *ai,*aj;
2023 
2024   PetscFunctionBegin;
2025   ai  = a->i;
2026   aj  = a->j;
2027   aa  = a->a;
2028   m   = A->rmap->n;
2029   n   = A->cmap->n;
2030   bs  = A->rmap->bs;
2031   mbs = a->mbs;
2032   bs2 = a->bs2;
2033   if (ll) {
2034     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2035     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2036     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2037     for (i=0; i<mbs; i++) { /* for each block row */
2038       M  = ai[i+1] - ai[i];
2039       li = l + i*bs;
2040       v  = aa + bs2*ai[i];
2041       for (j=0; j<M; j++) { /* for each block */
2042         for (k=0; k<bs2; k++) {
2043           (*v++) *= li[k%bs];
2044         }
2045       }
2046     }
2047     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2048     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2049   }
2050 
2051   if (rr) {
2052     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2053     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2054     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2055     for (i=0; i<mbs; i++) { /* for each block row */
2056       iai = ai[i];
2057       M   = ai[i+1] - iai;
2058       v   = aa + bs2*iai;
2059       for (j=0; j<M; j++) { /* for each block */
2060         ri = r + bs*aj[iai+j];
2061         for (k=0; k<bs; k++) {
2062           x = ri[k];
2063           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2064           v += bs;
2065         }
2066       }
2067     }
2068     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2069     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2070   }
2071   PetscFunctionReturn(0);
2072 }
2073 
2074 
2075 #undef __FUNCT__
2076 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2077 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2078 {
2079   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2080 
2081   PetscFunctionBegin;
2082   info->block_size   = a->bs2;
2083   info->nz_allocated = a->bs2*a->maxnz;
2084   info->nz_used      = a->bs2*a->nz;
2085   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2086   info->assemblies   = A->num_ass;
2087   info->mallocs      = A->info.mallocs;
2088   info->memory       = ((PetscObject)A)->mem;
2089   if (A->factortype) {
2090     info->fill_ratio_given  = A->info.fill_ratio_given;
2091     info->fill_ratio_needed = A->info.fill_ratio_needed;
2092     info->factor_mallocs    = A->info.factor_mallocs;
2093   } else {
2094     info->fill_ratio_given  = 0;
2095     info->fill_ratio_needed = 0;
2096     info->factor_mallocs    = 0;
2097   }
2098   PetscFunctionReturn(0);
2099 }
2100 
2101 #undef __FUNCT__
2102 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2103 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2104 {
2105   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2106   PetscErrorCode ierr;
2107 
2108   PetscFunctionBegin;
2109   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2110   PetscFunctionReturn(0);
2111 }
2112