xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 6b3eee04a67a44bd9a4e65b41b1dd96ffcd06ca3)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
9 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
14   const PetscInt *idx;
15   PetscInt       start,end,*ai,*aj,bs,*nidx2;
16   PetscBT        table;
17 
18   PetscFunctionBegin;
19   m  = a->mbs;
20   ai = a->i;
21   aj = a->j;
22   bs = A->rmap->bs;
23 
24   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
27   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
28   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) {
31     /* Initialise the two local arrays */
32     isz  = 0;
33     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
37     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j=0; j<n; ++j) {
41       ival = idx[j]/bs; /* convert the indices into block indices */
42       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
44     }
45     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
46     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
47 
48     k = 0;
49     for (j=0; j<ov; j++) { /* for each overlap*/
50       n = isz;
51       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row+1];
55         for (l = start; l<end; l++) {
56           val = aj[l];
57           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
58         }
59       }
60     }
61     /* expand the Index Set */
62     for (j=0; j<isz; j++) {
63       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
64     }
65     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
66   }
67   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
68   ierr = PetscFree(nidx);CHKERRQ(ierr);
69   ierr = PetscFree(nidx2);CHKERRQ(ierr);
70   PetscFunctionReturn(0);
71 }
72 
73 #undef __FUNCT__
74 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
75 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76 {
77   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
78   PetscErrorCode ierr;
79   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
81   const PetscInt *irow,*icol;
82   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83   PetscInt       *aj = a->j,*ai = a->i;
84   MatScalar      *mat_a;
85   Mat            C;
86   PetscBool      flag;
87 
88   PetscFunctionBegin;
89   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
90   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
91   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
92   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
93 
94   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
95   ssmap = smap;
96   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
97   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
98   /* determine lens of each row */
99   for (i=0; i<nrows; i++) {
100     kstart  = ai[irow[i]];
101     kend    = kstart + a->ilen[irow[i]];
102     lens[i] = 0;
103     for (k=kstart; k<kend; k++) {
104       if (ssmap[aj[k]]) lens[i]++;
105     }
106   }
107   /* Create and fill new matrix */
108   if (scall == MAT_REUSE_MATRIX) {
109     c = (Mat_SeqBAIJ*)((*B)->data);
110 
111     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
112     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
113     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
114     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
115     C    = *B;
116   } else {
117     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
118     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
119     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
120     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
121   }
122   c = (Mat_SeqBAIJ*)(C->data);
123   for (i=0; i<nrows; i++) {
124     row      = irow[i];
125     kstart   = ai[row];
126     kend     = kstart + a->ilen[row];
127     mat_i    = c->i[i];
128     mat_j    = c->j + mat_i;
129     mat_a    = c->a + mat_i*bs2;
130     mat_ilen = c->ilen + i;
131     for (k=kstart; k<kend; k++) {
132       if ((tcol=ssmap[a->j[k]])) {
133         *mat_j++ = tcol - 1;
134         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
135         mat_a   += bs2;
136         (*mat_ilen)++;
137       }
138     }
139   }
140   /* sort */
141   {
142     MatScalar *work;
143     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
144     for (i=0; i<nrows; i++) {
145       PetscInt ilen;
146       mat_i = c->i[i];
147       mat_j = c->j + mat_i;
148       mat_a = c->a + mat_i*bs2;
149       ilen  = c->ilen[i];
150       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
151     }
152     ierr = PetscFree(work);CHKERRQ(ierr);
153   }
154 
155   /* Free work space */
156   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
157   ierr = PetscFree(smap);CHKERRQ(ierr);
158   ierr = PetscFree(lens);CHKERRQ(ierr);
159   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
160   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
161 
162   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
163   *B   = C;
164   PetscFunctionReturn(0);
165 }
166 
167 #undef __FUNCT__
168 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
169 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
170 {
171   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
172   IS             is1,is2;
173   PetscErrorCode ierr;
174   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
175   const PetscInt *irow,*icol;
176 
177   PetscFunctionBegin;
178   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
179   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
180   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
181   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
182 
183   /* Verify if the indices corespond to each element in a block
184    and form the IS with compressed IS */
185   maxmnbs = PetscMax(a->mbs,a->nbs);
186   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
187   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
188   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
189   for (i=0; i<a->mbs; i++) {
190     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
191   }
192   count = 0;
193   for (i=0; i<nrows; i++) {
194     j = irow[i] / bs;
195     if ((vary[j]--)==bs) iary[count++] = j;
196   }
197   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
198 
199   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
200   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
201   for (i=0; i<a->nbs; i++) {
202     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
203   }
204   count = 0;
205   for (i=0; i<ncols; i++) {
206     j = icol[i] / bs;
207     if ((vary[j]--)==bs) iary[count++] = j;
208   }
209   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
210   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
211   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
212   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
213 
214   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
215   ierr = ISDestroy(&is1);CHKERRQ(ierr);
216   ierr = ISDestroy(&is2);CHKERRQ(ierr);
217   PetscFunctionReturn(0);
218 }
219 
220 #undef __FUNCT__
221 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
222 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
223 {
224   PetscErrorCode ierr;
225   PetscInt       i;
226 
227   PetscFunctionBegin;
228   if (scall == MAT_INITIAL_MATRIX) {
229     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
230   }
231 
232   for (i=0; i<n; i++) {
233     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
234   }
235   PetscFunctionReturn(0);
236 }
237 
238 
239 /* -------------------------------------------------------*/
240 /* Should check that shapes of vectors and matrices match */
241 /* -------------------------------------------------------*/
242 
243 #undef __FUNCT__
244 #define __FUNCT__ "MatMult_SeqBAIJ_1"
245 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
246 {
247   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
248   PetscScalar       *z,sum;
249   const PetscScalar *x;
250   const MatScalar   *v;
251   PetscErrorCode    ierr;
252   PetscInt          mbs,i,n;
253   const PetscInt    *idx,*ii,*ridx=NULL;
254   PetscBool         usecprow=a->compressedrow.use;
255 
256   PetscFunctionBegin;
257   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
258   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
259 
260   if (usecprow) {
261     mbs  = a->compressedrow.nrows;
262     ii   = a->compressedrow.i;
263     ridx = a->compressedrow.rindex;
264     ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
265   } else {
266     mbs = a->mbs;
267     ii  = a->i;
268   }
269 
270   for (i=0; i<mbs; i++) {
271     n   = ii[1] - ii[0];
272     v   = a->a + ii[0];
273     idx = a->j + ii[0];
274     ii++;
275     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
276     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
277     sum = 0.0;
278     PetscSparseDensePlusDot(sum,x,v,idx,n);
279     if (usecprow) {
280       z[ridx[i]] = sum;
281     } else {
282       z[i]       = sum;
283     }
284   }
285   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
286   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
287   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
288   PetscFunctionReturn(0);
289 }
290 
291 #undef __FUNCT__
292 #define __FUNCT__ "MatMult_SeqBAIJ_2"
293 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
294 {
295   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
296   PetscScalar       *z = 0,sum1,sum2,*zarray;
297   const PetscScalar *x,*xb;
298   PetscScalar       x1,x2;
299   const MatScalar   *v;
300   PetscErrorCode    ierr;
301   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
302   PetscBool         usecprow=a->compressedrow.use;
303 
304   PetscFunctionBegin;
305   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
306   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
307 
308   idx = a->j;
309   v   = a->a;
310   if (usecprow) {
311     mbs  = a->compressedrow.nrows;
312     ii   = a->compressedrow.i;
313     ridx = a->compressedrow.rindex;
314     ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
315   } else {
316     mbs = a->mbs;
317     ii  = a->i;
318     z   = zarray;
319   }
320 
321   for (i=0; i<mbs; i++) {
322     n           = ii[1] - ii[0]; ii++;
323     sum1        = 0.0; sum2 = 0.0;
324     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
325     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
326     for (j=0; j<n; j++) {
327       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
328       sum1 += v[0]*x1 + v[2]*x2;
329       sum2 += v[1]*x1 + v[3]*x2;
330       v    += 4;
331     }
332     if (usecprow) z = zarray + 2*ridx[i];
333     z[0] = sum1; z[1] = sum2;
334     if (!usecprow) z += 2;
335   }
336   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
337   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
338   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
339   PetscFunctionReturn(0);
340 }
341 
342 #undef __FUNCT__
343 #define __FUNCT__ "MatMult_SeqBAIJ_3"
344 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
345 {
346   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
347   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
348   const PetscScalar *x,*xb;
349   const MatScalar   *v;
350   PetscErrorCode    ierr;
351   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
352   PetscBool         usecprow=a->compressedrow.use;
353 
354 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
355 #pragma disjoint(*v,*z,*xb)
356 #endif
357 
358   PetscFunctionBegin;
359   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
360   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
361 
362   idx = a->j;
363   v   = a->a;
364   if (usecprow) {
365     mbs  = a->compressedrow.nrows;
366     ii   = a->compressedrow.i;
367     ridx = a->compressedrow.rindex;
368     ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
369   } else {
370     mbs = a->mbs;
371     ii  = a->i;
372     z   = zarray;
373   }
374 
375   for (i=0; i<mbs; i++) {
376     n           = ii[1] - ii[0]; ii++;
377     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
378     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
379     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
380     for (j=0; j<n; j++) {
381       xb = x + 3*(*idx++);
382       x1 = xb[0];
383       x2 = xb[1];
384       x3 = xb[2];
385 
386       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
387       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
388       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
389       v    += 9;
390     }
391     if (usecprow) z = zarray + 3*ridx[i];
392     z[0] = sum1; z[1] = sum2; z[2] = sum3;
393     if (!usecprow) z += 3;
394   }
395   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
396   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
397   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
398   PetscFunctionReturn(0);
399 }
400 
401 #undef __FUNCT__
402 #define __FUNCT__ "MatMult_SeqBAIJ_4"
403 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
404 {
405   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
406   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
407   const PetscScalar *x,*xb;
408   const MatScalar   *v;
409   PetscErrorCode    ierr;
410   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
411   PetscBool         usecprow=a->compressedrow.use;
412 
413   PetscFunctionBegin;
414   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
415   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
416 
417   idx = a->j;
418   v   = a->a;
419   if (usecprow) {
420     mbs  = a->compressedrow.nrows;
421     ii   = a->compressedrow.i;
422     ridx = a->compressedrow.rindex;
423     ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
424   } else {
425     mbs = a->mbs;
426     ii  = a->i;
427     z   = zarray;
428   }
429 
430   for (i=0; i<mbs; i++) {
431     n = ii[1] - ii[0];
432     ii++;
433     sum1 = 0.0;
434     sum2 = 0.0;
435     sum3 = 0.0;
436     sum4 = 0.0;
437 
438     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
439     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
440     for (j=0; j<n; j++) {
441       xb    = x + 4*(*idx++);
442       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
443       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
444       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
445       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
446       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
447       v    += 16;
448     }
449     if (usecprow) z = zarray + 4*ridx[i];
450     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
451     if (!usecprow) z += 4;
452   }
453   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
454   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
455   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
456   PetscFunctionReturn(0);
457 }
458 
459 #undef __FUNCT__
460 #define __FUNCT__ "MatMult_SeqBAIJ_5"
461 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
462 {
463   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
464   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
465   const PetscScalar *xb,*x;
466   const MatScalar   *v;
467   PetscErrorCode    ierr;
468   const PetscInt    *idx,*ii,*ridx=NULL;
469   PetscInt          mbs,i,j,n;
470   PetscBool         usecprow=a->compressedrow.use;
471 
472   PetscFunctionBegin;
473   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
474   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
475 
476   idx = a->j;
477   v   = a->a;
478   if (usecprow) {
479     mbs  = a->compressedrow.nrows;
480     ii   = a->compressedrow.i;
481     ridx = a->compressedrow.rindex;
482     ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
483   } else {
484     mbs = a->mbs;
485     ii  = a->i;
486     z   = zarray;
487   }
488 
489   for (i=0; i<mbs; i++) {
490     n           = ii[1] - ii[0]; ii++;
491     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
492     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
493     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
494     for (j=0; j<n; j++) {
495       xb    = x + 5*(*idx++);
496       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
497       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
498       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
499       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
500       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
501       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
502       v    += 25;
503     }
504     if (usecprow) z = zarray + 5*ridx[i];
505     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
506     if (!usecprow) z += 5;
507   }
508   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
509   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
510   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
511   PetscFunctionReturn(0);
512 }
513 
514 
515 
516 #undef __FUNCT__
517 #define __FUNCT__ "MatMult_SeqBAIJ_6"
518 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
519 {
520   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
521   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
522   const PetscScalar *x,*xb;
523   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
524   const MatScalar   *v;
525   PetscErrorCode    ierr;
526   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
527   PetscBool         usecprow=a->compressedrow.use;
528 
529   PetscFunctionBegin;
530   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
531   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
532 
533   idx = a->j;
534   v   = a->a;
535   if (usecprow) {
536     mbs  = a->compressedrow.nrows;
537     ii   = a->compressedrow.i;
538     ridx = a->compressedrow.rindex;
539     ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
540   } else {
541     mbs = a->mbs;
542     ii  = a->i;
543     z   = zarray;
544   }
545 
546   for (i=0; i<mbs; i++) {
547     n  = ii[1] - ii[0];
548     ii++;
549     sum1 = 0.0;
550     sum2 = 0.0;
551     sum3 = 0.0;
552     sum4 = 0.0;
553     sum5 = 0.0;
554     sum6 = 0.0;
555 
556     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
557     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
558     for (j=0; j<n; j++) {
559       xb    = x + 6*(*idx++);
560       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
561       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
562       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
563       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
564       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
565       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
566       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
567       v    += 36;
568     }
569     if (usecprow) z = zarray + 6*ridx[i];
570     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
571     if (!usecprow) z += 6;
572   }
573 
574   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
575   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
576   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
577   PetscFunctionReturn(0);
578 }
579 
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMult_SeqBAIJ_7"
582 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
583 {
584   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
585   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
586   const PetscScalar *x,*xb;
587   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
588   const MatScalar   *v;
589   PetscErrorCode    ierr;
590   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
591   PetscBool         usecprow=a->compressedrow.use;
592 
593   PetscFunctionBegin;
594   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
595   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
596 
597   idx = a->j;
598   v   = a->a;
599   if (usecprow) {
600     mbs  = a->compressedrow.nrows;
601     ii   = a->compressedrow.i;
602     ridx = a->compressedrow.rindex;
603     ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
604   } else {
605     mbs = a->mbs;
606     ii  = a->i;
607     z   = zarray;
608   }
609 
610   for (i=0; i<mbs; i++) {
611     n  = ii[1] - ii[0];
612     ii++;
613     sum1 = 0.0;
614     sum2 = 0.0;
615     sum3 = 0.0;
616     sum4 = 0.0;
617     sum5 = 0.0;
618     sum6 = 0.0;
619     sum7 = 0.0;
620 
621     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
622     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
623     for (j=0; j<n; j++) {
624       xb    = x + 7*(*idx++);
625       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
626       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
627       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
628       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
629       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
630       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
631       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
632       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
633       v    += 49;
634     }
635     if (usecprow) z = zarray + 7*ridx[i];
636     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
637     if (!usecprow) z += 7;
638   }
639 
640   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
641   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
642   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
643   PetscFunctionReturn(0);
644 }
645 
646 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
647 /* Default MatMult for block size 15 */
648 
649 #undef __FUNCT__
650 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
651 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
652 {
653   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
654   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
655   const PetscScalar *x,*xb;
656   PetscScalar       *zarray,xv;
657   const MatScalar   *v;
658   PetscErrorCode    ierr;
659   const PetscInt    *ii,*ij=a->j,*idx;
660   PetscInt          mbs,i,j,k,n,*ridx=NULL;
661   PetscBool         usecprow=a->compressedrow.use;
662 
663   PetscFunctionBegin;
664   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
665   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
666 
667   v = a->a;
668   if (usecprow) {
669     mbs  = a->compressedrow.nrows;
670     ii   = a->compressedrow.i;
671     ridx = a->compressedrow.rindex;
672     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
673   } else {
674     mbs = a->mbs;
675     ii  = a->i;
676     z   = zarray;
677   }
678 
679   for (i=0; i<mbs; i++) {
680     n    = ii[i+1] - ii[i];
681     idx  = ij + ii[i];
682     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
683     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
684 
685     for (j=0; j<n; j++) {
686       xb = x + 15*(idx[j]);
687 
688       for (k=0; k<15; k++) {
689         xv     =  xb[k];
690         sum1  += v[0]*xv;
691         sum2  += v[1]*xv;
692         sum3  += v[2]*xv;
693         sum4  += v[3]*xv;
694         sum5  += v[4]*xv;
695         sum6  += v[5]*xv;
696         sum7  += v[6]*xv;
697         sum8  += v[7]*xv;
698         sum9  += v[8]*xv;
699         sum10 += v[9]*xv;
700         sum11 += v[10]*xv;
701         sum12 += v[11]*xv;
702         sum13 += v[12]*xv;
703         sum14 += v[13]*xv;
704         sum15 += v[14]*xv;
705         v     += 15;
706       }
707     }
708     if (usecprow) z = zarray + 15*ridx[i];
709     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
710     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
711 
712     if (!usecprow) z += 15;
713   }
714 
715   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
716   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
717   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
718   PetscFunctionReturn(0);
719 }
720 
721 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
722 #undef __FUNCT__
723 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
724 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
725 {
726   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
727   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
728   const PetscScalar *x,*xb;
729   PetscScalar       x1,x2,x3,x4,*zarray;
730   const MatScalar   *v;
731   PetscErrorCode    ierr;
732   const PetscInt    *ii,*ij=a->j,*idx;
733   PetscInt          mbs,i,j,n,*ridx=NULL;
734   PetscBool         usecprow=a->compressedrow.use;
735 
736   PetscFunctionBegin;
737   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
738   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
739 
740   v = a->a;
741   if (usecprow) {
742     mbs  = a->compressedrow.nrows;
743     ii   = a->compressedrow.i;
744     ridx = a->compressedrow.rindex;
745     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
746   } else {
747     mbs = a->mbs;
748     ii  = a->i;
749     z   = zarray;
750   }
751 
752   for (i=0; i<mbs; i++) {
753     n    = ii[i+1] - ii[i];
754     idx  = ij + ii[i];
755     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
756     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
757 
758     for (j=0; j<n; j++) {
759       xb = x + 15*(idx[j]);
760       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
761 
762       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
763       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
764       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
765       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
766       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
767       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
768       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
769       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
770       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
771       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
772       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
773       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
774       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
775       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
776       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
777 
778       v += 60;
779 
780       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
781 
782       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
783       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
784       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
785       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
786       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
787       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
788       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
789       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
790       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
791       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
792       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
793       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
794       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
795       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
796       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
797       v     += 60;
798 
799       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
800       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
801       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
802       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
803       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
804       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
805       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
806       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
807       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
808       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
809       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
810       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
811       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
812       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
813       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
814       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
815       v     += 60;
816 
817       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
818       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
819       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
820       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
821       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
822       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
823       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
824       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
825       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
826       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
827       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
828       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
829       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
830       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
831       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
832       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
833       v     += 45;
834     }
835     if (usecprow) z = zarray + 15*ridx[i];
836     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
837     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
838 
839     if (!usecprow) z += 15;
840   }
841 
842   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
843   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
844   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
845   PetscFunctionReturn(0);
846 }
847 
848 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
849 #undef __FUNCT__
850 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
851 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
852 {
853   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
854   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
855   const PetscScalar *x,*xb;
856   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
857   const MatScalar   *v;
858   PetscErrorCode    ierr;
859   const PetscInt    *ii,*ij=a->j,*idx;
860   PetscInt          mbs,i,j,n,*ridx=NULL;
861   PetscBool         usecprow=a->compressedrow.use;
862 
863   PetscFunctionBegin;
864   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
865   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
866 
867   v = a->a;
868   if (usecprow) {
869     mbs  = a->compressedrow.nrows;
870     ii   = a->compressedrow.i;
871     ridx = a->compressedrow.rindex;
872     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
873   } else {
874     mbs = a->mbs;
875     ii  = a->i;
876     z   = zarray;
877   }
878 
879   for (i=0; i<mbs; i++) {
880     n    = ii[i+1] - ii[i];
881     idx  = ij + ii[i];
882     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
883     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
884 
885     for (j=0; j<n; j++) {
886       xb = x + 15*(idx[j]);
887       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
888       x8 = xb[7];
889 
890       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
891       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
892       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
893       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
894       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
895       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
896       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
897       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
898       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
899       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
900       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
901       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
902       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
903       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
904       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
905       v     += 120;
906 
907       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
908 
909       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
910       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
911       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
912       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
913       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
914       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
915       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
916       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
917       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
918       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
919       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
920       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
921       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
922       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
923       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
924       v     += 105;
925     }
926     if (usecprow) z = zarray + 15*ridx[i];
927     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
928     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
929 
930     if (!usecprow) z += 15;
931   }
932 
933   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
934   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
935   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
936   PetscFunctionReturn(0);
937 }
938 
939 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
940 
941 #undef __FUNCT__
942 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
943 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
944 {
945   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
946   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
947   const PetscScalar *x,*xb;
948   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
949   const MatScalar   *v;
950   PetscErrorCode    ierr;
951   const PetscInt    *ii,*ij=a->j,*idx;
952   PetscInt          mbs,i,j,n,*ridx=NULL;
953   PetscBool         usecprow=a->compressedrow.use;
954 
955   PetscFunctionBegin;
956   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
957   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
958 
959   v = a->a;
960   if (usecprow) {
961     mbs  = a->compressedrow.nrows;
962     ii   = a->compressedrow.i;
963     ridx = a->compressedrow.rindex;
964     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
965   } else {
966     mbs = a->mbs;
967     ii  = a->i;
968     z   = zarray;
969   }
970 
971   for (i=0; i<mbs; i++) {
972     n    = ii[i+1] - ii[i];
973     idx  = ij + ii[i];
974     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
975     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
976 
977     for (j=0; j<n; j++) {
978       xb = x + 15*(idx[j]);
979       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
980       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
981 
982       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
983       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
984       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
985       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
986       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
987       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
988       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
989       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
990       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
991       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
992       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
993       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
994       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
995       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
996       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
997       v     += 225;
998     }
999     if (usecprow) z = zarray + 15*ridx[i];
1000     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1001     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1002 
1003     if (!usecprow) z += 15;
1004   }
1005 
1006   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1007   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1008   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1009   PetscFunctionReturn(0);
1010 }
1011 
1012 
1013 /*
1014     This will not work with MatScalar == float because it calls the BLAS
1015 */
1016 #undef __FUNCT__
1017 #define __FUNCT__ "MatMult_SeqBAIJ_N"
1018 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1019 {
1020   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1021   PetscScalar       *z = 0,*work,*workt,*zarray;
1022   const PetscScalar *x,*xb;
1023   const MatScalar   *v;
1024   PetscErrorCode    ierr;
1025   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1026   const PetscInt    *idx,*ii,*ridx=NULL;
1027   PetscInt          ncols,k;
1028   PetscBool         usecprow=a->compressedrow.use;
1029 
1030   PetscFunctionBegin;
1031   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1032   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1033 
1034   idx = a->j;
1035   v   = a->a;
1036   if (usecprow) {
1037     mbs  = a->compressedrow.nrows;
1038     ii   = a->compressedrow.i;
1039     ridx = a->compressedrow.rindex;
1040     ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1041   } else {
1042     mbs = a->mbs;
1043     ii  = a->i;
1044     z   = zarray;
1045   }
1046 
1047   if (!a->mult_work) {
1048     k    = PetscMax(A->rmap->n,A->cmap->n);
1049     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1050   }
1051   work = a->mult_work;
1052   for (i=0; i<mbs; i++) {
1053     n           = ii[1] - ii[0]; ii++;
1054     ncols       = n*bs;
1055     workt       = work;
1056     for (j=0; j<n; j++) {
1057       xb = x + bs*(*idx++);
1058       for (k=0; k<bs; k++) workt[k] = xb[k];
1059       workt += bs;
1060     }
1061     if (usecprow) z = zarray + bs*ridx[i];
1062     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1063     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1064     v += n*bs2;
1065     if (!usecprow) z += bs;
1066   }
1067   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1068   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1069   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1070   PetscFunctionReturn(0);
1071 }
1072 
1073 #undef __FUNCT__
1074 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1075 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1076 {
1077   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1078   const PetscScalar *x;
1079   PetscScalar       *y,*z,sum;
1080   const MatScalar   *v;
1081   PetscErrorCode    ierr;
1082   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1083   const PetscInt    *idx,*ii;
1084   PetscBool         usecprow=a->compressedrow.use;
1085 
1086   PetscFunctionBegin;
1087   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1088   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1089 
1090   idx = a->j;
1091   v   = a->a;
1092   if (usecprow) {
1093     if (zz != yy) {
1094       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1095     }
1096     mbs  = a->compressedrow.nrows;
1097     ii   = a->compressedrow.i;
1098     ridx = a->compressedrow.rindex;
1099   } else {
1100     ii = a->i;
1101   }
1102 
1103   for (i=0; i<mbs; i++) {
1104     n = ii[1] - ii[0];
1105     ii++;
1106     if (!usecprow) {
1107       sum         = y[i];
1108     } else {
1109       sum = y[ridx[i]];
1110     }
1111     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1112     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1113     PetscSparseDensePlusDot(sum,x,v,idx,n);
1114     v   += n;
1115     idx += n;
1116     if (usecprow) {
1117       z[ridx[i]] = sum;
1118     } else {
1119       z[i] = sum;
1120     }
1121   }
1122   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1123   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1124   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1125   PetscFunctionReturn(0);
1126 }
1127 
1128 #undef __FUNCT__
1129 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1130 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1131 {
1132   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1133   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1134   const PetscScalar *x,*xb;
1135   PetscScalar       x1,x2,*yarray,*zarray;
1136   const MatScalar   *v;
1137   PetscErrorCode    ierr;
1138   PetscInt          mbs = a->mbs,i,n,j;
1139   const PetscInt    *idx,*ii,*ridx = NULL;
1140   PetscBool         usecprow = a->compressedrow.use;
1141 
1142   PetscFunctionBegin;
1143   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1144   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1145 
1146   idx = a->j;
1147   v   = a->a;
1148   if (usecprow) {
1149     if (zz != yy) {
1150       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1151     }
1152     mbs  = a->compressedrow.nrows;
1153     ii   = a->compressedrow.i;
1154     ridx = a->compressedrow.rindex;
1155   } else {
1156     ii = a->i;
1157     y  = yarray;
1158     z  = zarray;
1159   }
1160 
1161   for (i=0; i<mbs; i++) {
1162     n = ii[1] - ii[0]; ii++;
1163     if (usecprow) {
1164       z = zarray + 2*ridx[i];
1165       y = yarray + 2*ridx[i];
1166     }
1167     sum1 = y[0]; sum2 = y[1];
1168     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1169     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1170     for (j=0; j<n; j++) {
1171       xb = x + 2*(*idx++);
1172       x1 = xb[0];
1173       x2 = xb[1];
1174 
1175       sum1 += v[0]*x1 + v[2]*x2;
1176       sum2 += v[1]*x1 + v[3]*x2;
1177       v    += 4;
1178     }
1179     z[0] = sum1; z[1] = sum2;
1180     if (!usecprow) {
1181       z += 2; y += 2;
1182     }
1183   }
1184   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1185   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1186   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1187   PetscFunctionReturn(0);
1188 }
1189 
1190 #undef __FUNCT__
1191 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1192 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1193 {
1194   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1195   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1196   const PetscScalar *x,*xb;
1197   const MatScalar   *v;
1198   PetscErrorCode    ierr;
1199   PetscInt          mbs = a->mbs,i,j,n;
1200   const PetscInt    *idx,*ii,*ridx = NULL;
1201   PetscBool         usecprow = a->compressedrow.use;
1202 
1203   PetscFunctionBegin;
1204   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1205   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1206 
1207   idx = a->j;
1208   v   = a->a;
1209   if (usecprow) {
1210     if (zz != yy) {
1211       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1212     }
1213     mbs  = a->compressedrow.nrows;
1214     ii   = a->compressedrow.i;
1215     ridx = a->compressedrow.rindex;
1216   } else {
1217     ii = a->i;
1218     y  = yarray;
1219     z  = zarray;
1220   }
1221 
1222   for (i=0; i<mbs; i++) {
1223     n = ii[1] - ii[0]; ii++;
1224     if (usecprow) {
1225       z = zarray + 3*ridx[i];
1226       y = yarray + 3*ridx[i];
1227     }
1228     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1229     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1230     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1231     for (j=0; j<n; j++) {
1232       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1233       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1234       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1235       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1236       v    += 9;
1237     }
1238     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1239     if (!usecprow) {
1240       z += 3; y += 3;
1241     }
1242   }
1243   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1244   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1245   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1246   PetscFunctionReturn(0);
1247 }
1248 
1249 #undef __FUNCT__
1250 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1251 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1252 {
1253   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1254   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1255   const PetscScalar *x,*xb;
1256   const MatScalar   *v;
1257   PetscErrorCode    ierr;
1258   PetscInt          mbs = a->mbs,i,j,n;
1259   const PetscInt    *idx,*ii,*ridx=NULL;
1260   PetscBool         usecprow=a->compressedrow.use;
1261 
1262   PetscFunctionBegin;
1263   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1264   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1265 
1266   idx = a->j;
1267   v   = a->a;
1268   if (usecprow) {
1269     if (zz != yy) {
1270       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1271     }
1272     mbs  = a->compressedrow.nrows;
1273     ii   = a->compressedrow.i;
1274     ridx = a->compressedrow.rindex;
1275   } else {
1276     ii = a->i;
1277     y  = yarray;
1278     z  = zarray;
1279   }
1280 
1281   for (i=0; i<mbs; i++) {
1282     n = ii[1] - ii[0]; ii++;
1283     if (usecprow) {
1284       z = zarray + 4*ridx[i];
1285       y = yarray + 4*ridx[i];
1286     }
1287     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1288     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1289     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1290     for (j=0; j<n; j++) {
1291       xb    = x + 4*(*idx++);
1292       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1293       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1294       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1295       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1296       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1297       v    += 16;
1298     }
1299     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1300     if (!usecprow) {
1301       z += 4; y += 4;
1302     }
1303   }
1304   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1305   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1306   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1307   PetscFunctionReturn(0);
1308 }
1309 
1310 #undef __FUNCT__
1311 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1312 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1313 {
1314   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1315   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1316   const PetscScalar *x,*xb;
1317   PetscScalar       *yarray,*zarray;
1318   const MatScalar   *v;
1319   PetscErrorCode    ierr;
1320   PetscInt          mbs = a->mbs,i,j,n;
1321   const PetscInt    *idx,*ii,*ridx = NULL;
1322   PetscBool         usecprow=a->compressedrow.use;
1323 
1324   PetscFunctionBegin;
1325   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1326   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1327 
1328   idx = a->j;
1329   v   = a->a;
1330   if (usecprow) {
1331     if (zz != yy) {
1332       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1333     }
1334     mbs  = a->compressedrow.nrows;
1335     ii   = a->compressedrow.i;
1336     ridx = a->compressedrow.rindex;
1337   } else {
1338     ii = a->i;
1339     y  = yarray;
1340     z  = zarray;
1341   }
1342 
1343   for (i=0; i<mbs; i++) {
1344     n = ii[1] - ii[0]; ii++;
1345     if (usecprow) {
1346       z = zarray + 5*ridx[i];
1347       y = yarray + 5*ridx[i];
1348     }
1349     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1350     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1351     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1352     for (j=0; j<n; j++) {
1353       xb    = x + 5*(*idx++);
1354       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1355       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1356       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1357       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1358       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1359       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1360       v    += 25;
1361     }
1362     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1363     if (!usecprow) {
1364       z += 5; y += 5;
1365     }
1366   }
1367   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1368   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1369   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1370   PetscFunctionReturn(0);
1371 }
1372 #undef __FUNCT__
1373 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1374 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1375 {
1376   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1377   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1378   const PetscScalar *x,*xb;
1379   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1380   const MatScalar   *v;
1381   PetscErrorCode    ierr;
1382   PetscInt          mbs = a->mbs,i,j,n;
1383   const PetscInt    *idx,*ii,*ridx=NULL;
1384   PetscBool         usecprow=a->compressedrow.use;
1385 
1386   PetscFunctionBegin;
1387   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1388   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1389 
1390   idx = a->j;
1391   v   = a->a;
1392   if (usecprow) {
1393     if (zz != yy) {
1394       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1395     }
1396     mbs  = a->compressedrow.nrows;
1397     ii   = a->compressedrow.i;
1398     ridx = a->compressedrow.rindex;
1399   } else {
1400     ii = a->i;
1401     y  = yarray;
1402     z  = zarray;
1403   }
1404 
1405   for (i=0; i<mbs; i++) {
1406     n = ii[1] - ii[0]; ii++;
1407     if (usecprow) {
1408       z = zarray + 6*ridx[i];
1409       y = yarray + 6*ridx[i];
1410     }
1411     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1412     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1413     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1414     for (j=0; j<n; j++) {
1415       xb    = x + 6*(*idx++);
1416       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1417       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1418       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1419       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1420       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1421       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1422       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1423       v    += 36;
1424     }
1425     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1426     if (!usecprow) {
1427       z += 6; y += 6;
1428     }
1429   }
1430   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1431   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1432   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1433   PetscFunctionReturn(0);
1434 }
1435 
1436 #undef __FUNCT__
1437 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
1438 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1439 {
1440   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1441   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1442   const PetscScalar *x,*xb;
1443   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1444   const MatScalar   *v;
1445   PetscErrorCode    ierr;
1446   PetscInt          mbs = a->mbs,i,j,n;
1447   const PetscInt    *idx,*ii,*ridx = NULL;
1448   PetscBool         usecprow=a->compressedrow.use;
1449 
1450   PetscFunctionBegin;
1451   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1452   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1453 
1454   idx = a->j;
1455   v   = a->a;
1456   if (usecprow) {
1457     if (zz != yy) {
1458       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1459     }
1460     mbs  = a->compressedrow.nrows;
1461     ii   = a->compressedrow.i;
1462     ridx = a->compressedrow.rindex;
1463   } else {
1464     ii = a->i;
1465     y  = yarray;
1466     z  = zarray;
1467   }
1468 
1469   for (i=0; i<mbs; i++) {
1470     n = ii[1] - ii[0]; ii++;
1471     if (usecprow) {
1472       z = zarray + 7*ridx[i];
1473       y = yarray + 7*ridx[i];
1474     }
1475     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1476     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1477     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1478     for (j=0; j<n; j++) {
1479       xb    = x + 7*(*idx++);
1480       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1481       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1482       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1483       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1484       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1485       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1486       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1487       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1488       v    += 49;
1489     }
1490     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1491     if (!usecprow) {
1492       z += 7; y += 7;
1493     }
1494   }
1495   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1496   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1497   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1498   PetscFunctionReturn(0);
1499 }
1500 
1501 #undef __FUNCT__
1502 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
1503 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1504 {
1505   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1506   PetscScalar       *z = 0,*work,*workt,*zarray;
1507   const PetscScalar *x,*xb;
1508   const MatScalar   *v;
1509   PetscErrorCode    ierr;
1510   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1511   PetscInt          ncols,k;
1512   const PetscInt    *ridx = NULL,*idx,*ii;
1513   PetscBool         usecprow = a->compressedrow.use;
1514 
1515   PetscFunctionBegin;
1516   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1517   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1518   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1519 
1520   idx = a->j;
1521   v   = a->a;
1522   if (usecprow) {
1523     mbs  = a->compressedrow.nrows;
1524     ii   = a->compressedrow.i;
1525     ridx = a->compressedrow.rindex;
1526   } else {
1527     mbs = a->mbs;
1528     ii  = a->i;
1529     z   = zarray;
1530   }
1531 
1532   if (!a->mult_work) {
1533     k    = PetscMax(A->rmap->n,A->cmap->n);
1534     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1535   }
1536   work = a->mult_work;
1537   for (i=0; i<mbs; i++) {
1538     n     = ii[1] - ii[0]; ii++;
1539     ncols = n*bs;
1540     workt = work;
1541     for (j=0; j<n; j++) {
1542       xb = x + bs*(*idx++);
1543       for (k=0; k<bs; k++) workt[k] = xb[k];
1544       workt += bs;
1545     }
1546     if (usecprow) z = zarray + bs*ridx[i];
1547     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1548     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1549     v += n*bs2;
1550     if (!usecprow) z += bs;
1551   }
1552   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1553   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1554   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1555   PetscFunctionReturn(0);
1556 }
1557 
1558 #undef __FUNCT__
1559 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
1560 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1561 {
1562   PetscScalar    zero = 0.0;
1563   PetscErrorCode ierr;
1564 
1565   PetscFunctionBegin;
1566   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1567   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1568   PetscFunctionReturn(0);
1569 }
1570 
1571 #undef __FUNCT__
1572 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
1573 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1574 {
1575   PetscScalar    zero = 0.0;
1576   PetscErrorCode ierr;
1577 
1578   PetscFunctionBegin;
1579   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1580   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1581   PetscFunctionReturn(0);
1582 }
1583 
1584 #undef __FUNCT__
1585 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
1586 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1587 {
1588   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1589   PetscScalar       *z,x1,x2,x3,x4,x5;
1590   const PetscScalar *x,*xb = NULL;
1591   const MatScalar   *v;
1592   PetscErrorCode    ierr;
1593   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1594   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1595   Mat_CompressedRow cprow = a->compressedrow;
1596   PetscBool         usecprow = cprow.use;
1597 
1598   PetscFunctionBegin;
1599   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1600   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1601   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1602 
1603   idx = a->j;
1604   v   = a->a;
1605   if (usecprow) {
1606     mbs  = cprow.nrows;
1607     ii   = cprow.i;
1608     ridx = cprow.rindex;
1609   } else {
1610     mbs=a->mbs;
1611     ii = a->i;
1612     xb = x;
1613   }
1614 
1615   switch (bs) {
1616   case 1:
1617     for (i=0; i<mbs; i++) {
1618       if (usecprow) xb = x + ridx[i];
1619       x1 = xb[0];
1620       ib = idx + ii[0];
1621       n  = ii[1] - ii[0]; ii++;
1622       for (j=0; j<n; j++) {
1623         rval     = ib[j];
1624         z[rval] += PetscConj(*v) * x1;
1625         v++;
1626       }
1627       if (!usecprow) xb++;
1628     }
1629     break;
1630   case 2:
1631     for (i=0; i<mbs; i++) {
1632       if (usecprow) xb = x + 2*ridx[i];
1633       x1 = xb[0]; x2 = xb[1];
1634       ib = idx + ii[0];
1635       n  = ii[1] - ii[0]; ii++;
1636       for (j=0; j<n; j++) {
1637         rval       = ib[j]*2;
1638         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1639         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1640         v         += 4;
1641       }
1642       if (!usecprow) xb += 2;
1643     }
1644     break;
1645   case 3:
1646     for (i=0; i<mbs; i++) {
1647       if (usecprow) xb = x + 3*ridx[i];
1648       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1649       ib = idx + ii[0];
1650       n  = ii[1] - ii[0]; ii++;
1651       for (j=0; j<n; j++) {
1652         rval       = ib[j]*3;
1653         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1654         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1655         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1656         v         += 9;
1657       }
1658       if (!usecprow) xb += 3;
1659     }
1660     break;
1661   case 4:
1662     for (i=0; i<mbs; i++) {
1663       if (usecprow) xb = x + 4*ridx[i];
1664       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1665       ib = idx + ii[0];
1666       n  = ii[1] - ii[0]; ii++;
1667       for (j=0; j<n; j++) {
1668         rval       = ib[j]*4;
1669         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1670         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1671         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1672         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1673         v         += 16;
1674       }
1675       if (!usecprow) xb += 4;
1676     }
1677     break;
1678   case 5:
1679     for (i=0; i<mbs; i++) {
1680       if (usecprow) xb = x + 5*ridx[i];
1681       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1682       x4 = xb[3]; x5 = xb[4];
1683       ib = idx + ii[0];
1684       n  = ii[1] - ii[0]; ii++;
1685       for (j=0; j<n; j++) {
1686         rval       = ib[j]*5;
1687         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1688         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1689         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1690         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1691         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1692         v         += 25;
1693       }
1694       if (!usecprow) xb += 5;
1695     }
1696     break;
1697   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1698     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1699 #if 0
1700     {
1701       PetscInt          ncols,k,bs2=a->bs2;
1702       PetscScalar       *work,*workt,zb;
1703       const PetscScalar *xtmp;
1704       if (!a->mult_work) {
1705         k    = PetscMax(A->rmap->n,A->cmap->n);
1706         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1707       }
1708       work = a->mult_work;
1709       xtmp = x;
1710       for (i=0; i<mbs; i++) {
1711         n     = ii[1] - ii[0]; ii++;
1712         ncols = n*bs;
1713         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1714         if (usecprow) xtmp = x + bs*ridx[i];
1715         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1716         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1717         v += n*bs2;
1718         if (!usecprow) xtmp += bs;
1719         workt = work;
1720         for (j=0; j<n; j++) {
1721           zb = z + bs*(*idx++);
1722           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1723           workt += bs;
1724         }
1725       }
1726     }
1727 #endif
1728   }
1729   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1730   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1731   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1732   PetscFunctionReturn(0);
1733 }
1734 
1735 #undef __FUNCT__
1736 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
1737 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1738 {
1739   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1740   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
1741   const PetscScalar *x,*xb = 0;
1742   const MatScalar   *v;
1743   PetscErrorCode    ierr;
1744   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1745   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1746   Mat_CompressedRow cprow   = a->compressedrow;
1747   PetscBool         usecprow=cprow.use;
1748 
1749   PetscFunctionBegin;
1750   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1751   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1752   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1753 
1754   idx = a->j;
1755   v   = a->a;
1756   if (usecprow) {
1757     mbs  = cprow.nrows;
1758     ii   = cprow.i;
1759     ridx = cprow.rindex;
1760   } else {
1761     mbs=a->mbs;
1762     ii = a->i;
1763     xb = x;
1764   }
1765 
1766   switch (bs) {
1767   case 1:
1768     for (i=0; i<mbs; i++) {
1769       if (usecprow) xb = x + ridx[i];
1770       x1 = xb[0];
1771       ib = idx + ii[0];
1772       n  = ii[1] - ii[0]; ii++;
1773       for (j=0; j<n; j++) {
1774         rval     = ib[j];
1775         z[rval] += *v * x1;
1776         v++;
1777       }
1778       if (!usecprow) xb++;
1779     }
1780     break;
1781   case 2:
1782     for (i=0; i<mbs; i++) {
1783       if (usecprow) xb = x + 2*ridx[i];
1784       x1 = xb[0]; x2 = xb[1];
1785       ib = idx + ii[0];
1786       n  = ii[1] - ii[0]; ii++;
1787       for (j=0; j<n; j++) {
1788         rval       = ib[j]*2;
1789         z[rval++] += v[0]*x1 + v[1]*x2;
1790         z[rval++] += v[2]*x1 + v[3]*x2;
1791         v         += 4;
1792       }
1793       if (!usecprow) xb += 2;
1794     }
1795     break;
1796   case 3:
1797     for (i=0; i<mbs; i++) {
1798       if (usecprow) xb = x + 3*ridx[i];
1799       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1800       ib = idx + ii[0];
1801       n  = ii[1] - ii[0]; ii++;
1802       for (j=0; j<n; j++) {
1803         rval       = ib[j]*3;
1804         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1805         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1806         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1807         v         += 9;
1808       }
1809       if (!usecprow) xb += 3;
1810     }
1811     break;
1812   case 4:
1813     for (i=0; i<mbs; i++) {
1814       if (usecprow) xb = x + 4*ridx[i];
1815       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1816       ib = idx + ii[0];
1817       n  = ii[1] - ii[0]; ii++;
1818       for (j=0; j<n; j++) {
1819         rval       = ib[j]*4;
1820         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1821         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1822         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1823         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1824         v         += 16;
1825       }
1826       if (!usecprow) xb += 4;
1827     }
1828     break;
1829   case 5:
1830     for (i=0; i<mbs; i++) {
1831       if (usecprow) xb = x + 5*ridx[i];
1832       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1833       x4 = xb[3]; x5 = xb[4];
1834       ib = idx + ii[0];
1835       n  = ii[1] - ii[0]; ii++;
1836       for (j=0; j<n; j++) {
1837         rval       = ib[j]*5;
1838         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1839         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1840         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1841         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1842         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1843         v         += 25;
1844       }
1845       if (!usecprow) xb += 5;
1846     }
1847     break;
1848   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1849     PetscInt          ncols,k;
1850     PetscScalar       *work,*workt;
1851     const PetscScalar *xtmp;
1852     if (!a->mult_work) {
1853       k    = PetscMax(A->rmap->n,A->cmap->n);
1854       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1855     }
1856     work = a->mult_work;
1857     xtmp = x;
1858     for (i=0; i<mbs; i++) {
1859       n     = ii[1] - ii[0]; ii++;
1860       ncols = n*bs;
1861       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1862       if (usecprow) xtmp = x + bs*ridx[i];
1863       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1864       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1865       v += n*bs2;
1866       if (!usecprow) xtmp += bs;
1867       workt = work;
1868       for (j=0; j<n; j++) {
1869         zb = z + bs*(*idx++);
1870         for (k=0; k<bs; k++) zb[k] += workt[k];
1871         workt += bs;
1872       }
1873     }
1874     }
1875   }
1876   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1877   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1878   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1879   PetscFunctionReturn(0);
1880 }
1881 
1882 #undef __FUNCT__
1883 #define __FUNCT__ "MatScale_SeqBAIJ"
1884 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1885 {
1886   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
1887   PetscInt       totalnz = a->bs2*a->nz;
1888   PetscScalar    oalpha  = alpha;
1889   PetscErrorCode ierr;
1890   PetscBLASInt   one = 1,tnz;
1891 
1892   PetscFunctionBegin;
1893   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
1894   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1895   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1896   PetscFunctionReturn(0);
1897 }
1898 
1899 #undef __FUNCT__
1900 #define __FUNCT__ "MatNorm_SeqBAIJ"
1901 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1902 {
1903   PetscErrorCode ierr;
1904   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
1905   MatScalar      *v  = a->a;
1906   PetscReal      sum = 0.0;
1907   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1908 
1909   PetscFunctionBegin;
1910   if (type == NORM_FROBENIUS) {
1911     for (i=0; i< bs2*nz; i++) {
1912       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1913     }
1914     *norm = PetscSqrtReal(sum);
1915     ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr);
1916   } else if (type == NORM_1) { /* maximum column sum */
1917     PetscReal *tmp;
1918     PetscInt  *bcol = a->j;
1919     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
1920     for (i=0; i<nz; i++) {
1921       for (j=0; j<bs; j++) {
1922         k1 = bs*(*bcol) + j; /* column index */
1923         for (k=0; k<bs; k++) {
1924           tmp[k1] += PetscAbsScalar(*v); v++;
1925         }
1926       }
1927       bcol++;
1928     }
1929     *norm = 0.0;
1930     for (j=0; j<A->cmap->n; j++) {
1931       if (tmp[j] > *norm) *norm = tmp[j];
1932     }
1933     ierr = PetscFree(tmp);CHKERRQ(ierr);
1934     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
1935   } else if (type == NORM_INFINITY) { /* maximum row sum */
1936     *norm = 0.0;
1937     for (k=0; k<bs; k++) {
1938       for (j=0; j<a->mbs; j++) {
1939         v   = a->a + bs2*a->i[j] + k;
1940         sum = 0.0;
1941         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1942           for (k1=0; k1<bs; k1++) {
1943             sum += PetscAbsScalar(*v);
1944             v   += bs;
1945           }
1946         }
1947         if (sum > *norm) *norm = sum;
1948       }
1949     }
1950     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
1951   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1952   PetscFunctionReturn(0);
1953 }
1954 
1955 
1956 #undef __FUNCT__
1957 #define __FUNCT__ "MatEqual_SeqBAIJ"
1958 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1959 {
1960   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1961   PetscErrorCode ierr;
1962 
1963   PetscFunctionBegin;
1964   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1965   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1966     *flg = PETSC_FALSE;
1967     PetscFunctionReturn(0);
1968   }
1969 
1970   /* if the a->i are the same */
1971   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1972   if (!*flg) PetscFunctionReturn(0);
1973 
1974   /* if a->j are the same */
1975   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1976   if (!*flg) PetscFunctionReturn(0);
1977 
1978   /* if a->a are the same */
1979   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1980   PetscFunctionReturn(0);
1981 
1982 }
1983 
1984 #undef __FUNCT__
1985 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
1986 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1987 {
1988   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1989   PetscErrorCode ierr;
1990   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1991   PetscScalar    *x,zero = 0.0;
1992   MatScalar      *aa,*aa_j;
1993 
1994   PetscFunctionBegin;
1995   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1996   bs   = A->rmap->bs;
1997   aa   = a->a;
1998   ai   = a->i;
1999   aj   = a->j;
2000   ambs = a->mbs;
2001   bs2  = a->bs2;
2002 
2003   ierr = VecSet(v,zero);CHKERRQ(ierr);
2004   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2005   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2006   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2007   for (i=0; i<ambs; i++) {
2008     for (j=ai[i]; j<ai[i+1]; j++) {
2009       if (aj[j] == i) {
2010         row  = i*bs;
2011         aa_j = aa+j*bs2;
2012         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2013         break;
2014       }
2015     }
2016   }
2017   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2018   PetscFunctionReturn(0);
2019 }
2020 
2021 #undef __FUNCT__
2022 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2023 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2024 {
2025   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2026   const PetscScalar *l,*r,*li,*ri;
2027   PetscScalar       x;
2028   MatScalar         *aa, *v;
2029   PetscErrorCode    ierr;
2030   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2031   const PetscInt    *ai,*aj;
2032 
2033   PetscFunctionBegin;
2034   ai  = a->i;
2035   aj  = a->j;
2036   aa  = a->a;
2037   m   = A->rmap->n;
2038   n   = A->cmap->n;
2039   bs  = A->rmap->bs;
2040   mbs = a->mbs;
2041   bs2 = a->bs2;
2042   if (ll) {
2043     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2044     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2045     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2046     for (i=0; i<mbs; i++) { /* for each block row */
2047       M  = ai[i+1] - ai[i];
2048       li = l + i*bs;
2049       v  = aa + bs2*ai[i];
2050       for (j=0; j<M; j++) { /* for each block */
2051         for (k=0; k<bs2; k++) {
2052           (*v++) *= li[k%bs];
2053         }
2054       }
2055     }
2056     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2057     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2058   }
2059 
2060   if (rr) {
2061     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2062     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2063     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2064     for (i=0; i<mbs; i++) { /* for each block row */
2065       iai = ai[i];
2066       M   = ai[i+1] - iai;
2067       v   = aa + bs2*iai;
2068       for (j=0; j<M; j++) { /* for each block */
2069         ri = r + bs*aj[iai+j];
2070         for (k=0; k<bs; k++) {
2071           x = ri[k];
2072           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2073           v += bs;
2074         }
2075       }
2076     }
2077     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2078     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2079   }
2080   PetscFunctionReturn(0);
2081 }
2082 
2083 
2084 #undef __FUNCT__
2085 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2086 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2087 {
2088   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2089 
2090   PetscFunctionBegin;
2091   info->block_size   = a->bs2;
2092   info->nz_allocated = a->bs2*a->maxnz;
2093   info->nz_used      = a->bs2*a->nz;
2094   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2095   info->assemblies   = A->num_ass;
2096   info->mallocs      = A->info.mallocs;
2097   info->memory       = ((PetscObject)A)->mem;
2098   if (A->factortype) {
2099     info->fill_ratio_given  = A->info.fill_ratio_given;
2100     info->fill_ratio_needed = A->info.fill_ratio_needed;
2101     info->factor_mallocs    = A->info.factor_mallocs;
2102   } else {
2103     info->fill_ratio_given  = 0;
2104     info->fill_ratio_needed = 0;
2105     info->factor_mallocs    = 0;
2106   }
2107   PetscFunctionReturn(0);
2108 }
2109 
2110 #undef __FUNCT__
2111 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2112 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2113 {
2114   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2115   PetscErrorCode ierr;
2116 
2117   PetscFunctionBegin;
2118   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2119   PetscFunctionReturn(0);
2120 }
2121