xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision feff33ee0b5b037fa8f9f294dede656a2f85cc47)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H)
8 #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
16   const PetscInt *idx;
17   PetscInt       start,end,*ai,*aj,bs,*nidx2;
18   PetscBT        table;
19 
20   PetscFunctionBegin;
21   m  = a->mbs;
22   ai = a->i;
23   aj = a->j;
24   bs = A->rmap->bs;
25 
26   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
27 
28   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
29   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
30   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
31 
32   for (i=0; i<is_max; i++) {
33     /* Initialise the two local arrays */
34     isz  = 0;
35     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
36 
37     /* Extract the indices, assume there can be duplicate entries */
38     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
39     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
40 
41     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42     for (j=0; j<n; ++j) {
43       ival = idx[j]/bs; /* convert the indices into block indices */
44       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
45       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++) { /* for each overlap*/
52       n = isz;
53       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
54         row   = nidx[k];
55         start = ai[row];
56         end   = ai[row+1];
57         for (l = start; l<end; l++) {
58           val = aj[l];
59           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
60         }
61       }
62     }
63     /* expand the Index Set */
64     for (j=0; j<isz; j++) {
65       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
66     }
67     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
68   }
69   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
70   ierr = PetscFree(nidx);CHKERRQ(ierr);
71   ierr = PetscFree(nidx2);CHKERRQ(ierr);
72   PetscFunctionReturn(0);
73 }
74 
75 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76 {
77   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
78   PetscErrorCode ierr;
79   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
81   const PetscInt *irow,*icol;
82   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83   PetscInt       *aj = a->j,*ai = a->i;
84   MatScalar      *mat_a;
85   Mat            C;
86   PetscBool      flag;
87 
88   PetscFunctionBegin;
89   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
90   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
91   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
92   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
93 
94   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
95   ssmap = smap;
96   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
97   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
98   /* determine lens of each row */
99   for (i=0; i<nrows; i++) {
100     kstart  = ai[irow[i]];
101     kend    = kstart + a->ilen[irow[i]];
102     lens[i] = 0;
103     for (k=kstart; k<kend; k++) {
104       if (ssmap[aj[k]]) lens[i]++;
105     }
106   }
107   /* Create and fill new matrix */
108   if (scall == MAT_REUSE_MATRIX) {
109     c = (Mat_SeqBAIJ*)((*B)->data);
110 
111     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
112     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
113     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
114     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
115     C    = *B;
116   } else {
117     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
118     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
119     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
120     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
121   }
122   c = (Mat_SeqBAIJ*)(C->data);
123   for (i=0; i<nrows; i++) {
124     row      = irow[i];
125     kstart   = ai[row];
126     kend     = kstart + a->ilen[row];
127     mat_i    = c->i[i];
128     mat_j    = c->j + mat_i;
129     mat_a    = c->a + mat_i*bs2;
130     mat_ilen = c->ilen + i;
131     for (k=kstart; k<kend; k++) {
132       if ((tcol=ssmap[a->j[k]])) {
133         *mat_j++ = tcol - 1;
134         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
135         mat_a   += bs2;
136         (*mat_ilen)++;
137       }
138     }
139   }
140   /* sort */
141   {
142     MatScalar *work;
143     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
144     for (i=0; i<nrows; i++) {
145       PetscInt ilen;
146       mat_i = c->i[i];
147       mat_j = c->j + mat_i;
148       mat_a = c->a + mat_i*bs2;
149       ilen  = c->ilen[i];
150       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
151     }
152     ierr = PetscFree(work);CHKERRQ(ierr);
153   }
154 
155   /* Free work space */
156   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
157   ierr = PetscFree(smap);CHKERRQ(ierr);
158   ierr = PetscFree(lens);CHKERRQ(ierr);
159   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
160   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
161 
162   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
163   *B   = C;
164   PetscFunctionReturn(0);
165 }
166 
167 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
168 {
169   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
170   IS             is1,is2;
171   PetscErrorCode ierr;
172   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
173   const PetscInt *irow,*icol;
174 
175   PetscFunctionBegin;
176   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
177   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
178   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
179   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
180 
181   /* Verify if the indices corespond to each element in a block
182    and form the IS with compressed IS */
183   maxmnbs = PetscMax(a->mbs,a->nbs);
184   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
185   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
186   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
187   for (i=0; i<a->mbs; i++) {
188     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
189   }
190   count = 0;
191   for (i=0; i<nrows; i++) {
192     j = irow[i] / bs;
193     if ((vary[j]--)==bs) iary[count++] = j;
194   }
195   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
196 
197   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
198   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
199   for (i=0; i<a->nbs; i++) {
200     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
201   }
202   count = 0;
203   for (i=0; i<ncols; i++) {
204     j = icol[i] / bs;
205     if ((vary[j]--)==bs) iary[count++] = j;
206   }
207   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
208   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
209   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
210   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
211 
212   ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
213   ierr = ISDestroy(&is1);CHKERRQ(ierr);
214   ierr = ISDestroy(&is2);CHKERRQ(ierr);
215   PetscFunctionReturn(0);
216 }
217 
218 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
219 {
220   PetscErrorCode ierr;
221   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
222   Mat_SubSppt    *submatj = c->submatis1;
223 
224   PetscFunctionBegin;
225   ierr = submatj->destroy(C);CHKERRQ(ierr);
226   ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
227   PetscFunctionReturn(0);
228 }
229 
230 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
231 {
232   PetscErrorCode ierr;
233   PetscInt       i;
234   Mat            C;
235   Mat_SeqBAIJ    *c;
236   Mat_SubSppt    *submatj;
237 
238   PetscFunctionBegin;
239   for (i=0; i<n; i++) {
240     C       = (*mat)[i];
241     c       = (Mat_SeqBAIJ*)C->data;
242     submatj = c->submatis1;
243     if (submatj) {
244       ierr = submatj->destroy(C);CHKERRQ(ierr);
245       ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
246       ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr);
247       ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr);
248       ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr);
249     } else {
250       ierr = MatDestroy(&C);CHKERRQ(ierr);
251     }
252   }
253   ierr = PetscFree(*mat);CHKERRQ(ierr);
254   PetscFunctionReturn(0);
255 }
256 
257 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
258 {
259   PetscErrorCode ierr;
260   PetscInt       i;
261 
262   PetscFunctionBegin;
263   if (scall == MAT_INITIAL_MATRIX) {
264     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
265   }
266 
267   for (i=0; i<n; i++) {
268     ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
269   }
270   PetscFunctionReturn(0);
271 }
272 
273 /* -------------------------------------------------------*/
274 /* Should check that shapes of vectors and matrices match */
275 /* -------------------------------------------------------*/
276 
277 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
278 {
279   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
280   PetscScalar       *z,sum;
281   const PetscScalar *x;
282   const MatScalar   *v;
283   PetscErrorCode    ierr;
284   PetscInt          mbs,i,n;
285   const PetscInt    *idx,*ii,*ridx=NULL;
286   PetscBool         usecprow=a->compressedrow.use;
287 
288   PetscFunctionBegin;
289   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
290   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
291 
292   if (usecprow) {
293     mbs  = a->compressedrow.nrows;
294     ii   = a->compressedrow.i;
295     ridx = a->compressedrow.rindex;
296     ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
297   } else {
298     mbs = a->mbs;
299     ii  = a->i;
300   }
301 
302   for (i=0; i<mbs; i++) {
303     n   = ii[1] - ii[0];
304     v   = a->a + ii[0];
305     idx = a->j + ii[0];
306     ii++;
307     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
308     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
309     sum = 0.0;
310     PetscSparseDensePlusDot(sum,x,v,idx,n);
311     if (usecprow) {
312       z[ridx[i]] = sum;
313     } else {
314       z[i]       = sum;
315     }
316   }
317   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
318   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
319   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
320   PetscFunctionReturn(0);
321 }
322 
323 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
324 {
325   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
326   PetscScalar       *z = 0,sum1,sum2,*zarray;
327   const PetscScalar *x,*xb;
328   PetscScalar       x1,x2;
329   const MatScalar   *v;
330   PetscErrorCode    ierr;
331   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
332   PetscBool         usecprow=a->compressedrow.use;
333 
334   PetscFunctionBegin;
335   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
336   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
337 
338   idx = a->j;
339   v   = a->a;
340   if (usecprow) {
341     mbs  = a->compressedrow.nrows;
342     ii   = a->compressedrow.i;
343     ridx = a->compressedrow.rindex;
344     ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
345   } else {
346     mbs = a->mbs;
347     ii  = a->i;
348     z   = zarray;
349   }
350 
351   for (i=0; i<mbs; i++) {
352     n           = ii[1] - ii[0]; ii++;
353     sum1        = 0.0; sum2 = 0.0;
354     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
355     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
356     for (j=0; j<n; j++) {
357       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
358       sum1 += v[0]*x1 + v[2]*x2;
359       sum2 += v[1]*x1 + v[3]*x2;
360       v    += 4;
361     }
362     if (usecprow) z = zarray + 2*ridx[i];
363     z[0] = sum1; z[1] = sum2;
364     if (!usecprow) z += 2;
365   }
366   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
367   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
368   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
369   PetscFunctionReturn(0);
370 }
371 
372 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
373 {
374   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
375   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
376   const PetscScalar *x,*xb;
377   const MatScalar   *v;
378   PetscErrorCode    ierr;
379   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
380   PetscBool         usecprow=a->compressedrow.use;
381 
382 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
383 #pragma disjoint(*v,*z,*xb)
384 #endif
385 
386   PetscFunctionBegin;
387   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
388   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
389 
390   idx = a->j;
391   v   = a->a;
392   if (usecprow) {
393     mbs  = a->compressedrow.nrows;
394     ii   = a->compressedrow.i;
395     ridx = a->compressedrow.rindex;
396     ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
397   } else {
398     mbs = a->mbs;
399     ii  = a->i;
400     z   = zarray;
401   }
402 
403   for (i=0; i<mbs; i++) {
404     n           = ii[1] - ii[0]; ii++;
405     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
406     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
407     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
408     for (j=0; j<n; j++) {
409       xb = x + 3*(*idx++);
410       x1 = xb[0];
411       x2 = xb[1];
412       x3 = xb[2];
413 
414       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
415       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
416       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
417       v    += 9;
418     }
419     if (usecprow) z = zarray + 3*ridx[i];
420     z[0] = sum1; z[1] = sum2; z[2] = sum3;
421     if (!usecprow) z += 3;
422   }
423   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
424   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
425   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
426   PetscFunctionReturn(0);
427 }
428 
429 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
430 {
431   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
432   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
433   const PetscScalar *x,*xb;
434   const MatScalar   *v;
435   PetscErrorCode    ierr;
436   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
437   PetscBool         usecprow=a->compressedrow.use;
438 
439   PetscFunctionBegin;
440   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
441   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
442 
443   idx = a->j;
444   v   = a->a;
445   if (usecprow) {
446     mbs  = a->compressedrow.nrows;
447     ii   = a->compressedrow.i;
448     ridx = a->compressedrow.rindex;
449     ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
450   } else {
451     mbs = a->mbs;
452     ii  = a->i;
453     z   = zarray;
454   }
455 
456   for (i=0; i<mbs; i++) {
457     n = ii[1] - ii[0];
458     ii++;
459     sum1 = 0.0;
460     sum2 = 0.0;
461     sum3 = 0.0;
462     sum4 = 0.0;
463 
464     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
465     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
466     for (j=0; j<n; j++) {
467       xb    = x + 4*(*idx++);
468       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
469       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
470       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
471       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
472       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
473       v    += 16;
474     }
475     if (usecprow) z = zarray + 4*ridx[i];
476     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
477     if (!usecprow) z += 4;
478   }
479   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
480   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
481   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
482   PetscFunctionReturn(0);
483 }
484 
485 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
486 {
487   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
488   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
489   const PetscScalar *xb,*x;
490   const MatScalar   *v;
491   PetscErrorCode    ierr;
492   const PetscInt    *idx,*ii,*ridx=NULL;
493   PetscInt          mbs,i,j,n;
494   PetscBool         usecprow=a->compressedrow.use;
495 
496   PetscFunctionBegin;
497   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
498   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
499 
500   idx = a->j;
501   v   = a->a;
502   if (usecprow) {
503     mbs  = a->compressedrow.nrows;
504     ii   = a->compressedrow.i;
505     ridx = a->compressedrow.rindex;
506     ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
507   } else {
508     mbs = a->mbs;
509     ii  = a->i;
510     z   = zarray;
511   }
512 
513   for (i=0; i<mbs; i++) {
514     n           = ii[1] - ii[0]; ii++;
515     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
516     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
517     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
518     for (j=0; j<n; j++) {
519       xb    = x + 5*(*idx++);
520       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
521       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
522       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
523       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
524       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
525       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
526       v    += 25;
527     }
528     if (usecprow) z = zarray + 5*ridx[i];
529     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
530     if (!usecprow) z += 5;
531   }
532   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
533   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
534   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
535   PetscFunctionReturn(0);
536 }
537 
538 
539 
540 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
541 {
542   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
543   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
544   const PetscScalar *x,*xb;
545   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
546   const MatScalar   *v;
547   PetscErrorCode    ierr;
548   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
549   PetscBool         usecprow=a->compressedrow.use;
550 
551   PetscFunctionBegin;
552   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
553   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
554 
555   idx = a->j;
556   v   = a->a;
557   if (usecprow) {
558     mbs  = a->compressedrow.nrows;
559     ii   = a->compressedrow.i;
560     ridx = a->compressedrow.rindex;
561     ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
562   } else {
563     mbs = a->mbs;
564     ii  = a->i;
565     z   = zarray;
566   }
567 
568   for (i=0; i<mbs; i++) {
569     n  = ii[1] - ii[0];
570     ii++;
571     sum1 = 0.0;
572     sum2 = 0.0;
573     sum3 = 0.0;
574     sum4 = 0.0;
575     sum5 = 0.0;
576     sum6 = 0.0;
577 
578     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
579     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580     for (j=0; j<n; j++) {
581       xb    = x + 6*(*idx++);
582       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
583       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
584       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
585       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
586       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
587       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
588       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
589       v    += 36;
590     }
591     if (usecprow) z = zarray + 6*ridx[i];
592     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
593     if (!usecprow) z += 6;
594   }
595 
596   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
597   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
598   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
599   PetscFunctionReturn(0);
600 }
601 
602 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
603 {
604   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
605   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
606   const PetscScalar *x,*xb;
607   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
608   const MatScalar   *v;
609   PetscErrorCode    ierr;
610   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
611   PetscBool         usecprow=a->compressedrow.use;
612 
613   PetscFunctionBegin;
614   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
615   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
616 
617   idx = a->j;
618   v   = a->a;
619   if (usecprow) {
620     mbs  = a->compressedrow.nrows;
621     ii   = a->compressedrow.i;
622     ridx = a->compressedrow.rindex;
623     ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
624   } else {
625     mbs = a->mbs;
626     ii  = a->i;
627     z   = zarray;
628   }
629 
630   for (i=0; i<mbs; i++) {
631     n  = ii[1] - ii[0];
632     ii++;
633     sum1 = 0.0;
634     sum2 = 0.0;
635     sum3 = 0.0;
636     sum4 = 0.0;
637     sum5 = 0.0;
638     sum6 = 0.0;
639     sum7 = 0.0;
640 
641     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
642     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
643     for (j=0; j<n; j++) {
644       xb    = x + 7*(*idx++);
645       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
646       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
647       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
648       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
649       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
650       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
651       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
652       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
653       v    += 49;
654     }
655     if (usecprow) z = zarray + 7*ridx[i];
656     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
657     if (!usecprow) z += 7;
658   }
659 
660   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
661   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
662   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
663   PetscFunctionReturn(0);
664 }
665 
666 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
667 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
668 {
669   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
670   PetscScalar    *z = 0,*work,*workt,*zarray;
671   const PetscScalar *x,*xb;
672   const MatScalar   *v;
673   PetscErrorCode ierr;
674   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
675   const PetscInt *idx,*ii,*ridx=NULL;
676   PetscInt       k;
677   PetscBool      usecprow=a->compressedrow.use;
678 
679   __m256d a0,a1,a2,a3,a4,a5;
680   __m256d w0,w1,w2,w3;
681   __m256d z0,z1,z2;
682   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
683 
684   PetscFunctionBegin;
685   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
686   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
687 
688   idx = a->j;
689   v   = a->a;
690   if (usecprow) {
691     mbs  = a->compressedrow.nrows;
692     ii   = a->compressedrow.i;
693     ridx = a->compressedrow.rindex;
694     ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
695   } else {
696     mbs = a->mbs;
697     ii  = a->i;
698     z   = zarray;
699   }
700 
701   if (!a->mult_work) {
702     k    = PetscMax(A->rmap->n,A->cmap->n);
703     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
704   }
705 
706   work = a->mult_work;
707   for (i=0; i<mbs; i++) {
708     n           = ii[1] - ii[0]; ii++;
709     workt       = work;
710     for (j=0; j<n; j++) {
711       xb = x + bs*(*idx++);
712       for (k=0; k<bs; k++) workt[k] = xb[k];
713       workt += bs;
714     }
715     if (usecprow) z = zarray + bs*ridx[i];
716 
717     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
718 
719     for (j=0; j<n; j++) {
720       // first column of a
721       w0 = _mm256_set1_pd(work[j*9  ]);
722       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
723       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
724       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
725 
726       // second column of a
727       w1 = _mm256_set1_pd(work[j*9+ 1]);
728       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
729       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
730       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
731 
732       // third column of a
733       w2 = _mm256_set1_pd(work[j*9 +2]);
734       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
735       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
736       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
737 
738       // fourth column of a
739       w3 = _mm256_set1_pd(work[j*9+ 3]);
740       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
741       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
742       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
743 
744       // fifth column of a
745       w0 = _mm256_set1_pd(work[j*9+ 4]);
746       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
747       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
748       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
749 
750       // sixth column of a
751       w1 = _mm256_set1_pd(work[j*9+ 5]);
752       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
753       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
754       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
755 
756       // seventh column of a
757       w2 = _mm256_set1_pd(work[j*9+ 6]);
758       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
759       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
760       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
761 
762       // eigth column of a
763       w3 = _mm256_set1_pd(work[j*9+ 7]);
764       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
765       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
766       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
767 
768       // ninth column of a
769       w0 = _mm256_set1_pd(work[j*9+ 8]);
770       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
771       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
772       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
773     }
774 
775     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
776 
777     v += n*bs2;
778     if (!usecprow) z += bs;
779   }
780   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
781   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
782   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
783   PetscFunctionReturn(0);
784 }
785 #endif
786 
787 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
788 {
789   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
790   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
791   const PetscScalar *x,*xb;
792   PetscScalar       *zarray,xv;
793   const MatScalar   *v;
794   PetscErrorCode    ierr;
795   const PetscInt    *ii,*ij=a->j,*idx;
796   PetscInt          mbs,i,j,k,n,*ridx=NULL;
797   PetscBool         usecprow=a->compressedrow.use;
798 
799   PetscFunctionBegin;
800   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
801   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
802 
803   v = a->a;
804   if (usecprow) {
805     mbs  = a->compressedrow.nrows;
806     ii   = a->compressedrow.i;
807     ridx = a->compressedrow.rindex;
808     ierr = PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
809   } else {
810     mbs = a->mbs;
811     ii  = a->i;
812     z   = zarray;
813   }
814 
815   for (i=0; i<mbs; i++) {
816     n    = ii[i+1] - ii[i];
817     idx  = ij + ii[i];
818     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
819     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
820 
821     for (j=0; j<n; j++) {
822       xb = x + 11*(idx[j]);
823 
824       for (k=0; k<11; k++) {
825         xv     =  xb[k];
826         sum1  += v[0]*xv;
827         sum2  += v[1]*xv;
828         sum3  += v[2]*xv;
829         sum4  += v[3]*xv;
830         sum5  += v[4]*xv;
831         sum6  += v[5]*xv;
832         sum7  += v[6]*xv;
833         sum8  += v[7]*xv;
834         sum9  += v[8]*xv;
835         sum10 += v[9]*xv;
836         sum11 += v[10]*xv;
837         v     += 11;
838       }
839     }
840     if (usecprow) z = zarray + 11*ridx[i];
841     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
842     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
843 
844     if (!usecprow) z += 11;
845   }
846 
847   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
848   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
849   ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr);
850   PetscFunctionReturn(0);
851 }
852 
853 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
854 /* Default MatMult for block size 15 */
855 
856 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
857 {
858   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
859   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
860   const PetscScalar *x,*xb;
861   PetscScalar       *zarray,xv;
862   const MatScalar   *v;
863   PetscErrorCode    ierr;
864   const PetscInt    *ii,*ij=a->j,*idx;
865   PetscInt          mbs,i,j,k,n,*ridx=NULL;
866   PetscBool         usecprow=a->compressedrow.use;
867 
868   PetscFunctionBegin;
869   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
870   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
871 
872   v = a->a;
873   if (usecprow) {
874     mbs  = a->compressedrow.nrows;
875     ii   = a->compressedrow.i;
876     ridx = a->compressedrow.rindex;
877     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
878   } else {
879     mbs = a->mbs;
880     ii  = a->i;
881     z   = zarray;
882   }
883 
884   for (i=0; i<mbs; i++) {
885     n    = ii[i+1] - ii[i];
886     idx  = ij + ii[i];
887     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
888     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
889 
890     for (j=0; j<n; j++) {
891       xb = x + 15*(idx[j]);
892 
893       for (k=0; k<15; k++) {
894         xv     =  xb[k];
895         sum1  += v[0]*xv;
896         sum2  += v[1]*xv;
897         sum3  += v[2]*xv;
898         sum4  += v[3]*xv;
899         sum5  += v[4]*xv;
900         sum6  += v[5]*xv;
901         sum7  += v[6]*xv;
902         sum8  += v[7]*xv;
903         sum9  += v[8]*xv;
904         sum10 += v[9]*xv;
905         sum11 += v[10]*xv;
906         sum12 += v[11]*xv;
907         sum13 += v[12]*xv;
908         sum14 += v[13]*xv;
909         sum15 += v[14]*xv;
910         v     += 15;
911       }
912     }
913     if (usecprow) z = zarray + 15*ridx[i];
914     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
915     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
916 
917     if (!usecprow) z += 15;
918   }
919 
920   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
921   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
922   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
923   PetscFunctionReturn(0);
924 }
925 
926 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
927 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
928 {
929   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
930   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
931   const PetscScalar *x,*xb;
932   PetscScalar       x1,x2,x3,x4,*zarray;
933   const MatScalar   *v;
934   PetscErrorCode    ierr;
935   const PetscInt    *ii,*ij=a->j,*idx;
936   PetscInt          mbs,i,j,n,*ridx=NULL;
937   PetscBool         usecprow=a->compressedrow.use;
938 
939   PetscFunctionBegin;
940   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
941   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
942 
943   v = a->a;
944   if (usecprow) {
945     mbs  = a->compressedrow.nrows;
946     ii   = a->compressedrow.i;
947     ridx = a->compressedrow.rindex;
948     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
949   } else {
950     mbs = a->mbs;
951     ii  = a->i;
952     z   = zarray;
953   }
954 
955   for (i=0; i<mbs; i++) {
956     n    = ii[i+1] - ii[i];
957     idx  = ij + ii[i];
958     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
959     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
960 
961     for (j=0; j<n; j++) {
962       xb = x + 15*(idx[j]);
963       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
964 
965       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
966       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
967       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
968       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
969       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
970       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
971       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
972       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
973       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
974       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
975       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
976       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
977       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
978       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
979       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
980 
981       v += 60;
982 
983       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
984 
985       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
986       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
987       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
988       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
989       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
990       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
991       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
992       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
993       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
994       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
995       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
996       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
997       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
998       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
999       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1000       v     += 60;
1001 
1002       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1003       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1004       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1005       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1006       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1007       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1008       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1009       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1010       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1011       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1012       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1013       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1014       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1015       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1016       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1017       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1018       v     += 60;
1019 
1020       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1021       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1022       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1023       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1024       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1025       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1026       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1027       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1028       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1029       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1030       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1031       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1032       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1033       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1034       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1035       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1036       v     += 45;
1037     }
1038     if (usecprow) z = zarray + 15*ridx[i];
1039     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1040     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1041 
1042     if (!usecprow) z += 15;
1043   }
1044 
1045   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1046   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1047   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1048   PetscFunctionReturn(0);
1049 }
1050 
1051 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1052 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1053 {
1054   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1055   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1056   const PetscScalar *x,*xb;
1057   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1058   const MatScalar   *v;
1059   PetscErrorCode    ierr;
1060   const PetscInt    *ii,*ij=a->j,*idx;
1061   PetscInt          mbs,i,j,n,*ridx=NULL;
1062   PetscBool         usecprow=a->compressedrow.use;
1063 
1064   PetscFunctionBegin;
1065   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1066   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1067 
1068   v = a->a;
1069   if (usecprow) {
1070     mbs  = a->compressedrow.nrows;
1071     ii   = a->compressedrow.i;
1072     ridx = a->compressedrow.rindex;
1073     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1074   } else {
1075     mbs = a->mbs;
1076     ii  = a->i;
1077     z   = zarray;
1078   }
1079 
1080   for (i=0; i<mbs; i++) {
1081     n    = ii[i+1] - ii[i];
1082     idx  = ij + ii[i];
1083     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1084     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1085 
1086     for (j=0; j<n; j++) {
1087       xb = x + 15*(idx[j]);
1088       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1089       x8 = xb[7];
1090 
1091       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1092       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1093       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1094       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1095       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1096       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1097       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1098       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1099       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1100       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1101       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1102       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1103       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1104       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1105       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1106       v     += 120;
1107 
1108       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1109 
1110       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1111       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1112       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1113       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1114       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1115       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1116       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1117       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1118       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1119       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1120       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1121       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1122       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1123       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1124       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1125       v     += 105;
1126     }
1127     if (usecprow) z = zarray + 15*ridx[i];
1128     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1129     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1130 
1131     if (!usecprow) z += 15;
1132   }
1133 
1134   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1135   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1136   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1137   PetscFunctionReturn(0);
1138 }
1139 
1140 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1141 
1142 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1143 {
1144   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1145   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1146   const PetscScalar *x,*xb;
1147   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1148   const MatScalar   *v;
1149   PetscErrorCode    ierr;
1150   const PetscInt    *ii,*ij=a->j,*idx;
1151   PetscInt          mbs,i,j,n,*ridx=NULL;
1152   PetscBool         usecprow=a->compressedrow.use;
1153 
1154   PetscFunctionBegin;
1155   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1156   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1157 
1158   v = a->a;
1159   if (usecprow) {
1160     mbs  = a->compressedrow.nrows;
1161     ii   = a->compressedrow.i;
1162     ridx = a->compressedrow.rindex;
1163     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1164   } else {
1165     mbs = a->mbs;
1166     ii  = a->i;
1167     z   = zarray;
1168   }
1169 
1170   for (i=0; i<mbs; i++) {
1171     n    = ii[i+1] - ii[i];
1172     idx  = ij + ii[i];
1173     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1174     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1175 
1176     for (j=0; j<n; j++) {
1177       xb = x + 15*(idx[j]);
1178       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1179       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1180 
1181       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1182       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1183       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1184       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1185       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1186       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1187       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1188       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1189       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1190       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1191       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1192       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1193       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1194       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1195       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1196       v     += 225;
1197     }
1198     if (usecprow) z = zarray + 15*ridx[i];
1199     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1200     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1201 
1202     if (!usecprow) z += 15;
1203   }
1204 
1205   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1206   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1207   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1208   PetscFunctionReturn(0);
1209 }
1210 
1211 
1212 /*
1213     This will not work with MatScalar == float because it calls the BLAS
1214 */
1215 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1216 {
1217   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1218   PetscScalar       *z = 0,*work,*workt,*zarray;
1219   const PetscScalar *x,*xb;
1220   const MatScalar   *v;
1221   PetscErrorCode    ierr;
1222   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1223   const PetscInt    *idx,*ii,*ridx=NULL;
1224   PetscInt          ncols,k;
1225   PetscBool         usecprow=a->compressedrow.use;
1226 
1227   PetscFunctionBegin;
1228   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1229   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1230 
1231   idx = a->j;
1232   v   = a->a;
1233   if (usecprow) {
1234     mbs  = a->compressedrow.nrows;
1235     ii   = a->compressedrow.i;
1236     ridx = a->compressedrow.rindex;
1237     ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1238   } else {
1239     mbs = a->mbs;
1240     ii  = a->i;
1241     z   = zarray;
1242   }
1243 
1244   if (!a->mult_work) {
1245     k    = PetscMax(A->rmap->n,A->cmap->n);
1246     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1247   }
1248   work = a->mult_work;
1249   for (i=0; i<mbs; i++) {
1250     n           = ii[1] - ii[0]; ii++;
1251     ncols       = n*bs;
1252     workt       = work;
1253     for (j=0; j<n; j++) {
1254       xb = x + bs*(*idx++);
1255       for (k=0; k<bs; k++) workt[k] = xb[k];
1256       workt += bs;
1257     }
1258     if (usecprow) z = zarray + bs*ridx[i];
1259     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1260     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1261     v += n*bs2;
1262     if (!usecprow) z += bs;
1263   }
1264   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1265   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1266   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1267   PetscFunctionReturn(0);
1268 }
1269 
1270 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1271 {
1272   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1273   const PetscScalar *x;
1274   PetscScalar       *y,*z,sum;
1275   const MatScalar   *v;
1276   PetscErrorCode    ierr;
1277   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1278   const PetscInt    *idx,*ii;
1279   PetscBool         usecprow=a->compressedrow.use;
1280 
1281   PetscFunctionBegin;
1282   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1283   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1284 
1285   idx = a->j;
1286   v   = a->a;
1287   if (usecprow) {
1288     if (zz != yy) {
1289       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1290     }
1291     mbs  = a->compressedrow.nrows;
1292     ii   = a->compressedrow.i;
1293     ridx = a->compressedrow.rindex;
1294   } else {
1295     ii = a->i;
1296   }
1297 
1298   for (i=0; i<mbs; i++) {
1299     n = ii[1] - ii[0];
1300     ii++;
1301     if (!usecprow) {
1302       sum         = y[i];
1303     } else {
1304       sum = y[ridx[i]];
1305     }
1306     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1307     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1308     PetscSparseDensePlusDot(sum,x,v,idx,n);
1309     v   += n;
1310     idx += n;
1311     if (usecprow) {
1312       z[ridx[i]] = sum;
1313     } else {
1314       z[i] = sum;
1315     }
1316   }
1317   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1318   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1319   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1320   PetscFunctionReturn(0);
1321 }
1322 
1323 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1324 {
1325   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1326   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1327   const PetscScalar *x,*xb;
1328   PetscScalar       x1,x2,*yarray,*zarray;
1329   const MatScalar   *v;
1330   PetscErrorCode    ierr;
1331   PetscInt          mbs = a->mbs,i,n,j;
1332   const PetscInt    *idx,*ii,*ridx = NULL;
1333   PetscBool         usecprow = a->compressedrow.use;
1334 
1335   PetscFunctionBegin;
1336   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1337   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1338 
1339   idx = a->j;
1340   v   = a->a;
1341   if (usecprow) {
1342     if (zz != yy) {
1343       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1344     }
1345     mbs  = a->compressedrow.nrows;
1346     ii   = a->compressedrow.i;
1347     ridx = a->compressedrow.rindex;
1348   } else {
1349     ii = a->i;
1350     y  = yarray;
1351     z  = zarray;
1352   }
1353 
1354   for (i=0; i<mbs; i++) {
1355     n = ii[1] - ii[0]; ii++;
1356     if (usecprow) {
1357       z = zarray + 2*ridx[i];
1358       y = yarray + 2*ridx[i];
1359     }
1360     sum1 = y[0]; sum2 = y[1];
1361     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1362     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1363     for (j=0; j<n; j++) {
1364       xb = x + 2*(*idx++);
1365       x1 = xb[0];
1366       x2 = xb[1];
1367 
1368       sum1 += v[0]*x1 + v[2]*x2;
1369       sum2 += v[1]*x1 + v[3]*x2;
1370       v    += 4;
1371     }
1372     z[0] = sum1; z[1] = sum2;
1373     if (!usecprow) {
1374       z += 2; y += 2;
1375     }
1376   }
1377   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1378   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1379   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1380   PetscFunctionReturn(0);
1381 }
1382 
1383 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1384 {
1385   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1386   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1387   const PetscScalar *x,*xb;
1388   const MatScalar   *v;
1389   PetscErrorCode    ierr;
1390   PetscInt          mbs = a->mbs,i,j,n;
1391   const PetscInt    *idx,*ii,*ridx = NULL;
1392   PetscBool         usecprow = a->compressedrow.use;
1393 
1394   PetscFunctionBegin;
1395   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1396   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1397 
1398   idx = a->j;
1399   v   = a->a;
1400   if (usecprow) {
1401     if (zz != yy) {
1402       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1403     }
1404     mbs  = a->compressedrow.nrows;
1405     ii   = a->compressedrow.i;
1406     ridx = a->compressedrow.rindex;
1407   } else {
1408     ii = a->i;
1409     y  = yarray;
1410     z  = zarray;
1411   }
1412 
1413   for (i=0; i<mbs; i++) {
1414     n = ii[1] - ii[0]; ii++;
1415     if (usecprow) {
1416       z = zarray + 3*ridx[i];
1417       y = yarray + 3*ridx[i];
1418     }
1419     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1420     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1421     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1422     for (j=0; j<n; j++) {
1423       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1424       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1425       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1426       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1427       v    += 9;
1428     }
1429     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1430     if (!usecprow) {
1431       z += 3; y += 3;
1432     }
1433   }
1434   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1435   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1436   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1437   PetscFunctionReturn(0);
1438 }
1439 
1440 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1441 {
1442   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1443   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1444   const PetscScalar *x,*xb;
1445   const MatScalar   *v;
1446   PetscErrorCode    ierr;
1447   PetscInt          mbs = a->mbs,i,j,n;
1448   const PetscInt    *idx,*ii,*ridx=NULL;
1449   PetscBool         usecprow=a->compressedrow.use;
1450 
1451   PetscFunctionBegin;
1452   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1453   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1454 
1455   idx = a->j;
1456   v   = a->a;
1457   if (usecprow) {
1458     if (zz != yy) {
1459       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1460     }
1461     mbs  = a->compressedrow.nrows;
1462     ii   = a->compressedrow.i;
1463     ridx = a->compressedrow.rindex;
1464   } else {
1465     ii = a->i;
1466     y  = yarray;
1467     z  = zarray;
1468   }
1469 
1470   for (i=0; i<mbs; i++) {
1471     n = ii[1] - ii[0]; ii++;
1472     if (usecprow) {
1473       z = zarray + 4*ridx[i];
1474       y = yarray + 4*ridx[i];
1475     }
1476     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1477     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1478     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1479     for (j=0; j<n; j++) {
1480       xb    = x + 4*(*idx++);
1481       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1482       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1483       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1484       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1485       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1486       v    += 16;
1487     }
1488     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1489     if (!usecprow) {
1490       z += 4; y += 4;
1491     }
1492   }
1493   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1494   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1495   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1496   PetscFunctionReturn(0);
1497 }
1498 
1499 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1500 {
1501   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1502   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1503   const PetscScalar *x,*xb;
1504   PetscScalar       *yarray,*zarray;
1505   const MatScalar   *v;
1506   PetscErrorCode    ierr;
1507   PetscInt          mbs = a->mbs,i,j,n;
1508   const PetscInt    *idx,*ii,*ridx = NULL;
1509   PetscBool         usecprow=a->compressedrow.use;
1510 
1511   PetscFunctionBegin;
1512   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1513   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1514 
1515   idx = a->j;
1516   v   = a->a;
1517   if (usecprow) {
1518     if (zz != yy) {
1519       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1520     }
1521     mbs  = a->compressedrow.nrows;
1522     ii   = a->compressedrow.i;
1523     ridx = a->compressedrow.rindex;
1524   } else {
1525     ii = a->i;
1526     y  = yarray;
1527     z  = zarray;
1528   }
1529 
1530   for (i=0; i<mbs; i++) {
1531     n = ii[1] - ii[0]; ii++;
1532     if (usecprow) {
1533       z = zarray + 5*ridx[i];
1534       y = yarray + 5*ridx[i];
1535     }
1536     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1537     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1538     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1539     for (j=0; j<n; j++) {
1540       xb    = x + 5*(*idx++);
1541       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1542       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1543       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1544       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1545       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1546       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1547       v    += 25;
1548     }
1549     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1550     if (!usecprow) {
1551       z += 5; y += 5;
1552     }
1553   }
1554   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1555   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1556   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1557   PetscFunctionReturn(0);
1558 }
1559 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1560 {
1561   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1562   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1563   const PetscScalar *x,*xb;
1564   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1565   const MatScalar   *v;
1566   PetscErrorCode    ierr;
1567   PetscInt          mbs = a->mbs,i,j,n;
1568   const PetscInt    *idx,*ii,*ridx=NULL;
1569   PetscBool         usecprow=a->compressedrow.use;
1570 
1571   PetscFunctionBegin;
1572   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1573   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1574 
1575   idx = a->j;
1576   v   = a->a;
1577   if (usecprow) {
1578     if (zz != yy) {
1579       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1580     }
1581     mbs  = a->compressedrow.nrows;
1582     ii   = a->compressedrow.i;
1583     ridx = a->compressedrow.rindex;
1584   } else {
1585     ii = a->i;
1586     y  = yarray;
1587     z  = zarray;
1588   }
1589 
1590   for (i=0; i<mbs; i++) {
1591     n = ii[1] - ii[0]; ii++;
1592     if (usecprow) {
1593       z = zarray + 6*ridx[i];
1594       y = yarray + 6*ridx[i];
1595     }
1596     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1597     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1598     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1599     for (j=0; j<n; j++) {
1600       xb    = x + 6*(*idx++);
1601       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1602       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1603       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1604       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1605       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1606       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1607       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1608       v    += 36;
1609     }
1610     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1611     if (!usecprow) {
1612       z += 6; y += 6;
1613     }
1614   }
1615   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1616   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1617   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1618   PetscFunctionReturn(0);
1619 }
1620 
1621 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1622 {
1623   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1624   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1625   const PetscScalar *x,*xb;
1626   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1627   const MatScalar   *v;
1628   PetscErrorCode    ierr;
1629   PetscInt          mbs = a->mbs,i,j,n;
1630   const PetscInt    *idx,*ii,*ridx = NULL;
1631   PetscBool         usecprow=a->compressedrow.use;
1632 
1633   PetscFunctionBegin;
1634   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1635   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1636 
1637   idx = a->j;
1638   v   = a->a;
1639   if (usecprow) {
1640     if (zz != yy) {
1641       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1642     }
1643     mbs  = a->compressedrow.nrows;
1644     ii   = a->compressedrow.i;
1645     ridx = a->compressedrow.rindex;
1646   } else {
1647     ii = a->i;
1648     y  = yarray;
1649     z  = zarray;
1650   }
1651 
1652   for (i=0; i<mbs; i++) {
1653     n = ii[1] - ii[0]; ii++;
1654     if (usecprow) {
1655       z = zarray + 7*ridx[i];
1656       y = yarray + 7*ridx[i];
1657     }
1658     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1659     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1660     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1661     for (j=0; j<n; j++) {
1662       xb    = x + 7*(*idx++);
1663       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1664       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1665       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1666       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1667       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1668       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1669       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1670       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1671       v    += 49;
1672     }
1673     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1674     if (!usecprow) {
1675       z += 7; y += 7;
1676     }
1677   }
1678   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1679   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1680   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1681   PetscFunctionReturn(0);
1682 }
1683 
1684 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
1685 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1686 {
1687   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1688   PetscScalar    *z = 0,*work,*workt,*zarray;
1689   const PetscScalar *x,*xb;
1690   const MatScalar   *v;
1691   PetscErrorCode ierr;
1692   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1693   PetscInt       k;
1694   PetscBool      usecprow=a->compressedrow.use;
1695   const PetscInt *idx,*ii,*ridx=NULL;
1696 
1697   __m256d a0,a1,a2,a3,a4,a5;
1698   __m256d w0,w1,w2,w3;
1699   __m256d z0,z1,z2;
1700   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
1701 
1702   PetscFunctionBegin;
1703   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1704   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1705   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1706 
1707   idx = a->j;
1708   v   = a->a;
1709   if (usecprow) {
1710     mbs  = a->compressedrow.nrows;
1711     ii   = a->compressedrow.i;
1712     ridx = a->compressedrow.rindex;
1713   } else {
1714     mbs = a->mbs;
1715     ii  = a->i;
1716     z   = zarray;
1717   }
1718 
1719   if (!a->mult_work) {
1720     k    = PetscMax(A->rmap->n,A->cmap->n);
1721     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1722   }
1723 
1724   work = a->mult_work;
1725   for (i=0; i<mbs; i++) {
1726     n           = ii[1] - ii[0]; ii++;
1727     workt       = work;
1728     for (j=0; j<n; j++) {
1729       xb = x + bs*(*idx++);
1730       for (k=0; k<bs; k++) workt[k] = xb[k];
1731       workt += bs;
1732     }
1733     if (usecprow) z = zarray + bs*ridx[i];
1734 
1735     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
1736 
1737     for (j=0; j<n; j++) {
1738       // first column of a
1739       w0 = _mm256_set1_pd(work[j*9  ]);
1740       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1741       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1742       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
1743 
1744       // second column of a
1745       w1 = _mm256_set1_pd(work[j*9+ 1]);
1746       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1747       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1748       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1749 
1750       // third column of a
1751       w2 = _mm256_set1_pd(work[j*9+ 2]);
1752       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1753       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1754       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
1755 
1756       // fourth column of a
1757       w3 = _mm256_set1_pd(work[j*9+ 3]);
1758       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1759       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1760       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
1761 
1762       // fifth column of a
1763       w0 = _mm256_set1_pd(work[j*9+ 4]);
1764       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1765       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1766       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
1767 
1768       // sixth column of a
1769       w1 = _mm256_set1_pd(work[j*9+ 5]);
1770       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1771       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1772       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1773 
1774       // seventh column of a
1775       w2 = _mm256_set1_pd(work[j*9+ 6]);
1776       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1777       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1778       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
1779 
1780       // eigth column of a
1781       w3 = _mm256_set1_pd(work[j*9+ 7]);
1782       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1783       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1784       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
1785 
1786       // ninth column of a
1787       w0 = _mm256_set1_pd(work[j*9+ 8]);
1788       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1789       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1790       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1791     }
1792 
1793     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
1794 
1795     v += n*bs2;
1796     if (!usecprow) z += bs;
1797   }
1798   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1799   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1800   ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr);
1801   PetscFunctionReturn(0);
1802 }
1803 #endif
1804 
1805 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1806 {
1807   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1808   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1809   const PetscScalar *x,*xb;
1810   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1811   const MatScalar   *v;
1812   PetscErrorCode    ierr;
1813   PetscInt          mbs = a->mbs,i,j,n;
1814   const PetscInt    *idx,*ii,*ridx = NULL;
1815   PetscBool         usecprow=a->compressedrow.use;
1816 
1817   PetscFunctionBegin;
1818   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1819   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1820 
1821   idx = a->j;
1822   v   = a->a;
1823   if (usecprow) {
1824     if (zz != yy) {
1825       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1826     }
1827     mbs  = a->compressedrow.nrows;
1828     ii   = a->compressedrow.i;
1829     ridx = a->compressedrow.rindex;
1830   } else {
1831     ii = a->i;
1832     y  = yarray;
1833     z  = zarray;
1834   }
1835 
1836   for (i=0; i<mbs; i++) {
1837     n = ii[1] - ii[0]; ii++;
1838     if (usecprow) {
1839       z = zarray + 11*ridx[i];
1840       y = yarray + 11*ridx[i];
1841     }
1842     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1843     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1844     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1845     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1846     for (j=0; j<n; j++) {
1847       xb    = x + 11*(*idx++);
1848       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1849       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
1850       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
1851       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
1852       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
1853       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
1854       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
1855       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
1856       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
1857       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
1858       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
1859       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
1860       v    += 121;
1861     }
1862     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1863     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1864     if (!usecprow) {
1865       z += 11; y += 11;
1866     }
1867   }
1868   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1869   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1870   ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr);
1871   PetscFunctionReturn(0);
1872 }
1873 
1874 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1875 {
1876   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1877   PetscScalar       *z = 0,*work,*workt,*zarray;
1878   const PetscScalar *x,*xb;
1879   const MatScalar   *v;
1880   PetscErrorCode    ierr;
1881   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1882   PetscInt          ncols,k;
1883   const PetscInt    *ridx = NULL,*idx,*ii;
1884   PetscBool         usecprow = a->compressedrow.use;
1885 
1886   PetscFunctionBegin;
1887   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1888   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1889   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1890 
1891   idx = a->j;
1892   v   = a->a;
1893   if (usecprow) {
1894     mbs  = a->compressedrow.nrows;
1895     ii   = a->compressedrow.i;
1896     ridx = a->compressedrow.rindex;
1897   } else {
1898     mbs = a->mbs;
1899     ii  = a->i;
1900     z   = zarray;
1901   }
1902 
1903   if (!a->mult_work) {
1904     k    = PetscMax(A->rmap->n,A->cmap->n);
1905     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1906   }
1907   work = a->mult_work;
1908   for (i=0; i<mbs; i++) {
1909     n     = ii[1] - ii[0]; ii++;
1910     ncols = n*bs;
1911     workt = work;
1912     for (j=0; j<n; j++) {
1913       xb = x + bs*(*idx++);
1914       for (k=0; k<bs; k++) workt[k] = xb[k];
1915       workt += bs;
1916     }
1917     if (usecprow) z = zarray + bs*ridx[i];
1918     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1919     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1920     v += n*bs2;
1921     if (!usecprow) z += bs;
1922   }
1923   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1924   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1925   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1926   PetscFunctionReturn(0);
1927 }
1928 
1929 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1930 {
1931   PetscScalar    zero = 0.0;
1932   PetscErrorCode ierr;
1933 
1934   PetscFunctionBegin;
1935   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1936   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1937   PetscFunctionReturn(0);
1938 }
1939 
1940 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1941 {
1942   PetscScalar    zero = 0.0;
1943   PetscErrorCode ierr;
1944 
1945   PetscFunctionBegin;
1946   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1947   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1948   PetscFunctionReturn(0);
1949 }
1950 
1951 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1952 {
1953   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1954   PetscScalar       *z,x1,x2,x3,x4,x5;
1955   const PetscScalar *x,*xb = NULL;
1956   const MatScalar   *v;
1957   PetscErrorCode    ierr;
1958   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1959   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1960   Mat_CompressedRow cprow = a->compressedrow;
1961   PetscBool         usecprow = cprow.use;
1962 
1963   PetscFunctionBegin;
1964   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1965   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1966   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1967 
1968   idx = a->j;
1969   v   = a->a;
1970   if (usecprow) {
1971     mbs  = cprow.nrows;
1972     ii   = cprow.i;
1973     ridx = cprow.rindex;
1974   } else {
1975     mbs=a->mbs;
1976     ii = a->i;
1977     xb = x;
1978   }
1979 
1980   switch (bs) {
1981   case 1:
1982     for (i=0; i<mbs; i++) {
1983       if (usecprow) xb = x + ridx[i];
1984       x1 = xb[0];
1985       ib = idx + ii[0];
1986       n  = ii[1] - ii[0]; ii++;
1987       for (j=0; j<n; j++) {
1988         rval     = ib[j];
1989         z[rval] += PetscConj(*v) * x1;
1990         v++;
1991       }
1992       if (!usecprow) xb++;
1993     }
1994     break;
1995   case 2:
1996     for (i=0; i<mbs; i++) {
1997       if (usecprow) xb = x + 2*ridx[i];
1998       x1 = xb[0]; x2 = xb[1];
1999       ib = idx + ii[0];
2000       n  = ii[1] - ii[0]; ii++;
2001       for (j=0; j<n; j++) {
2002         rval       = ib[j]*2;
2003         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2004         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2005         v         += 4;
2006       }
2007       if (!usecprow) xb += 2;
2008     }
2009     break;
2010   case 3:
2011     for (i=0; i<mbs; i++) {
2012       if (usecprow) xb = x + 3*ridx[i];
2013       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2014       ib = idx + ii[0];
2015       n  = ii[1] - ii[0]; ii++;
2016       for (j=0; j<n; j++) {
2017         rval       = ib[j]*3;
2018         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2019         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2020         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2021         v         += 9;
2022       }
2023       if (!usecprow) xb += 3;
2024     }
2025     break;
2026   case 4:
2027     for (i=0; i<mbs; i++) {
2028       if (usecprow) xb = x + 4*ridx[i];
2029       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2030       ib = idx + ii[0];
2031       n  = ii[1] - ii[0]; ii++;
2032       for (j=0; j<n; j++) {
2033         rval       = ib[j]*4;
2034         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2035         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2036         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2037         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2038         v         += 16;
2039       }
2040       if (!usecprow) xb += 4;
2041     }
2042     break;
2043   case 5:
2044     for (i=0; i<mbs; i++) {
2045       if (usecprow) xb = x + 5*ridx[i];
2046       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2047       x4 = xb[3]; x5 = xb[4];
2048       ib = idx + ii[0];
2049       n  = ii[1] - ii[0]; ii++;
2050       for (j=0; j<n; j++) {
2051         rval       = ib[j]*5;
2052         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2053         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2054         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2055         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2056         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2057         v         += 25;
2058       }
2059       if (!usecprow) xb += 5;
2060     }
2061     break;
2062   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2063     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2064 #if 0
2065     {
2066       PetscInt          ncols,k,bs2=a->bs2;
2067       PetscScalar       *work,*workt,zb;
2068       const PetscScalar *xtmp;
2069       if (!a->mult_work) {
2070         k    = PetscMax(A->rmap->n,A->cmap->n);
2071         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2072       }
2073       work = a->mult_work;
2074       xtmp = x;
2075       for (i=0; i<mbs; i++) {
2076         n     = ii[1] - ii[0]; ii++;
2077         ncols = n*bs;
2078         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2079         if (usecprow) xtmp = x + bs*ridx[i];
2080         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2081         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2082         v += n*bs2;
2083         if (!usecprow) xtmp += bs;
2084         workt = work;
2085         for (j=0; j<n; j++) {
2086           zb = z + bs*(*idx++);
2087           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2088           workt += bs;
2089         }
2090       }
2091     }
2092 #endif
2093   }
2094   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2095   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2096   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2097   PetscFunctionReturn(0);
2098 }
2099 
2100 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2101 {
2102   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2103   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2104   const PetscScalar *x,*xb = 0;
2105   const MatScalar   *v;
2106   PetscErrorCode    ierr;
2107   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2108   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2109   Mat_CompressedRow cprow   = a->compressedrow;
2110   PetscBool         usecprow=cprow.use;
2111 
2112   PetscFunctionBegin;
2113   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2114   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2115   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2116 
2117   idx = a->j;
2118   v   = a->a;
2119   if (usecprow) {
2120     mbs  = cprow.nrows;
2121     ii   = cprow.i;
2122     ridx = cprow.rindex;
2123   } else {
2124     mbs=a->mbs;
2125     ii = a->i;
2126     xb = x;
2127   }
2128 
2129   switch (bs) {
2130   case 1:
2131     for (i=0; i<mbs; i++) {
2132       if (usecprow) xb = x + ridx[i];
2133       x1 = xb[0];
2134       ib = idx + ii[0];
2135       n  = ii[1] - ii[0]; ii++;
2136       for (j=0; j<n; j++) {
2137         rval     = ib[j];
2138         z[rval] += *v * x1;
2139         v++;
2140       }
2141       if (!usecprow) xb++;
2142     }
2143     break;
2144   case 2:
2145     for (i=0; i<mbs; i++) {
2146       if (usecprow) xb = x + 2*ridx[i];
2147       x1 = xb[0]; x2 = xb[1];
2148       ib = idx + ii[0];
2149       n  = ii[1] - ii[0]; ii++;
2150       for (j=0; j<n; j++) {
2151         rval       = ib[j]*2;
2152         z[rval++] += v[0]*x1 + v[1]*x2;
2153         z[rval++] += v[2]*x1 + v[3]*x2;
2154         v         += 4;
2155       }
2156       if (!usecprow) xb += 2;
2157     }
2158     break;
2159   case 3:
2160     for (i=0; i<mbs; i++) {
2161       if (usecprow) xb = x + 3*ridx[i];
2162       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2163       ib = idx + ii[0];
2164       n  = ii[1] - ii[0]; ii++;
2165       for (j=0; j<n; j++) {
2166         rval       = ib[j]*3;
2167         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2168         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2169         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2170         v         += 9;
2171       }
2172       if (!usecprow) xb += 3;
2173     }
2174     break;
2175   case 4:
2176     for (i=0; i<mbs; i++) {
2177       if (usecprow) xb = x + 4*ridx[i];
2178       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2179       ib = idx + ii[0];
2180       n  = ii[1] - ii[0]; ii++;
2181       for (j=0; j<n; j++) {
2182         rval       = ib[j]*4;
2183         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2184         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2185         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2186         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2187         v         += 16;
2188       }
2189       if (!usecprow) xb += 4;
2190     }
2191     break;
2192   case 5:
2193     for (i=0; i<mbs; i++) {
2194       if (usecprow) xb = x + 5*ridx[i];
2195       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2196       x4 = xb[3]; x5 = xb[4];
2197       ib = idx + ii[0];
2198       n  = ii[1] - ii[0]; ii++;
2199       for (j=0; j<n; j++) {
2200         rval       = ib[j]*5;
2201         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2202         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2203         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2204         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2205         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2206         v         += 25;
2207       }
2208       if (!usecprow) xb += 5;
2209     }
2210     break;
2211   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2212     PetscInt          ncols,k;
2213     PetscScalar       *work,*workt;
2214     const PetscScalar *xtmp;
2215     if (!a->mult_work) {
2216       k    = PetscMax(A->rmap->n,A->cmap->n);
2217       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2218     }
2219     work = a->mult_work;
2220     xtmp = x;
2221     for (i=0; i<mbs; i++) {
2222       n     = ii[1] - ii[0]; ii++;
2223       ncols = n*bs;
2224       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2225       if (usecprow) xtmp = x + bs*ridx[i];
2226       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2227       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2228       v += n*bs2;
2229       if (!usecprow) xtmp += bs;
2230       workt = work;
2231       for (j=0; j<n; j++) {
2232         zb = z + bs*(*idx++);
2233         for (k=0; k<bs; k++) zb[k] += workt[k];
2234         workt += bs;
2235       }
2236     }
2237     }
2238   }
2239   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2240   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2241   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2242   PetscFunctionReturn(0);
2243 }
2244 
2245 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2246 {
2247   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2248   PetscInt       totalnz = a->bs2*a->nz;
2249   PetscScalar    oalpha  = alpha;
2250   PetscErrorCode ierr;
2251   PetscBLASInt   one = 1,tnz;
2252 
2253   PetscFunctionBegin;
2254   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2255   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2256   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2257   PetscFunctionReturn(0);
2258 }
2259 
2260 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2261 {
2262   PetscErrorCode ierr;
2263   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2264   MatScalar      *v  = a->a;
2265   PetscReal      sum = 0.0;
2266   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2267 
2268   PetscFunctionBegin;
2269   if (type == NORM_FROBENIUS) {
2270 #if defined(PETSC_USE_REAL___FP16)
2271     PetscBLASInt one = 1,cnt = bs2*nz;
2272     *norm = BLASnrm2_(&cnt,v,&one);
2273 #else
2274     for (i=0; i<bs2*nz; i++) {
2275       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2276     }
2277 #endif
2278     *norm = PetscSqrtReal(sum);
2279     ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr);
2280   } else if (type == NORM_1) { /* maximum column sum */
2281     PetscReal *tmp;
2282     PetscInt  *bcol = a->j;
2283     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2284     for (i=0; i<nz; i++) {
2285       for (j=0; j<bs; j++) {
2286         k1 = bs*(*bcol) + j; /* column index */
2287         for (k=0; k<bs; k++) {
2288           tmp[k1] += PetscAbsScalar(*v); v++;
2289         }
2290       }
2291       bcol++;
2292     }
2293     *norm = 0.0;
2294     for (j=0; j<A->cmap->n; j++) {
2295       if (tmp[j] > *norm) *norm = tmp[j];
2296     }
2297     ierr = PetscFree(tmp);CHKERRQ(ierr);
2298     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2299   } else if (type == NORM_INFINITY) { /* maximum row sum */
2300     *norm = 0.0;
2301     for (k=0; k<bs; k++) {
2302       for (j=0; j<a->mbs; j++) {
2303         v   = a->a + bs2*a->i[j] + k;
2304         sum = 0.0;
2305         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2306           for (k1=0; k1<bs; k1++) {
2307             sum += PetscAbsScalar(*v);
2308             v   += bs;
2309           }
2310         }
2311         if (sum > *norm) *norm = sum;
2312       }
2313     }
2314     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2315   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2316   PetscFunctionReturn(0);
2317 }
2318 
2319 
2320 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2321 {
2322   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2323   PetscErrorCode ierr;
2324 
2325   PetscFunctionBegin;
2326   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2327   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2328     *flg = PETSC_FALSE;
2329     PetscFunctionReturn(0);
2330   }
2331 
2332   /* if the a->i are the same */
2333   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2334   if (!*flg) PetscFunctionReturn(0);
2335 
2336   /* if a->j are the same */
2337   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2338   if (!*flg) PetscFunctionReturn(0);
2339 
2340   /* if a->a are the same */
2341   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2342   PetscFunctionReturn(0);
2343 
2344 }
2345 
2346 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2347 {
2348   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2349   PetscErrorCode ierr;
2350   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2351   PetscScalar    *x,zero = 0.0;
2352   MatScalar      *aa,*aa_j;
2353 
2354   PetscFunctionBegin;
2355   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2356   bs   = A->rmap->bs;
2357   aa   = a->a;
2358   ai   = a->i;
2359   aj   = a->j;
2360   ambs = a->mbs;
2361   bs2  = a->bs2;
2362 
2363   ierr = VecSet(v,zero);CHKERRQ(ierr);
2364   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2365   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2366   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2367   for (i=0; i<ambs; i++) {
2368     for (j=ai[i]; j<ai[i+1]; j++) {
2369       if (aj[j] == i) {
2370         row  = i*bs;
2371         aa_j = aa+j*bs2;
2372         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2373         break;
2374       }
2375     }
2376   }
2377   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2378   PetscFunctionReturn(0);
2379 }
2380 
2381 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2382 {
2383   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2384   const PetscScalar *l,*r,*li,*ri;
2385   PetscScalar       x;
2386   MatScalar         *aa, *v;
2387   PetscErrorCode    ierr;
2388   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2389   const PetscInt    *ai,*aj;
2390 
2391   PetscFunctionBegin;
2392   ai  = a->i;
2393   aj  = a->j;
2394   aa  = a->a;
2395   m   = A->rmap->n;
2396   n   = A->cmap->n;
2397   bs  = A->rmap->bs;
2398   mbs = a->mbs;
2399   bs2 = a->bs2;
2400   if (ll) {
2401     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2402     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2403     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2404     for (i=0; i<mbs; i++) { /* for each block row */
2405       M  = ai[i+1] - ai[i];
2406       li = l + i*bs;
2407       v  = aa + bs2*ai[i];
2408       for (j=0; j<M; j++) { /* for each block */
2409         for (k=0; k<bs2; k++) {
2410           (*v++) *= li[k%bs];
2411         }
2412       }
2413     }
2414     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2415     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2416   }
2417 
2418   if (rr) {
2419     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2420     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2421     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2422     for (i=0; i<mbs; i++) { /* for each block row */
2423       iai = ai[i];
2424       M   = ai[i+1] - iai;
2425       v   = aa + bs2*iai;
2426       for (j=0; j<M; j++) { /* for each block */
2427         ri = r + bs*aj[iai+j];
2428         for (k=0; k<bs; k++) {
2429           x = ri[k];
2430           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2431           v += bs;
2432         }
2433       }
2434     }
2435     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2436     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2437   }
2438   PetscFunctionReturn(0);
2439 }
2440 
2441 
2442 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2443 {
2444   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2445 
2446   PetscFunctionBegin;
2447   info->block_size   = a->bs2;
2448   info->nz_allocated = a->bs2*a->maxnz;
2449   info->nz_used      = a->bs2*a->nz;
2450   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2451   info->assemblies   = A->num_ass;
2452   info->mallocs      = A->info.mallocs;
2453   info->memory       = ((PetscObject)A)->mem;
2454   if (A->factortype) {
2455     info->fill_ratio_given  = A->info.fill_ratio_given;
2456     info->fill_ratio_needed = A->info.fill_ratio_needed;
2457     info->factor_mallocs    = A->info.factor_mallocs;
2458   } else {
2459     info->fill_ratio_given  = 0;
2460     info->fill_ratio_needed = 0;
2461     info->factor_mallocs    = 0;
2462   }
2463   PetscFunctionReturn(0);
2464 }
2465 
2466 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2467 {
2468   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2469   PetscErrorCode ierr;
2470 
2471   PetscFunctionBegin;
2472   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2473   PetscFunctionReturn(0);
2474 }
2475