xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 5a856986583887c326abe5dfd149e8184a29cd80)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8 #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
16   const PetscInt *idx;
17   PetscInt       start,end,*ai,*aj,bs,*nidx2;
18   PetscBT        table;
19 
20   PetscFunctionBegin;
21   m  = a->mbs;
22   ai = a->i;
23   aj = a->j;
24   bs = A->rmap->bs;
25 
26   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
27 
28   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
29   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
30   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
31 
32   for (i=0; i<is_max; i++) {
33     /* Initialise the two local arrays */
34     isz  = 0;
35     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
36 
37     /* Extract the indices, assume there can be duplicate entries */
38     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
39     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
40 
41     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42     for (j=0; j<n; ++j) {
43       ival = idx[j]/bs; /* convert the indices into block indices */
44       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
45       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++) { /* for each overlap*/
52       n = isz;
53       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
54         row   = nidx[k];
55         start = ai[row];
56         end   = ai[row+1];
57         for (l = start; l<end; l++) {
58           val = aj[l];
59           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
60         }
61       }
62     }
63     /* expand the Index Set */
64     for (j=0; j<isz; j++) {
65       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
66     }
67     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
68   }
69   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
70   ierr = PetscFree(nidx);CHKERRQ(ierr);
71   ierr = PetscFree(nidx2);CHKERRQ(ierr);
72   PetscFunctionReturn(0);
73 }
74 
75 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76 {
77   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
78   PetscErrorCode ierr;
79   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
81   const PetscInt *irow,*icol;
82   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83   PetscInt       *aj = a->j,*ai = a->i;
84   MatScalar      *mat_a;
85   Mat            C;
86   PetscBool      flag;
87 
88   PetscFunctionBegin;
89   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
90   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
91   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
92   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
93 
94   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
95   ssmap = smap;
96   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
97   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
98   /* determine lens of each row */
99   for (i=0; i<nrows; i++) {
100     kstart  = ai[irow[i]];
101     kend    = kstart + a->ilen[irow[i]];
102     lens[i] = 0;
103     for (k=kstart; k<kend; k++) {
104       if (ssmap[aj[k]]) lens[i]++;
105     }
106   }
107   /* Create and fill new matrix */
108   if (scall == MAT_REUSE_MATRIX) {
109     c = (Mat_SeqBAIJ*)((*B)->data);
110 
111     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
112     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
113     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
114     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
115     C    = *B;
116   } else {
117     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
118     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
119     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
120     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
121   }
122   c = (Mat_SeqBAIJ*)(C->data);
123   for (i=0; i<nrows; i++) {
124     row      = irow[i];
125     kstart   = ai[row];
126     kend     = kstart + a->ilen[row];
127     mat_i    = c->i[i];
128     mat_j    = c->j + mat_i;
129     mat_a    = c->a + mat_i*bs2;
130     mat_ilen = c->ilen + i;
131     for (k=kstart; k<kend; k++) {
132       if ((tcol=ssmap[a->j[k]])) {
133         *mat_j++ = tcol - 1;
134         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
135         mat_a   += bs2;
136         (*mat_ilen)++;
137       }
138     }
139   }
140   /* sort */
141   {
142     MatScalar *work;
143     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
144     for (i=0; i<nrows; i++) {
145       PetscInt ilen;
146       mat_i = c->i[i];
147       mat_j = c->j + mat_i;
148       mat_a = c->a + mat_i*bs2;
149       ilen  = c->ilen[i];
150       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
151     }
152     ierr = PetscFree(work);CHKERRQ(ierr);
153   }
154 
155   /* Free work space */
156   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
157   ierr = PetscFree(smap);CHKERRQ(ierr);
158   ierr = PetscFree(lens);CHKERRQ(ierr);
159   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
160   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
161 
162   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
163   *B   = C;
164   PetscFunctionReturn(0);
165 }
166 
167 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
168 {
169   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
170   IS             is1,is2;
171   PetscErrorCode ierr;
172   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
173   const PetscInt *irow,*icol;
174 
175   PetscFunctionBegin;
176   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
177   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
178   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
179   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
180 
181   /* Verify if the indices corespond to each element in a block
182    and form the IS with compressed IS */
183   maxmnbs = PetscMax(a->mbs,a->nbs);
184   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
185   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
186   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
187   for (i=0; i<a->mbs; i++) {
188     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
189   }
190   count = 0;
191   for (i=0; i<nrows; i++) {
192     j = irow[i] / bs;
193     if ((vary[j]--)==bs) iary[count++] = j;
194   }
195   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
196 
197   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
198   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
199   for (i=0; i<a->nbs; i++) {
200     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
201   }
202   count = 0;
203   for (i=0; i<ncols; i++) {
204     j = icol[i] / bs;
205     if ((vary[j]--)==bs) iary[count++] = j;
206   }
207   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
208   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
209   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
210   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
211 
212   ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
213   ierr = ISDestroy(&is1);CHKERRQ(ierr);
214   ierr = ISDestroy(&is2);CHKERRQ(ierr);
215   PetscFunctionReturn(0);
216 }
217 
218 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
219 {
220   PetscErrorCode ierr;
221   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
222   Mat_SubSppt    *submatj = c->submatis1;
223 
224   PetscFunctionBegin;
225   ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
226   ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
227   PetscFunctionReturn(0);
228 }
229 
230 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
231 {
232   PetscErrorCode ierr;
233   PetscInt       i;
234   Mat            C;
235   Mat_SeqBAIJ    *c;
236   Mat_SubSppt    *submatj;
237 
238   PetscFunctionBegin;
239   for (i=0; i<n; i++) {
240     C       = (*mat)[i];
241     c       = (Mat_SeqBAIJ*)C->data;
242     submatj = c->submatis1;
243     if (submatj) {
244       ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
245       ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
246       ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
247       ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr);
248       ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr);
249       ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr);
250     } else {
251       ierr = MatDestroy(&C);CHKERRQ(ierr);
252     }
253   }
254   ierr = PetscFree(*mat);CHKERRQ(ierr);
255   PetscFunctionReturn(0);
256 }
257 
258 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
259 {
260   PetscErrorCode ierr;
261   PetscInt       i;
262 
263   PetscFunctionBegin;
264   if (scall == MAT_INITIAL_MATRIX) {
265     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
266   }
267 
268   for (i=0; i<n; i++) {
269     ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
270   }
271   PetscFunctionReturn(0);
272 }
273 
274 /* -------------------------------------------------------*/
275 /* Should check that shapes of vectors and matrices match */
276 /* -------------------------------------------------------*/
277 
278 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
279 {
280   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
281   PetscScalar       *z,sum;
282   const PetscScalar *x;
283   const MatScalar   *v;
284   PetscErrorCode    ierr;
285   PetscInt          mbs,i,n;
286   const PetscInt    *idx,*ii,*ridx=NULL;
287   PetscBool         usecprow=a->compressedrow.use;
288 
289   PetscFunctionBegin;
290   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
291   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
292 
293   if (usecprow) {
294     mbs  = a->compressedrow.nrows;
295     ii   = a->compressedrow.i;
296     ridx = a->compressedrow.rindex;
297     ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr);
298   } else {
299     mbs = a->mbs;
300     ii  = a->i;
301   }
302 
303   for (i=0; i<mbs; i++) {
304     n   = ii[1] - ii[0];
305     v   = a->a + ii[0];
306     idx = a->j + ii[0];
307     ii++;
308     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
309     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
310     sum = 0.0;
311     PetscSparseDensePlusDot(sum,x,v,idx,n);
312     if (usecprow) {
313       z[ridx[i]] = sum;
314     } else {
315       z[i]       = sum;
316     }
317   }
318   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
319   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
320   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
321   PetscFunctionReturn(0);
322 }
323 
324 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
325 {
326   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
327   PetscScalar       *z = 0,sum1,sum2,*zarray;
328   const PetscScalar *x,*xb;
329   PetscScalar       x1,x2;
330   const MatScalar   *v;
331   PetscErrorCode    ierr;
332   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
333   PetscBool         usecprow=a->compressedrow.use;
334 
335   PetscFunctionBegin;
336   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
337   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
338 
339   idx = a->j;
340   v   = a->a;
341   if (usecprow) {
342     mbs  = a->compressedrow.nrows;
343     ii   = a->compressedrow.i;
344     ridx = a->compressedrow.rindex;
345     ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr);
346   } else {
347     mbs = a->mbs;
348     ii  = a->i;
349     z   = zarray;
350   }
351 
352   for (i=0; i<mbs; i++) {
353     n           = ii[1] - ii[0]; ii++;
354     sum1        = 0.0; sum2 = 0.0;
355     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
356     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
357     for (j=0; j<n; j++) {
358       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
359       sum1 += v[0]*x1 + v[2]*x2;
360       sum2 += v[1]*x1 + v[3]*x2;
361       v    += 4;
362     }
363     if (usecprow) z = zarray + 2*ridx[i];
364     z[0] = sum1; z[1] = sum2;
365     if (!usecprow) z += 2;
366   }
367   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
368   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
369   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
370   PetscFunctionReturn(0);
371 }
372 
373 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
374 {
375   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
376   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
377   const PetscScalar *x,*xb;
378   const MatScalar   *v;
379   PetscErrorCode    ierr;
380   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
381   PetscBool         usecprow=a->compressedrow.use;
382 
383 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
384 #pragma disjoint(*v,*z,*xb)
385 #endif
386 
387   PetscFunctionBegin;
388   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
389   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
390 
391   idx = a->j;
392   v   = a->a;
393   if (usecprow) {
394     mbs  = a->compressedrow.nrows;
395     ii   = a->compressedrow.i;
396     ridx = a->compressedrow.rindex;
397     ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr);
398   } else {
399     mbs = a->mbs;
400     ii  = a->i;
401     z   = zarray;
402   }
403 
404   for (i=0; i<mbs; i++) {
405     n           = ii[1] - ii[0]; ii++;
406     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
407     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
408     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
409     for (j=0; j<n; j++) {
410       xb = x + 3*(*idx++);
411       x1 = xb[0];
412       x2 = xb[1];
413       x3 = xb[2];
414 
415       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
416       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
417       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
418       v    += 9;
419     }
420     if (usecprow) z = zarray + 3*ridx[i];
421     z[0] = sum1; z[1] = sum2; z[2] = sum3;
422     if (!usecprow) z += 3;
423   }
424   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
425   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
426   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
427   PetscFunctionReturn(0);
428 }
429 
430 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
431 {
432   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
433   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
434   const PetscScalar *x,*xb;
435   const MatScalar   *v;
436   PetscErrorCode    ierr;
437   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
438   PetscBool         usecprow=a->compressedrow.use;
439 
440   PetscFunctionBegin;
441   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
442   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
443 
444   idx = a->j;
445   v   = a->a;
446   if (usecprow) {
447     mbs  = a->compressedrow.nrows;
448     ii   = a->compressedrow.i;
449     ridx = a->compressedrow.rindex;
450     ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr);
451   } else {
452     mbs = a->mbs;
453     ii  = a->i;
454     z   = zarray;
455   }
456 
457   for (i=0; i<mbs; i++) {
458     n = ii[1] - ii[0];
459     ii++;
460     sum1 = 0.0;
461     sum2 = 0.0;
462     sum3 = 0.0;
463     sum4 = 0.0;
464 
465     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
466     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
467     for (j=0; j<n; j++) {
468       xb    = x + 4*(*idx++);
469       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
470       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
471       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
472       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
473       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
474       v    += 16;
475     }
476     if (usecprow) z = zarray + 4*ridx[i];
477     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
478     if (!usecprow) z += 4;
479   }
480   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
481   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
482   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
483   PetscFunctionReturn(0);
484 }
485 
486 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
487 {
488   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
489   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
490   const PetscScalar *xb,*x;
491   const MatScalar   *v;
492   PetscErrorCode    ierr;
493   const PetscInt    *idx,*ii,*ridx=NULL;
494   PetscInt          mbs,i,j,n;
495   PetscBool         usecprow=a->compressedrow.use;
496 
497   PetscFunctionBegin;
498   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
499   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
500 
501   idx = a->j;
502   v   = a->a;
503   if (usecprow) {
504     mbs  = a->compressedrow.nrows;
505     ii   = a->compressedrow.i;
506     ridx = a->compressedrow.rindex;
507     ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr);
508   } else {
509     mbs = a->mbs;
510     ii  = a->i;
511     z   = zarray;
512   }
513 
514   for (i=0; i<mbs; i++) {
515     n           = ii[1] - ii[0]; ii++;
516     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
517     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
518     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
519     for (j=0; j<n; j++) {
520       xb    = x + 5*(*idx++);
521       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
522       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
523       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
524       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
525       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
526       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
527       v    += 25;
528     }
529     if (usecprow) z = zarray + 5*ridx[i];
530     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
531     if (!usecprow) z += 5;
532   }
533   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
534   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
535   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
536   PetscFunctionReturn(0);
537 }
538 
539 
540 
541 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
542 {
543   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
544   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
545   const PetscScalar *x,*xb;
546   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
547   const MatScalar   *v;
548   PetscErrorCode    ierr;
549   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
550   PetscBool         usecprow=a->compressedrow.use;
551 
552   PetscFunctionBegin;
553   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
554   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
555 
556   idx = a->j;
557   v   = a->a;
558   if (usecprow) {
559     mbs  = a->compressedrow.nrows;
560     ii   = a->compressedrow.i;
561     ridx = a->compressedrow.rindex;
562     ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr);
563   } else {
564     mbs = a->mbs;
565     ii  = a->i;
566     z   = zarray;
567   }
568 
569   for (i=0; i<mbs; i++) {
570     n  = ii[1] - ii[0];
571     ii++;
572     sum1 = 0.0;
573     sum2 = 0.0;
574     sum3 = 0.0;
575     sum4 = 0.0;
576     sum5 = 0.0;
577     sum6 = 0.0;
578 
579     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
580     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
581     for (j=0; j<n; j++) {
582       xb    = x + 6*(*idx++);
583       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
584       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
585       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
586       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
587       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
588       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
589       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
590       v    += 36;
591     }
592     if (usecprow) z = zarray + 6*ridx[i];
593     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
594     if (!usecprow) z += 6;
595   }
596 
597   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
598   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
599   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
600   PetscFunctionReturn(0);
601 }
602 
603 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
604 {
605   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
606   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
607   const PetscScalar *x,*xb;
608   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
609   const MatScalar   *v;
610   PetscErrorCode    ierr;
611   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
612   PetscBool         usecprow=a->compressedrow.use;
613 
614   PetscFunctionBegin;
615   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
616   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
617 
618   idx = a->j;
619   v   = a->a;
620   if (usecprow) {
621     mbs  = a->compressedrow.nrows;
622     ii   = a->compressedrow.i;
623     ridx = a->compressedrow.rindex;
624     ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr);
625   } else {
626     mbs = a->mbs;
627     ii  = a->i;
628     z   = zarray;
629   }
630 
631   for (i=0; i<mbs; i++) {
632     n  = ii[1] - ii[0];
633     ii++;
634     sum1 = 0.0;
635     sum2 = 0.0;
636     sum3 = 0.0;
637     sum4 = 0.0;
638     sum5 = 0.0;
639     sum6 = 0.0;
640     sum7 = 0.0;
641 
642     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
643     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
644     for (j=0; j<n; j++) {
645       xb    = x + 7*(*idx++);
646       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
647       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
648       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
649       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
650       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
651       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
652       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
653       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
654       v    += 49;
655     }
656     if (usecprow) z = zarray + 7*ridx[i];
657     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
658     if (!usecprow) z += 7;
659   }
660 
661   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
662   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
663   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
664   PetscFunctionReturn(0);
665 }
666 
667 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
668 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
669 {
670   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
671   PetscScalar    *z = 0,*work,*workt,*zarray;
672   const PetscScalar *x,*xb;
673   const MatScalar   *v;
674   PetscErrorCode ierr;
675   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
676   const PetscInt *idx,*ii,*ridx=NULL;
677   PetscInt       k;
678   PetscBool      usecprow=a->compressedrow.use;
679 
680   __m256d a0,a1,a2,a3,a4,a5;
681   __m256d w0,w1,w2,w3;
682   __m256d z0,z1,z2;
683   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
684 
685   PetscFunctionBegin;
686   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
687   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
688 
689   idx = a->j;
690   v   = a->a;
691   if (usecprow) {
692     mbs  = a->compressedrow.nrows;
693     ii   = a->compressedrow.i;
694     ridx = a->compressedrow.rindex;
695     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
696   } else {
697     mbs = a->mbs;
698     ii  = a->i;
699     z   = zarray;
700   }
701 
702   if (!a->mult_work) {
703     k    = PetscMax(A->rmap->n,A->cmap->n);
704     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
705   }
706 
707   work = a->mult_work;
708   for (i=0; i<mbs; i++) {
709     n           = ii[1] - ii[0]; ii++;
710     workt       = work;
711     for (j=0; j<n; j++) {
712       xb = x + bs*(*idx++);
713       for (k=0; k<bs; k++) workt[k] = xb[k];
714       workt += bs;
715     }
716     if (usecprow) z = zarray + bs*ridx[i];
717 
718     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
719 
720     for (j=0; j<n; j++) {
721       // first column of a
722       w0 = _mm256_set1_pd(work[j*9  ]);
723       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
724       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
725       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
726 
727       // second column of a
728       w1 = _mm256_set1_pd(work[j*9+ 1]);
729       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
730       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
731       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
732 
733       // third column of a
734       w2 = _mm256_set1_pd(work[j*9 +2]);
735       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
736       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
737       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
738 
739       // fourth column of a
740       w3 = _mm256_set1_pd(work[j*9+ 3]);
741       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
742       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
743       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
744 
745       // fifth column of a
746       w0 = _mm256_set1_pd(work[j*9+ 4]);
747       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
748       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
749       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
750 
751       // sixth column of a
752       w1 = _mm256_set1_pd(work[j*9+ 5]);
753       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
754       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
755       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
756 
757       // seventh column of a
758       w2 = _mm256_set1_pd(work[j*9+ 6]);
759       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
760       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
761       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
762 
763       // eigth column of a
764       w3 = _mm256_set1_pd(work[j*9+ 7]);
765       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
766       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
767       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
768 
769       // ninth column of a
770       w0 = _mm256_set1_pd(work[j*9+ 8]);
771       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
772       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
773       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
774     }
775 
776     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
777 
778     v += n*bs2;
779     if (!usecprow) z += bs;
780   }
781   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
782   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
783   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
784   PetscFunctionReturn(0);
785 }
786 #endif
787 
788 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
789 {
790   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
791   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
792   const PetscScalar *x,*xb;
793   PetscScalar       *zarray,xv;
794   const MatScalar   *v;
795   PetscErrorCode    ierr;
796   const PetscInt    *ii,*ij=a->j,*idx;
797   PetscInt          mbs,i,j,k,n,*ridx=NULL;
798   PetscBool         usecprow=a->compressedrow.use;
799 
800   PetscFunctionBegin;
801   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
802   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
803 
804   v = a->a;
805   if (usecprow) {
806     mbs  = a->compressedrow.nrows;
807     ii   = a->compressedrow.i;
808     ridx = a->compressedrow.rindex;
809     ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr);
810   } else {
811     mbs = a->mbs;
812     ii  = a->i;
813     z   = zarray;
814   }
815 
816   for (i=0; i<mbs; i++) {
817     n    = ii[i+1] - ii[i];
818     idx  = ij + ii[i];
819     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
820     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
821 
822     for (j=0; j<n; j++) {
823       xb = x + 11*(idx[j]);
824 
825       for (k=0; k<11; k++) {
826         xv     =  xb[k];
827         sum1  += v[0]*xv;
828         sum2  += v[1]*xv;
829         sum3  += v[2]*xv;
830         sum4  += v[3]*xv;
831         sum5  += v[4]*xv;
832         sum6  += v[5]*xv;
833         sum7  += v[6]*xv;
834         sum8  += v[7]*xv;
835         sum9  += v[8]*xv;
836         sum10 += v[9]*xv;
837         sum11 += v[10]*xv;
838         v     += 11;
839       }
840     }
841     if (usecprow) z = zarray + 11*ridx[i];
842     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
843     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
844 
845     if (!usecprow) z += 11;
846   }
847 
848   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
849   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
850   ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr);
851   PetscFunctionReturn(0);
852 }
853 
854 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
855 /* Default MatMult for block size 15 */
856 
857 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
858 {
859   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
860   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
861   const PetscScalar *x,*xb;
862   PetscScalar       *zarray,xv;
863   const MatScalar   *v;
864   PetscErrorCode    ierr;
865   const PetscInt    *ii,*ij=a->j,*idx;
866   PetscInt          mbs,i,j,k,n,*ridx=NULL;
867   PetscBool         usecprow=a->compressedrow.use;
868 
869   PetscFunctionBegin;
870   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
871   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
872 
873   v = a->a;
874   if (usecprow) {
875     mbs  = a->compressedrow.nrows;
876     ii   = a->compressedrow.i;
877     ridx = a->compressedrow.rindex;
878     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
879   } else {
880     mbs = a->mbs;
881     ii  = a->i;
882     z   = zarray;
883   }
884 
885   for (i=0; i<mbs; i++) {
886     n    = ii[i+1] - ii[i];
887     idx  = ij + ii[i];
888     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
889     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
890 
891     for (j=0; j<n; j++) {
892       xb = x + 15*(idx[j]);
893 
894       for (k=0; k<15; k++) {
895         xv     =  xb[k];
896         sum1  += v[0]*xv;
897         sum2  += v[1]*xv;
898         sum3  += v[2]*xv;
899         sum4  += v[3]*xv;
900         sum5  += v[4]*xv;
901         sum6  += v[5]*xv;
902         sum7  += v[6]*xv;
903         sum8  += v[7]*xv;
904         sum9  += v[8]*xv;
905         sum10 += v[9]*xv;
906         sum11 += v[10]*xv;
907         sum12 += v[11]*xv;
908         sum13 += v[12]*xv;
909         sum14 += v[13]*xv;
910         sum15 += v[14]*xv;
911         v     += 15;
912       }
913     }
914     if (usecprow) z = zarray + 15*ridx[i];
915     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
916     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
917 
918     if (!usecprow) z += 15;
919   }
920 
921   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
922   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
923   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
924   PetscFunctionReturn(0);
925 }
926 
927 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
928 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
929 {
930   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
931   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
932   const PetscScalar *x,*xb;
933   PetscScalar       x1,x2,x3,x4,*zarray;
934   const MatScalar   *v;
935   PetscErrorCode    ierr;
936   const PetscInt    *ii,*ij=a->j,*idx;
937   PetscInt          mbs,i,j,n,*ridx=NULL;
938   PetscBool         usecprow=a->compressedrow.use;
939 
940   PetscFunctionBegin;
941   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
942   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
943 
944   v = a->a;
945   if (usecprow) {
946     mbs  = a->compressedrow.nrows;
947     ii   = a->compressedrow.i;
948     ridx = a->compressedrow.rindex;
949     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
950   } else {
951     mbs = a->mbs;
952     ii  = a->i;
953     z   = zarray;
954   }
955 
956   for (i=0; i<mbs; i++) {
957     n    = ii[i+1] - ii[i];
958     idx  = ij + ii[i];
959     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
960     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
961 
962     for (j=0; j<n; j++) {
963       xb = x + 15*(idx[j]);
964       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
965 
966       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
967       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
968       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
969       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
970       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
971       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
972       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
973       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
974       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
975       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
976       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
977       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
978       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
979       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
980       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
981 
982       v += 60;
983 
984       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
985 
986       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
987       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
988       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
989       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
990       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
991       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
992       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
993       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
994       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
995       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
996       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
997       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
998       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
999       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1000       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1001       v     += 60;
1002 
1003       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1004       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1005       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1006       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1007       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1008       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1009       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1010       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1011       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1012       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1013       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1014       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1015       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1016       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1017       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1018       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1019       v     += 60;
1020 
1021       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1022       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1023       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1024       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1025       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1026       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1027       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1028       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1029       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1030       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1031       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1032       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1033       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1034       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1035       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1036       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1037       v     += 45;
1038     }
1039     if (usecprow) z = zarray + 15*ridx[i];
1040     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1041     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1042 
1043     if (!usecprow) z += 15;
1044   }
1045 
1046   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1047   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1048   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1049   PetscFunctionReturn(0);
1050 }
1051 
1052 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1053 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1054 {
1055   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1056   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1057   const PetscScalar *x,*xb;
1058   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1059   const MatScalar   *v;
1060   PetscErrorCode    ierr;
1061   const PetscInt    *ii,*ij=a->j,*idx;
1062   PetscInt          mbs,i,j,n,*ridx=NULL;
1063   PetscBool         usecprow=a->compressedrow.use;
1064 
1065   PetscFunctionBegin;
1066   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1067   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1068 
1069   v = a->a;
1070   if (usecprow) {
1071     mbs  = a->compressedrow.nrows;
1072     ii   = a->compressedrow.i;
1073     ridx = a->compressedrow.rindex;
1074     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1075   } else {
1076     mbs = a->mbs;
1077     ii  = a->i;
1078     z   = zarray;
1079   }
1080 
1081   for (i=0; i<mbs; i++) {
1082     n    = ii[i+1] - ii[i];
1083     idx  = ij + ii[i];
1084     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1085     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1086 
1087     for (j=0; j<n; j++) {
1088       xb = x + 15*(idx[j]);
1089       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1090       x8 = xb[7];
1091 
1092       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1093       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1094       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1095       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1096       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1097       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1098       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1099       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1100       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1101       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1102       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1103       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1104       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1105       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1106       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1107       v     += 120;
1108 
1109       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1110 
1111       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1112       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1113       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1114       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1115       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1116       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1117       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1118       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1119       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1120       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1121       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1122       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1123       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1124       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1125       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1126       v     += 105;
1127     }
1128     if (usecprow) z = zarray + 15*ridx[i];
1129     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1130     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1131 
1132     if (!usecprow) z += 15;
1133   }
1134 
1135   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1136   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1137   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1138   PetscFunctionReturn(0);
1139 }
1140 
1141 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1142 
1143 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1144 {
1145   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1146   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1147   const PetscScalar *x,*xb;
1148   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1149   const MatScalar   *v;
1150   PetscErrorCode    ierr;
1151   const PetscInt    *ii,*ij=a->j,*idx;
1152   PetscInt          mbs,i,j,n,*ridx=NULL;
1153   PetscBool         usecprow=a->compressedrow.use;
1154 
1155   PetscFunctionBegin;
1156   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1157   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1158 
1159   v = a->a;
1160   if (usecprow) {
1161     mbs  = a->compressedrow.nrows;
1162     ii   = a->compressedrow.i;
1163     ridx = a->compressedrow.rindex;
1164     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1165   } else {
1166     mbs = a->mbs;
1167     ii  = a->i;
1168     z   = zarray;
1169   }
1170 
1171   for (i=0; i<mbs; i++) {
1172     n    = ii[i+1] - ii[i];
1173     idx  = ij + ii[i];
1174     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1175     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1176 
1177     for (j=0; j<n; j++) {
1178       xb = x + 15*(idx[j]);
1179       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1180       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1181 
1182       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1183       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1184       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1185       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1186       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1187       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1188       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1189       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1190       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1191       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1192       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1193       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1194       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1195       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1196       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1197       v     += 225;
1198     }
1199     if (usecprow) z = zarray + 15*ridx[i];
1200     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1201     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1202 
1203     if (!usecprow) z += 15;
1204   }
1205 
1206   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1207   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1208   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1209   PetscFunctionReturn(0);
1210 }
1211 
1212 
1213 /*
1214     This will not work with MatScalar == float because it calls the BLAS
1215 */
1216 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1217 {
1218   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1219   PetscScalar       *z = 0,*work,*workt,*zarray;
1220   const PetscScalar *x,*xb;
1221   const MatScalar   *v;
1222   PetscErrorCode    ierr;
1223   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1224   const PetscInt    *idx,*ii,*ridx=NULL;
1225   PetscInt          ncols,k;
1226   PetscBool         usecprow=a->compressedrow.use;
1227 
1228   PetscFunctionBegin;
1229   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1230   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1231 
1232   idx = a->j;
1233   v   = a->a;
1234   if (usecprow) {
1235     mbs  = a->compressedrow.nrows;
1236     ii   = a->compressedrow.i;
1237     ridx = a->compressedrow.rindex;
1238     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
1239   } else {
1240     mbs = a->mbs;
1241     ii  = a->i;
1242     z   = zarray;
1243   }
1244 
1245   if (!a->mult_work) {
1246     k    = PetscMax(A->rmap->n,A->cmap->n);
1247     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1248   }
1249   work = a->mult_work;
1250   for (i=0; i<mbs; i++) {
1251     n           = ii[1] - ii[0]; ii++;
1252     ncols       = n*bs;
1253     workt       = work;
1254     for (j=0; j<n; j++) {
1255       xb = x + bs*(*idx++);
1256       for (k=0; k<bs; k++) workt[k] = xb[k];
1257       workt += bs;
1258     }
1259     if (usecprow) z = zarray + bs*ridx[i];
1260     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1261     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1262     v += n*bs2;
1263     if (!usecprow) z += bs;
1264   }
1265   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1266   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1267   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1268   PetscFunctionReturn(0);
1269 }
1270 
1271 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1272 {
1273   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1274   const PetscScalar *x;
1275   PetscScalar       *y,*z,sum;
1276   const MatScalar   *v;
1277   PetscErrorCode    ierr;
1278   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1279   const PetscInt    *idx,*ii;
1280   PetscBool         usecprow=a->compressedrow.use;
1281 
1282   PetscFunctionBegin;
1283   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1284   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1285 
1286   idx = a->j;
1287   v   = a->a;
1288   if (usecprow) {
1289     if (zz != yy) {
1290       ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr);
1291     }
1292     mbs  = a->compressedrow.nrows;
1293     ii   = a->compressedrow.i;
1294     ridx = a->compressedrow.rindex;
1295   } else {
1296     ii = a->i;
1297   }
1298 
1299   for (i=0; i<mbs; i++) {
1300     n = ii[1] - ii[0];
1301     ii++;
1302     if (!usecprow) {
1303       sum         = y[i];
1304     } else {
1305       sum = y[ridx[i]];
1306     }
1307     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1308     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1309     PetscSparseDensePlusDot(sum,x,v,idx,n);
1310     v   += n;
1311     idx += n;
1312     if (usecprow) {
1313       z[ridx[i]] = sum;
1314     } else {
1315       z[i] = sum;
1316     }
1317   }
1318   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1319   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1320   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1321   PetscFunctionReturn(0);
1322 }
1323 
1324 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1325 {
1326   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1327   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1328   const PetscScalar *x,*xb;
1329   PetscScalar       x1,x2,*yarray,*zarray;
1330   const MatScalar   *v;
1331   PetscErrorCode    ierr;
1332   PetscInt          mbs = a->mbs,i,n,j;
1333   const PetscInt    *idx,*ii,*ridx = NULL;
1334   PetscBool         usecprow = a->compressedrow.use;
1335 
1336   PetscFunctionBegin;
1337   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1338   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1339 
1340   idx = a->j;
1341   v   = a->a;
1342   if (usecprow) {
1343     if (zz != yy) {
1344       ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr);
1345     }
1346     mbs  = a->compressedrow.nrows;
1347     ii   = a->compressedrow.i;
1348     ridx = a->compressedrow.rindex;
1349   } else {
1350     ii = a->i;
1351     y  = yarray;
1352     z  = zarray;
1353   }
1354 
1355   for (i=0; i<mbs; i++) {
1356     n = ii[1] - ii[0]; ii++;
1357     if (usecprow) {
1358       z = zarray + 2*ridx[i];
1359       y = yarray + 2*ridx[i];
1360     }
1361     sum1 = y[0]; sum2 = y[1];
1362     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1363     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1364     for (j=0; j<n; j++) {
1365       xb = x + 2*(*idx++);
1366       x1 = xb[0];
1367       x2 = xb[1];
1368 
1369       sum1 += v[0]*x1 + v[2]*x2;
1370       sum2 += v[1]*x1 + v[3]*x2;
1371       v    += 4;
1372     }
1373     z[0] = sum1; z[1] = sum2;
1374     if (!usecprow) {
1375       z += 2; y += 2;
1376     }
1377   }
1378   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1379   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1380   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1381   PetscFunctionReturn(0);
1382 }
1383 
1384 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1385 {
1386   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1387   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1388   const PetscScalar *x,*xb;
1389   const MatScalar   *v;
1390   PetscErrorCode    ierr;
1391   PetscInt          mbs = a->mbs,i,j,n;
1392   const PetscInt    *idx,*ii,*ridx = NULL;
1393   PetscBool         usecprow = a->compressedrow.use;
1394 
1395   PetscFunctionBegin;
1396   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1397   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1398 
1399   idx = a->j;
1400   v   = a->a;
1401   if (usecprow) {
1402     if (zz != yy) {
1403       ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr);
1404     }
1405     mbs  = a->compressedrow.nrows;
1406     ii   = a->compressedrow.i;
1407     ridx = a->compressedrow.rindex;
1408   } else {
1409     ii = a->i;
1410     y  = yarray;
1411     z  = zarray;
1412   }
1413 
1414   for (i=0; i<mbs; i++) {
1415     n = ii[1] - ii[0]; ii++;
1416     if (usecprow) {
1417       z = zarray + 3*ridx[i];
1418       y = yarray + 3*ridx[i];
1419     }
1420     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1421     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1422     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1423     for (j=0; j<n; j++) {
1424       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1425       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1426       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1427       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1428       v    += 9;
1429     }
1430     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1431     if (!usecprow) {
1432       z += 3; y += 3;
1433     }
1434   }
1435   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1436   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1437   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1438   PetscFunctionReturn(0);
1439 }
1440 
1441 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1442 {
1443   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1444   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1445   const PetscScalar *x,*xb;
1446   const MatScalar   *v;
1447   PetscErrorCode    ierr;
1448   PetscInt          mbs = a->mbs,i,j,n;
1449   const PetscInt    *idx,*ii,*ridx=NULL;
1450   PetscBool         usecprow=a->compressedrow.use;
1451 
1452   PetscFunctionBegin;
1453   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1454   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1455 
1456   idx = a->j;
1457   v   = a->a;
1458   if (usecprow) {
1459     if (zz != yy) {
1460       ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr);
1461     }
1462     mbs  = a->compressedrow.nrows;
1463     ii   = a->compressedrow.i;
1464     ridx = a->compressedrow.rindex;
1465   } else {
1466     ii = a->i;
1467     y  = yarray;
1468     z  = zarray;
1469   }
1470 
1471   for (i=0; i<mbs; i++) {
1472     n = ii[1] - ii[0]; ii++;
1473     if (usecprow) {
1474       z = zarray + 4*ridx[i];
1475       y = yarray + 4*ridx[i];
1476     }
1477     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1478     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1479     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1480     for (j=0; j<n; j++) {
1481       xb    = x + 4*(*idx++);
1482       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1483       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1484       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1485       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1486       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1487       v    += 16;
1488     }
1489     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1490     if (!usecprow) {
1491       z += 4; y += 4;
1492     }
1493   }
1494   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1495   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1496   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1497   PetscFunctionReturn(0);
1498 }
1499 
1500 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1501 {
1502   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1503   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1504   const PetscScalar *x,*xb;
1505   PetscScalar       *yarray,*zarray;
1506   const MatScalar   *v;
1507   PetscErrorCode    ierr;
1508   PetscInt          mbs = a->mbs,i,j,n;
1509   const PetscInt    *idx,*ii,*ridx = NULL;
1510   PetscBool         usecprow=a->compressedrow.use;
1511 
1512   PetscFunctionBegin;
1513   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1514   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1515 
1516   idx = a->j;
1517   v   = a->a;
1518   if (usecprow) {
1519     if (zz != yy) {
1520       ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr);
1521     }
1522     mbs  = a->compressedrow.nrows;
1523     ii   = a->compressedrow.i;
1524     ridx = a->compressedrow.rindex;
1525   } else {
1526     ii = a->i;
1527     y  = yarray;
1528     z  = zarray;
1529   }
1530 
1531   for (i=0; i<mbs; i++) {
1532     n = ii[1] - ii[0]; ii++;
1533     if (usecprow) {
1534       z = zarray + 5*ridx[i];
1535       y = yarray + 5*ridx[i];
1536     }
1537     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1538     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1539     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1540     for (j=0; j<n; j++) {
1541       xb    = x + 5*(*idx++);
1542       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1543       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1544       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1545       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1546       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1547       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1548       v    += 25;
1549     }
1550     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1551     if (!usecprow) {
1552       z += 5; y += 5;
1553     }
1554   }
1555   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1556   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1557   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1558   PetscFunctionReturn(0);
1559 }
1560 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1561 {
1562   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1563   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1564   const PetscScalar *x,*xb;
1565   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1566   const MatScalar   *v;
1567   PetscErrorCode    ierr;
1568   PetscInt          mbs = a->mbs,i,j,n;
1569   const PetscInt    *idx,*ii,*ridx=NULL;
1570   PetscBool         usecprow=a->compressedrow.use;
1571 
1572   PetscFunctionBegin;
1573   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1574   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1575 
1576   idx = a->j;
1577   v   = a->a;
1578   if (usecprow) {
1579     if (zz != yy) {
1580       ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr);
1581     }
1582     mbs  = a->compressedrow.nrows;
1583     ii   = a->compressedrow.i;
1584     ridx = a->compressedrow.rindex;
1585   } else {
1586     ii = a->i;
1587     y  = yarray;
1588     z  = zarray;
1589   }
1590 
1591   for (i=0; i<mbs; i++) {
1592     n = ii[1] - ii[0]; ii++;
1593     if (usecprow) {
1594       z = zarray + 6*ridx[i];
1595       y = yarray + 6*ridx[i];
1596     }
1597     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1598     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1599     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1600     for (j=0; j<n; j++) {
1601       xb    = x + 6*(*idx++);
1602       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1603       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1604       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1605       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1606       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1607       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1608       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1609       v    += 36;
1610     }
1611     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1612     if (!usecprow) {
1613       z += 6; y += 6;
1614     }
1615   }
1616   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1617   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1618   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1619   PetscFunctionReturn(0);
1620 }
1621 
1622 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1623 {
1624   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1625   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1626   const PetscScalar *x,*xb;
1627   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1628   const MatScalar   *v;
1629   PetscErrorCode    ierr;
1630   PetscInt          mbs = a->mbs,i,j,n;
1631   const PetscInt    *idx,*ii,*ridx = NULL;
1632   PetscBool         usecprow=a->compressedrow.use;
1633 
1634   PetscFunctionBegin;
1635   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1636   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1637 
1638   idx = a->j;
1639   v   = a->a;
1640   if (usecprow) {
1641     if (zz != yy) {
1642       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
1643     }
1644     mbs  = a->compressedrow.nrows;
1645     ii   = a->compressedrow.i;
1646     ridx = a->compressedrow.rindex;
1647   } else {
1648     ii = a->i;
1649     y  = yarray;
1650     z  = zarray;
1651   }
1652 
1653   for (i=0; i<mbs; i++) {
1654     n = ii[1] - ii[0]; ii++;
1655     if (usecprow) {
1656       z = zarray + 7*ridx[i];
1657       y = yarray + 7*ridx[i];
1658     }
1659     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1660     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1661     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1662     for (j=0; j<n; j++) {
1663       xb    = x + 7*(*idx++);
1664       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1665       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1666       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1667       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1668       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1669       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1670       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1671       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1672       v    += 49;
1673     }
1674     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1675     if (!usecprow) {
1676       z += 7; y += 7;
1677     }
1678   }
1679   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1680   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1681   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1682   PetscFunctionReturn(0);
1683 }
1684 
1685 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1686 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1687 {
1688   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1689   PetscScalar    *z = 0,*work,*workt,*zarray;
1690   const PetscScalar *x,*xb;
1691   const MatScalar   *v;
1692   PetscErrorCode ierr;
1693   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1694   PetscInt       k;
1695   PetscBool      usecprow=a->compressedrow.use;
1696   const PetscInt *idx,*ii,*ridx=NULL;
1697 
1698   __m256d a0,a1,a2,a3,a4,a5;
1699   __m256d w0,w1,w2,w3;
1700   __m256d z0,z1,z2;
1701   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
1702 
1703   PetscFunctionBegin;
1704   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1705   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1706   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1707 
1708   idx = a->j;
1709   v   = a->a;
1710   if (usecprow) {
1711     mbs  = a->compressedrow.nrows;
1712     ii   = a->compressedrow.i;
1713     ridx = a->compressedrow.rindex;
1714   } else {
1715     mbs = a->mbs;
1716     ii  = a->i;
1717     z   = zarray;
1718   }
1719 
1720   if (!a->mult_work) {
1721     k    = PetscMax(A->rmap->n,A->cmap->n);
1722     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1723   }
1724 
1725   work = a->mult_work;
1726   for (i=0; i<mbs; i++) {
1727     n           = ii[1] - ii[0]; ii++;
1728     workt       = work;
1729     for (j=0; j<n; j++) {
1730       xb = x + bs*(*idx++);
1731       for (k=0; k<bs; k++) workt[k] = xb[k];
1732       workt += bs;
1733     }
1734     if (usecprow) z = zarray + bs*ridx[i];
1735 
1736     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
1737 
1738     for (j=0; j<n; j++) {
1739       // first column of a
1740       w0 = _mm256_set1_pd(work[j*9  ]);
1741       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1742       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1743       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
1744 
1745       // second column of a
1746       w1 = _mm256_set1_pd(work[j*9+ 1]);
1747       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1748       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1749       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1750 
1751       // third column of a
1752       w2 = _mm256_set1_pd(work[j*9+ 2]);
1753       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1754       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1755       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
1756 
1757       // fourth column of a
1758       w3 = _mm256_set1_pd(work[j*9+ 3]);
1759       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1760       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1761       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
1762 
1763       // fifth column of a
1764       w0 = _mm256_set1_pd(work[j*9+ 4]);
1765       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1766       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1767       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
1768 
1769       // sixth column of a
1770       w1 = _mm256_set1_pd(work[j*9+ 5]);
1771       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1772       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1773       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1774 
1775       // seventh column of a
1776       w2 = _mm256_set1_pd(work[j*9+ 6]);
1777       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1778       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1779       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
1780 
1781       // eigth column of a
1782       w3 = _mm256_set1_pd(work[j*9+ 7]);
1783       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1784       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1785       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
1786 
1787       // ninth column of a
1788       w0 = _mm256_set1_pd(work[j*9+ 8]);
1789       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1790       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1791       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1792     }
1793 
1794     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
1795 
1796     v += n*bs2;
1797     if (!usecprow) z += bs;
1798   }
1799   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1800   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1801   ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr);
1802   PetscFunctionReturn(0);
1803 }
1804 #endif
1805 
1806 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1807 {
1808   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1809   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1810   const PetscScalar *x,*xb;
1811   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1812   const MatScalar   *v;
1813   PetscErrorCode    ierr;
1814   PetscInt          mbs = a->mbs,i,j,n;
1815   const PetscInt    *idx,*ii,*ridx = NULL;
1816   PetscBool         usecprow=a->compressedrow.use;
1817 
1818   PetscFunctionBegin;
1819   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1820   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1821 
1822   idx = a->j;
1823   v   = a->a;
1824   if (usecprow) {
1825     if (zz != yy) {
1826       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
1827     }
1828     mbs  = a->compressedrow.nrows;
1829     ii   = a->compressedrow.i;
1830     ridx = a->compressedrow.rindex;
1831   } else {
1832     ii = a->i;
1833     y  = yarray;
1834     z  = zarray;
1835   }
1836 
1837   for (i=0; i<mbs; i++) {
1838     n = ii[1] - ii[0]; ii++;
1839     if (usecprow) {
1840       z = zarray + 11*ridx[i];
1841       y = yarray + 11*ridx[i];
1842     }
1843     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1844     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1845     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1846     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1847     for (j=0; j<n; j++) {
1848       xb    = x + 11*(*idx++);
1849       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1850       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
1851       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
1852       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
1853       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
1854       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
1855       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
1856       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
1857       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
1858       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
1859       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
1860       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
1861       v    += 121;
1862     }
1863     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1864     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1865     if (!usecprow) {
1866       z += 11; y += 11;
1867     }
1868   }
1869   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1870   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1871   ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr);
1872   PetscFunctionReturn(0);
1873 }
1874 
1875 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1876 {
1877   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1878   PetscScalar       *z = 0,*work,*workt,*zarray;
1879   const PetscScalar *x,*xb;
1880   const MatScalar   *v;
1881   PetscErrorCode    ierr;
1882   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1883   PetscInt          ncols,k;
1884   const PetscInt    *ridx = NULL,*idx,*ii;
1885   PetscBool         usecprow = a->compressedrow.use;
1886 
1887   PetscFunctionBegin;
1888   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1889   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1890   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1891 
1892   idx = a->j;
1893   v   = a->a;
1894   if (usecprow) {
1895     mbs  = a->compressedrow.nrows;
1896     ii   = a->compressedrow.i;
1897     ridx = a->compressedrow.rindex;
1898   } else {
1899     mbs = a->mbs;
1900     ii  = a->i;
1901     z   = zarray;
1902   }
1903 
1904   if (!a->mult_work) {
1905     k    = PetscMax(A->rmap->n,A->cmap->n);
1906     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1907   }
1908   work = a->mult_work;
1909   for (i=0; i<mbs; i++) {
1910     n     = ii[1] - ii[0]; ii++;
1911     ncols = n*bs;
1912     workt = work;
1913     for (j=0; j<n; j++) {
1914       xb = x + bs*(*idx++);
1915       for (k=0; k<bs; k++) workt[k] = xb[k];
1916       workt += bs;
1917     }
1918     if (usecprow) z = zarray + bs*ridx[i];
1919     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1920     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1921     v += n*bs2;
1922     if (!usecprow) z += bs;
1923   }
1924   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1925   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1926   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1927   PetscFunctionReturn(0);
1928 }
1929 
1930 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1931 {
1932   PetscScalar    zero = 0.0;
1933   PetscErrorCode ierr;
1934 
1935   PetscFunctionBegin;
1936   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1937   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1938   PetscFunctionReturn(0);
1939 }
1940 
1941 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1942 {
1943   PetscScalar    zero = 0.0;
1944   PetscErrorCode ierr;
1945 
1946   PetscFunctionBegin;
1947   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1948   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1949   PetscFunctionReturn(0);
1950 }
1951 
1952 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1953 {
1954   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1955   PetscScalar       *z,x1,x2,x3,x4,x5;
1956   const PetscScalar *x,*xb = NULL;
1957   const MatScalar   *v;
1958   PetscErrorCode    ierr;
1959   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1960   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1961   Mat_CompressedRow cprow = a->compressedrow;
1962   PetscBool         usecprow = cprow.use;
1963 
1964   PetscFunctionBegin;
1965   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1966   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1967   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1968 
1969   idx = a->j;
1970   v   = a->a;
1971   if (usecprow) {
1972     mbs  = cprow.nrows;
1973     ii   = cprow.i;
1974     ridx = cprow.rindex;
1975   } else {
1976     mbs=a->mbs;
1977     ii = a->i;
1978     xb = x;
1979   }
1980 
1981   switch (bs) {
1982   case 1:
1983     for (i=0; i<mbs; i++) {
1984       if (usecprow) xb = x + ridx[i];
1985       x1 = xb[0];
1986       ib = idx + ii[0];
1987       n  = ii[1] - ii[0]; ii++;
1988       for (j=0; j<n; j++) {
1989         rval     = ib[j];
1990         z[rval] += PetscConj(*v) * x1;
1991         v++;
1992       }
1993       if (!usecprow) xb++;
1994     }
1995     break;
1996   case 2:
1997     for (i=0; i<mbs; i++) {
1998       if (usecprow) xb = x + 2*ridx[i];
1999       x1 = xb[0]; x2 = xb[1];
2000       ib = idx + ii[0];
2001       n  = ii[1] - ii[0]; ii++;
2002       for (j=0; j<n; j++) {
2003         rval       = ib[j]*2;
2004         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2005         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2006         v         += 4;
2007       }
2008       if (!usecprow) xb += 2;
2009     }
2010     break;
2011   case 3:
2012     for (i=0; i<mbs; i++) {
2013       if (usecprow) xb = x + 3*ridx[i];
2014       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2015       ib = idx + ii[0];
2016       n  = ii[1] - ii[0]; ii++;
2017       for (j=0; j<n; j++) {
2018         rval       = ib[j]*3;
2019         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2020         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2021         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2022         v         += 9;
2023       }
2024       if (!usecprow) xb += 3;
2025     }
2026     break;
2027   case 4:
2028     for (i=0; i<mbs; i++) {
2029       if (usecprow) xb = x + 4*ridx[i];
2030       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2031       ib = idx + ii[0];
2032       n  = ii[1] - ii[0]; ii++;
2033       for (j=0; j<n; j++) {
2034         rval       = ib[j]*4;
2035         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2036         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2037         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2038         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2039         v         += 16;
2040       }
2041       if (!usecprow) xb += 4;
2042     }
2043     break;
2044   case 5:
2045     for (i=0; i<mbs; i++) {
2046       if (usecprow) xb = x + 5*ridx[i];
2047       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2048       x4 = xb[3]; x5 = xb[4];
2049       ib = idx + ii[0];
2050       n  = ii[1] - ii[0]; ii++;
2051       for (j=0; j<n; j++) {
2052         rval       = ib[j]*5;
2053         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2054         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2055         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2056         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2057         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2058         v         += 25;
2059       }
2060       if (!usecprow) xb += 5;
2061     }
2062     break;
2063   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2064     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2065 #if 0
2066     {
2067       PetscInt          ncols,k,bs2=a->bs2;
2068       PetscScalar       *work,*workt,zb;
2069       const PetscScalar *xtmp;
2070       if (!a->mult_work) {
2071         k    = PetscMax(A->rmap->n,A->cmap->n);
2072         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2073       }
2074       work = a->mult_work;
2075       xtmp = x;
2076       for (i=0; i<mbs; i++) {
2077         n     = ii[1] - ii[0]; ii++;
2078         ncols = n*bs;
2079         ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2080         if (usecprow) xtmp = x + bs*ridx[i];
2081         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2082         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2083         v += n*bs2;
2084         if (!usecprow) xtmp += bs;
2085         workt = work;
2086         for (j=0; j<n; j++) {
2087           zb = z + bs*(*idx++);
2088           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2089           workt += bs;
2090         }
2091       }
2092     }
2093 #endif
2094   }
2095   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2096   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2097   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2098   PetscFunctionReturn(0);
2099 }
2100 
2101 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2102 {
2103   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2104   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2105   const PetscScalar *x,*xb = 0;
2106   const MatScalar   *v;
2107   PetscErrorCode    ierr;
2108   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2109   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2110   Mat_CompressedRow cprow   = a->compressedrow;
2111   PetscBool         usecprow=cprow.use;
2112 
2113   PetscFunctionBegin;
2114   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2115   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2116   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2117 
2118   idx = a->j;
2119   v   = a->a;
2120   if (usecprow) {
2121     mbs  = cprow.nrows;
2122     ii   = cprow.i;
2123     ridx = cprow.rindex;
2124   } else {
2125     mbs=a->mbs;
2126     ii = a->i;
2127     xb = x;
2128   }
2129 
2130   switch (bs) {
2131   case 1:
2132     for (i=0; i<mbs; i++) {
2133       if (usecprow) xb = x + ridx[i];
2134       x1 = xb[0];
2135       ib = idx + ii[0];
2136       n  = ii[1] - ii[0]; ii++;
2137       for (j=0; j<n; j++) {
2138         rval     = ib[j];
2139         z[rval] += *v * x1;
2140         v++;
2141       }
2142       if (!usecprow) xb++;
2143     }
2144     break;
2145   case 2:
2146     for (i=0; i<mbs; i++) {
2147       if (usecprow) xb = x + 2*ridx[i];
2148       x1 = xb[0]; x2 = xb[1];
2149       ib = idx + ii[0];
2150       n  = ii[1] - ii[0]; ii++;
2151       for (j=0; j<n; j++) {
2152         rval       = ib[j]*2;
2153         z[rval++] += v[0]*x1 + v[1]*x2;
2154         z[rval++] += v[2]*x1 + v[3]*x2;
2155         v         += 4;
2156       }
2157       if (!usecprow) xb += 2;
2158     }
2159     break;
2160   case 3:
2161     for (i=0; i<mbs; i++) {
2162       if (usecprow) xb = x + 3*ridx[i];
2163       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2164       ib = idx + ii[0];
2165       n  = ii[1] - ii[0]; ii++;
2166       for (j=0; j<n; j++) {
2167         rval       = ib[j]*3;
2168         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2169         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2170         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2171         v         += 9;
2172       }
2173       if (!usecprow) xb += 3;
2174     }
2175     break;
2176   case 4:
2177     for (i=0; i<mbs; i++) {
2178       if (usecprow) xb = x + 4*ridx[i];
2179       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2180       ib = idx + ii[0];
2181       n  = ii[1] - ii[0]; ii++;
2182       for (j=0; j<n; j++) {
2183         rval       = ib[j]*4;
2184         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2185         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2186         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2187         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2188         v         += 16;
2189       }
2190       if (!usecprow) xb += 4;
2191     }
2192     break;
2193   case 5:
2194     for (i=0; i<mbs; i++) {
2195       if (usecprow) xb = x + 5*ridx[i];
2196       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2197       x4 = xb[3]; x5 = xb[4];
2198       ib = idx + ii[0];
2199       n  = ii[1] - ii[0]; ii++;
2200       for (j=0; j<n; j++) {
2201         rval       = ib[j]*5;
2202         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2203         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2204         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2205         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2206         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2207         v         += 25;
2208       }
2209       if (!usecprow) xb += 5;
2210     }
2211     break;
2212   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2213     PetscInt          ncols,k;
2214     PetscScalar       *work,*workt;
2215     const PetscScalar *xtmp;
2216     if (!a->mult_work) {
2217       k    = PetscMax(A->rmap->n,A->cmap->n);
2218       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2219     }
2220     work = a->mult_work;
2221     xtmp = x;
2222     for (i=0; i<mbs; i++) {
2223       n     = ii[1] - ii[0]; ii++;
2224       ncols = n*bs;
2225       ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2226       if (usecprow) xtmp = x + bs*ridx[i];
2227       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2228       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2229       v += n*bs2;
2230       if (!usecprow) xtmp += bs;
2231       workt = work;
2232       for (j=0; j<n; j++) {
2233         zb = z + bs*(*idx++);
2234         for (k=0; k<bs; k++) zb[k] += workt[k];
2235         workt += bs;
2236       }
2237     }
2238     }
2239   }
2240   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2241   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2242   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2243   PetscFunctionReturn(0);
2244 }
2245 
2246 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2247 {
2248   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2249   PetscInt       totalnz = a->bs2*a->nz;
2250   PetscScalar    oalpha  = alpha;
2251   PetscErrorCode ierr;
2252   PetscBLASInt   one = 1,tnz;
2253 
2254   PetscFunctionBegin;
2255   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2256   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2257   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2258   PetscFunctionReturn(0);
2259 }
2260 
2261 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2262 {
2263   PetscErrorCode ierr;
2264   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2265   MatScalar      *v  = a->a;
2266   PetscReal      sum = 0.0;
2267   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2268 
2269   PetscFunctionBegin;
2270   if (type == NORM_FROBENIUS) {
2271 #if defined(PETSC_USE_REAL___FP16)
2272     PetscBLASInt one = 1,cnt = bs2*nz;
2273     *norm = BLASnrm2_(&cnt,v,&one);
2274 #else
2275     for (i=0; i<bs2*nz; i++) {
2276       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2277     }
2278 #endif
2279     *norm = PetscSqrtReal(sum);
2280     ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr);
2281   } else if (type == NORM_1) { /* maximum column sum */
2282     PetscReal *tmp;
2283     PetscInt  *bcol = a->j;
2284     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2285     for (i=0; i<nz; i++) {
2286       for (j=0; j<bs; j++) {
2287         k1 = bs*(*bcol) + j; /* column index */
2288         for (k=0; k<bs; k++) {
2289           tmp[k1] += PetscAbsScalar(*v); v++;
2290         }
2291       }
2292       bcol++;
2293     }
2294     *norm = 0.0;
2295     for (j=0; j<A->cmap->n; j++) {
2296       if (tmp[j] > *norm) *norm = tmp[j];
2297     }
2298     ierr = PetscFree(tmp);CHKERRQ(ierr);
2299     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2300   } else if (type == NORM_INFINITY) { /* maximum row sum */
2301     *norm = 0.0;
2302     for (k=0; k<bs; k++) {
2303       for (j=0; j<a->mbs; j++) {
2304         v   = a->a + bs2*a->i[j] + k;
2305         sum = 0.0;
2306         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2307           for (k1=0; k1<bs; k1++) {
2308             sum += PetscAbsScalar(*v);
2309             v   += bs;
2310           }
2311         }
2312         if (sum > *norm) *norm = sum;
2313       }
2314     }
2315     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2316   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2317   PetscFunctionReturn(0);
2318 }
2319 
2320 
2321 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2322 {
2323   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2324   PetscErrorCode ierr;
2325 
2326   PetscFunctionBegin;
2327   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2328   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2329     *flg = PETSC_FALSE;
2330     PetscFunctionReturn(0);
2331   }
2332 
2333   /* if the a->i are the same */
2334   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
2335   if (!*flg) PetscFunctionReturn(0);
2336 
2337   /* if a->j are the same */
2338   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
2339   if (!*flg) PetscFunctionReturn(0);
2340 
2341   /* if a->a are the same */
2342   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr);
2343   PetscFunctionReturn(0);
2344 
2345 }
2346 
2347 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2348 {
2349   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2350   PetscErrorCode ierr;
2351   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2352   PetscScalar    *x,zero = 0.0;
2353   MatScalar      *aa,*aa_j;
2354 
2355   PetscFunctionBegin;
2356   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2357   bs   = A->rmap->bs;
2358   aa   = a->a;
2359   ai   = a->i;
2360   aj   = a->j;
2361   ambs = a->mbs;
2362   bs2  = a->bs2;
2363 
2364   ierr = VecSet(v,zero);CHKERRQ(ierr);
2365   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2366   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2367   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2368   for (i=0; i<ambs; i++) {
2369     for (j=ai[i]; j<ai[i+1]; j++) {
2370       if (aj[j] == i) {
2371         row  = i*bs;
2372         aa_j = aa+j*bs2;
2373         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2374         break;
2375       }
2376     }
2377   }
2378   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2379   PetscFunctionReturn(0);
2380 }
2381 
2382 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2383 {
2384   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2385   const PetscScalar *l,*r,*li,*ri;
2386   PetscScalar       x;
2387   MatScalar         *aa, *v;
2388   PetscErrorCode    ierr;
2389   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2390   const PetscInt    *ai,*aj;
2391 
2392   PetscFunctionBegin;
2393   ai  = a->i;
2394   aj  = a->j;
2395   aa  = a->a;
2396   m   = A->rmap->n;
2397   n   = A->cmap->n;
2398   bs  = A->rmap->bs;
2399   mbs = a->mbs;
2400   bs2 = a->bs2;
2401   if (ll) {
2402     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2403     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2404     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2405     for (i=0; i<mbs; i++) { /* for each block row */
2406       M  = ai[i+1] - ai[i];
2407       li = l + i*bs;
2408       v  = aa + bs2*ai[i];
2409       for (j=0; j<M; j++) { /* for each block */
2410         for (k=0; k<bs2; k++) {
2411           (*v++) *= li[k%bs];
2412         }
2413       }
2414     }
2415     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2416     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2417   }
2418 
2419   if (rr) {
2420     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2421     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2422     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2423     for (i=0; i<mbs; i++) { /* for each block row */
2424       iai = ai[i];
2425       M   = ai[i+1] - iai;
2426       v   = aa + bs2*iai;
2427       for (j=0; j<M; j++) { /* for each block */
2428         ri = r + bs*aj[iai+j];
2429         for (k=0; k<bs; k++) {
2430           x = ri[k];
2431           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2432           v += bs;
2433         }
2434       }
2435     }
2436     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2437     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2438   }
2439   PetscFunctionReturn(0);
2440 }
2441 
2442 
2443 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2444 {
2445   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2446 
2447   PetscFunctionBegin;
2448   info->block_size   = a->bs2;
2449   info->nz_allocated = a->bs2*a->maxnz;
2450   info->nz_used      = a->bs2*a->nz;
2451   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2452   info->assemblies   = A->num_ass;
2453   info->mallocs      = A->info.mallocs;
2454   info->memory       = ((PetscObject)A)->mem;
2455   if (A->factortype) {
2456     info->fill_ratio_given  = A->info.fill_ratio_given;
2457     info->fill_ratio_needed = A->info.fill_ratio_needed;
2458     info->factor_mallocs    = A->info.factor_mallocs;
2459   } else {
2460     info->fill_ratio_given  = 0;
2461     info->fill_ratio_needed = 0;
2462     info->factor_mallocs    = 0;
2463   }
2464   PetscFunctionReturn(0);
2465 }
2466 
2467 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2468 {
2469   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2470   PetscErrorCode ierr;
2471 
2472   PetscFunctionBegin;
2473   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
2474   PetscFunctionReturn(0);
2475 }
2476