xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision ebead697dbf761eb322f829370bbe90b3bd93fa3)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8 #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
14   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
15   const PetscInt *idx;
16   PetscInt       start,end,*ai,*aj,bs,*nidx2;
17   PetscBT        table;
18 
19   PetscFunctionBegin;
20   m  = a->mbs;
21   ai = a->i;
22   aj = a->j;
23   bs = A->rmap->bs;
24 
25   PetscCheck(ov >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26 
27   PetscCall(PetscBTCreate(m,&table));
28   PetscCall(PetscMalloc1(m+1,&nidx));
29   PetscCall(PetscMalloc1(A->rmap->N+1,&nidx2));
30 
31   for (i=0; i<is_max; i++) {
32     /* Initialise the two local arrays */
33     isz  = 0;
34     PetscCall(PetscBTMemzero(m,table));
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     PetscCall(ISGetIndices(is[i],&idx));
38     PetscCall(ISGetLocalSize(is[i],&n));
39 
40     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
41     for (j=0; j<n; ++j) {
42       ival = idx[j]/bs; /* convert the indices into block indices */
43       PetscCheck(ival<m,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
45     }
46     PetscCall(ISRestoreIndices(is[i],&idx));
47     PetscCall(ISDestroy(&is[i]));
48 
49     k = 0;
50     for (j=0; j<ov; j++) { /* for each overlap*/
51       n = isz;
52       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
53         row   = nidx[k];
54         start = ai[row];
55         end   = ai[row+1];
56         for (l = start; l<end; l++) {
57           val = aj[l];
58           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
59         }
60       }
61     }
62     /* expand the Index Set */
63     for (j=0; j<isz; j++) {
64       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
65     }
66     PetscCall(ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i));
67   }
68   PetscCall(PetscBTDestroy(&table));
69   PetscCall(PetscFree(nidx));
70   PetscCall(PetscFree(nidx2));
71   PetscFunctionReturn(0);
72 }
73 
74 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
75 {
76   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
77   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
78   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
79   const PetscInt *irow,*icol;
80   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
81   PetscInt       *aj = a->j,*ai = a->i;
82   MatScalar      *mat_a;
83   Mat            C;
84   PetscBool      flag;
85 
86   PetscFunctionBegin;
87   PetscCall(ISGetIndices(isrow,&irow));
88   PetscCall(ISGetIndices(iscol,&icol));
89   PetscCall(ISGetLocalSize(isrow,&nrows));
90   PetscCall(ISGetLocalSize(iscol,&ncols));
91 
92   PetscCall(PetscCalloc1(1+oldcols,&smap));
93   ssmap = smap;
94   PetscCall(PetscMalloc1(1+nrows,&lens));
95   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
96   /* determine lens of each row */
97   for (i=0; i<nrows; i++) {
98     kstart  = ai[irow[i]];
99     kend    = kstart + a->ilen[irow[i]];
100     lens[i] = 0;
101     for (k=kstart; k<kend; k++) {
102       if (ssmap[aj[k]]) lens[i]++;
103     }
104   }
105   /* Create and fill new matrix */
106   if (scall == MAT_REUSE_MATRIX) {
107     c = (Mat_SeqBAIJ*)((*B)->data);
108 
109     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
110     PetscCall(PetscArraycmp(c->ilen,lens,c->mbs,&flag));
111     PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
112     PetscCall(PetscArrayzero(c->ilen,c->mbs));
113     C    = *B;
114   } else {
115     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&C));
116     PetscCall(MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE));
117     PetscCall(MatSetType(C,((PetscObject)A)->type_name));
118     PetscCall(MatSeqBAIJSetPreallocation(C,bs,0,lens));
119   }
120   c = (Mat_SeqBAIJ*)(C->data);
121   for (i=0; i<nrows; i++) {
122     row      = irow[i];
123     kstart   = ai[row];
124     kend     = kstart + a->ilen[row];
125     mat_i    = c->i[i];
126     mat_j    = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */
127     mat_a    = c->a ? c->a + mat_i*bs2 : NULL; /* mustn't add to NULL, that is UB */
128     mat_ilen = c->ilen + i;
129     for (k=kstart; k<kend; k++) {
130       if ((tcol=ssmap[a->j[k]])) {
131         *mat_j++ = tcol - 1;
132         PetscCall(PetscArraycpy(mat_a,a->a+k*bs2,bs2));
133         mat_a   += bs2;
134         (*mat_ilen)++;
135       }
136     }
137   }
138   /* sort */
139   if (c->j && c->a) {
140     MatScalar *work;
141     PetscCall(PetscMalloc1(bs2,&work));
142     for (i=0; i<nrows; i++) {
143       PetscInt ilen;
144       mat_i = c->i[i];
145       mat_j = c->j + mat_i;
146       mat_a = c->a + mat_i*bs2;
147       ilen  = c->ilen[i];
148       PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work));
149     }
150     PetscCall(PetscFree(work));
151   }
152 
153   /* Free work space */
154   PetscCall(ISRestoreIndices(iscol,&icol));
155   PetscCall(PetscFree(smap));
156   PetscCall(PetscFree(lens));
157   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
158   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
159 
160   PetscCall(ISRestoreIndices(isrow,&irow));
161   *B   = C;
162   PetscFunctionReturn(0);
163 }
164 
165 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
166 {
167   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
168   IS             is1,is2;
169   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
170   const PetscInt *irow,*icol;
171 
172   PetscFunctionBegin;
173   PetscCall(ISGetIndices(isrow,&irow));
174   PetscCall(ISGetIndices(iscol,&icol));
175   PetscCall(ISGetLocalSize(isrow,&nrows));
176   PetscCall(ISGetLocalSize(iscol,&ncols));
177 
178   /* Verify if the indices corespond to each element in a block
179    and form the IS with compressed IS */
180   maxmnbs = PetscMax(a->mbs,a->nbs);
181   PetscCall(PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary));
182   PetscCall(PetscArrayzero(vary,a->mbs));
183   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
184   for (i=0; i<a->mbs; i++) {
185     PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
186   }
187   count = 0;
188   for (i=0; i<nrows; i++) {
189     j = irow[i] / bs;
190     if ((vary[j]--)==bs) iary[count++] = j;
191   }
192   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1));
193 
194   PetscCall(PetscArrayzero(vary,a->nbs));
195   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
196   for (i=0; i<a->nbs; i++) {
197     PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
198   }
199   count = 0;
200   for (i=0; i<ncols; i++) {
201     j = icol[i] / bs;
202     if ((vary[j]--)==bs) iary[count++] = j;
203   }
204   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2));
205   PetscCall(ISRestoreIndices(isrow,&irow));
206   PetscCall(ISRestoreIndices(iscol,&icol));
207   PetscCall(PetscFree2(vary,iary));
208 
209   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B));
210   PetscCall(ISDestroy(&is1));
211   PetscCall(ISDestroy(&is2));
212   PetscFunctionReturn(0);
213 }
214 
215 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
216 {
217   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
218   Mat_SubSppt    *submatj = c->submatis1;
219 
220   PetscFunctionBegin;
221   PetscCall((*submatj->destroy)(C));
222   PetscCall(MatDestroySubMatrix_Private(submatj));
223   PetscFunctionReturn(0);
224 }
225 
226 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
227 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
228 {
229   PetscInt       i;
230   Mat            C;
231   Mat_SeqBAIJ    *c;
232   Mat_SubSppt    *submatj;
233 
234   PetscFunctionBegin;
235   for (i=0; i<n; i++) {
236     C       = (*mat)[i];
237     c       = (Mat_SeqBAIJ*)C->data;
238     submatj = c->submatis1;
239     if (submatj) {
240       if (--((PetscObject)C)->refct <= 0) {
241         PetscCall(PetscFree(C->factorprefix));
242         PetscCall((*submatj->destroy)(C));
243         PetscCall(MatDestroySubMatrix_Private(submatj));
244         PetscCall(PetscFree(C->defaultvectype));
245         PetscCall(PetscLayoutDestroy(&C->rmap));
246         PetscCall(PetscLayoutDestroy(&C->cmap));
247         PetscCall(PetscHeaderDestroy(&C));
248       }
249     } else {
250       PetscCall(MatDestroy(&C));
251     }
252   }
253 
254   /* Destroy Dummy submatrices created for reuse */
255   PetscCall(MatDestroySubMatrices_Dummy(n,mat));
256 
257   PetscCall(PetscFree(*mat));
258   PetscFunctionReturn(0);
259 }
260 
261 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
262 {
263   PetscInt       i;
264 
265   PetscFunctionBegin;
266   if (scall == MAT_INITIAL_MATRIX) {
267     PetscCall(PetscCalloc1(n+1,B));
268   }
269 
270   for (i=0; i<n; i++) {
271     PetscCall(MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]));
272   }
273   PetscFunctionReturn(0);
274 }
275 
276 /* -------------------------------------------------------*/
277 /* Should check that shapes of vectors and matrices match */
278 /* -------------------------------------------------------*/
279 
280 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
281 {
282   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
283   PetscScalar       *z,sum;
284   const PetscScalar *x;
285   const MatScalar   *v;
286   PetscInt          mbs,i,n;
287   const PetscInt    *idx,*ii,*ridx=NULL;
288   PetscBool         usecprow=a->compressedrow.use;
289 
290   PetscFunctionBegin;
291   PetscCall(VecGetArrayRead(xx,&x));
292   PetscCall(VecGetArrayWrite(zz,&z));
293 
294   if (usecprow) {
295     mbs  = a->compressedrow.nrows;
296     ii   = a->compressedrow.i;
297     ridx = a->compressedrow.rindex;
298     PetscCall(PetscArrayzero(z,a->mbs));
299   } else {
300     mbs = a->mbs;
301     ii  = a->i;
302   }
303 
304   for (i=0; i<mbs; i++) {
305     n   = ii[1] - ii[0];
306     v   = a->a + ii[0];
307     idx = a->j + ii[0];
308     ii++;
309     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
310     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
311     sum = 0.0;
312     PetscSparseDensePlusDot(sum,x,v,idx,n);
313     if (usecprow) {
314       z[ridx[i]] = sum;
315     } else {
316       z[i]       = sum;
317     }
318   }
319   PetscCall(VecRestoreArrayRead(xx,&x));
320   PetscCall(VecRestoreArrayWrite(zz,&z));
321   PetscCall(PetscLogFlops(2.0*a->nz - a->nonzerorowcnt));
322   PetscFunctionReturn(0);
323 }
324 
325 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
326 {
327   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
328   PetscScalar       *z = NULL,sum1,sum2,*zarray;
329   const PetscScalar *x,*xb;
330   PetscScalar       x1,x2;
331   const MatScalar   *v;
332   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
333   PetscBool         usecprow=a->compressedrow.use;
334 
335   PetscFunctionBegin;
336   PetscCall(VecGetArrayRead(xx,&x));
337   PetscCall(VecGetArrayWrite(zz,&zarray));
338 
339   idx = a->j;
340   v   = a->a;
341   if (usecprow) {
342     mbs  = a->compressedrow.nrows;
343     ii   = a->compressedrow.i;
344     ridx = a->compressedrow.rindex;
345     PetscCall(PetscArrayzero(zarray,2*a->mbs));
346   } else {
347     mbs = a->mbs;
348     ii  = a->i;
349     z   = zarray;
350   }
351 
352   for (i=0; i<mbs; i++) {
353     n           = ii[1] - ii[0]; ii++;
354     sum1        = 0.0; sum2 = 0.0;
355     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
356     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
357     for (j=0; j<n; j++) {
358       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
359       sum1 += v[0]*x1 + v[2]*x2;
360       sum2 += v[1]*x1 + v[3]*x2;
361       v    += 4;
362     }
363     if (usecprow) z = zarray + 2*ridx[i];
364     z[0] = sum1; z[1] = sum2;
365     if (!usecprow) z += 2;
366   }
367   PetscCall(VecRestoreArrayRead(xx,&x));
368   PetscCall(VecRestoreArrayWrite(zz,&zarray));
369   PetscCall(PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt));
370   PetscFunctionReturn(0);
371 }
372 
373 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
374 {
375   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
376   PetscScalar       *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
377   const PetscScalar *x,*xb;
378   const MatScalar   *v;
379   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
380   PetscBool         usecprow=a->compressedrow.use;
381 
382 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
383 #pragma disjoint(*v,*z,*xb)
384 #endif
385 
386   PetscFunctionBegin;
387   PetscCall(VecGetArrayRead(xx,&x));
388   PetscCall(VecGetArrayWrite(zz,&zarray));
389 
390   idx = a->j;
391   v   = a->a;
392   if (usecprow) {
393     mbs  = a->compressedrow.nrows;
394     ii   = a->compressedrow.i;
395     ridx = a->compressedrow.rindex;
396     PetscCall(PetscArrayzero(zarray,3*a->mbs));
397   } else {
398     mbs = a->mbs;
399     ii  = a->i;
400     z   = zarray;
401   }
402 
403   for (i=0; i<mbs; i++) {
404     n           = ii[1] - ii[0]; ii++;
405     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
406     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
407     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
408     for (j=0; j<n; j++) {
409       xb = x + 3*(*idx++);
410       x1 = xb[0];
411       x2 = xb[1];
412       x3 = xb[2];
413 
414       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
415       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
416       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
417       v    += 9;
418     }
419     if (usecprow) z = zarray + 3*ridx[i];
420     z[0] = sum1; z[1] = sum2; z[2] = sum3;
421     if (!usecprow) z += 3;
422   }
423   PetscCall(VecRestoreArrayRead(xx,&x));
424   PetscCall(VecRestoreArrayWrite(zz,&zarray));
425   PetscCall(PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt));
426   PetscFunctionReturn(0);
427 }
428 
429 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
430 {
431   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
432   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
433   const PetscScalar *x,*xb;
434   const MatScalar   *v;
435   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
436   PetscBool         usecprow=a->compressedrow.use;
437 
438   PetscFunctionBegin;
439   PetscCall(VecGetArrayRead(xx,&x));
440   PetscCall(VecGetArrayWrite(zz,&zarray));
441 
442   idx = a->j;
443   v   = a->a;
444   if (usecprow) {
445     mbs  = a->compressedrow.nrows;
446     ii   = a->compressedrow.i;
447     ridx = a->compressedrow.rindex;
448     PetscCall(PetscArrayzero(zarray,4*a->mbs));
449   } else {
450     mbs = a->mbs;
451     ii  = a->i;
452     z   = zarray;
453   }
454 
455   for (i=0; i<mbs; i++) {
456     n = ii[1] - ii[0];
457     ii++;
458     sum1 = 0.0;
459     sum2 = 0.0;
460     sum3 = 0.0;
461     sum4 = 0.0;
462 
463     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
464     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
465     for (j=0; j<n; j++) {
466       xb    = x + 4*(*idx++);
467       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
468       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
469       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
470       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
471       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
472       v    += 16;
473     }
474     if (usecprow) z = zarray + 4*ridx[i];
475     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
476     if (!usecprow) z += 4;
477   }
478   PetscCall(VecRestoreArrayRead(xx,&x));
479   PetscCall(VecRestoreArrayWrite(zz,&zarray));
480   PetscCall(PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt));
481   PetscFunctionReturn(0);
482 }
483 
484 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
485 {
486   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
487   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
488   const PetscScalar *xb,*x;
489   const MatScalar   *v;
490   const PetscInt    *idx,*ii,*ridx=NULL;
491   PetscInt          mbs,i,j,n;
492   PetscBool         usecprow=a->compressedrow.use;
493 
494   PetscFunctionBegin;
495   PetscCall(VecGetArrayRead(xx,&x));
496   PetscCall(VecGetArrayWrite(zz,&zarray));
497 
498   idx = a->j;
499   v   = a->a;
500   if (usecprow) {
501     mbs  = a->compressedrow.nrows;
502     ii   = a->compressedrow.i;
503     ridx = a->compressedrow.rindex;
504     PetscCall(PetscArrayzero(zarray,5*a->mbs));
505   } else {
506     mbs = a->mbs;
507     ii  = a->i;
508     z   = zarray;
509   }
510 
511   for (i=0; i<mbs; i++) {
512     n           = ii[1] - ii[0]; ii++;
513     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
514     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
515     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
516     for (j=0; j<n; j++) {
517       xb    = x + 5*(*idx++);
518       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
519       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
520       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
521       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
522       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
523       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
524       v    += 25;
525     }
526     if (usecprow) z = zarray + 5*ridx[i];
527     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
528     if (!usecprow) z += 5;
529   }
530   PetscCall(VecRestoreArrayRead(xx,&x));
531   PetscCall(VecRestoreArrayWrite(zz,&zarray));
532   PetscCall(PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt));
533   PetscFunctionReturn(0);
534 }
535 
536 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
537 {
538   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
539   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
540   const PetscScalar *x,*xb;
541   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
542   const MatScalar   *v;
543   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
544   PetscBool         usecprow=a->compressedrow.use;
545 
546   PetscFunctionBegin;
547   PetscCall(VecGetArrayRead(xx,&x));
548   PetscCall(VecGetArrayWrite(zz,&zarray));
549 
550   idx = a->j;
551   v   = a->a;
552   if (usecprow) {
553     mbs  = a->compressedrow.nrows;
554     ii   = a->compressedrow.i;
555     ridx = a->compressedrow.rindex;
556     PetscCall(PetscArrayzero(zarray,6*a->mbs));
557   } else {
558     mbs = a->mbs;
559     ii  = a->i;
560     z   = zarray;
561   }
562 
563   for (i=0; i<mbs; i++) {
564     n  = ii[1] - ii[0];
565     ii++;
566     sum1 = 0.0;
567     sum2 = 0.0;
568     sum3 = 0.0;
569     sum4 = 0.0;
570     sum5 = 0.0;
571     sum6 = 0.0;
572 
573     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
574     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
575     for (j=0; j<n; j++) {
576       xb    = x + 6*(*idx++);
577       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
578       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
579       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
580       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
581       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
582       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
583       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
584       v    += 36;
585     }
586     if (usecprow) z = zarray + 6*ridx[i];
587     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
588     if (!usecprow) z += 6;
589   }
590 
591   PetscCall(VecRestoreArrayRead(xx,&x));
592   PetscCall(VecRestoreArrayWrite(zz,&zarray));
593   PetscCall(PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt));
594   PetscFunctionReturn(0);
595 }
596 
597 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
598 {
599   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
600   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
601   const PetscScalar *x,*xb;
602   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
603   const MatScalar   *v;
604   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
605   PetscBool         usecprow=a->compressedrow.use;
606 
607   PetscFunctionBegin;
608   PetscCall(VecGetArrayRead(xx,&x));
609   PetscCall(VecGetArrayWrite(zz,&zarray));
610 
611   idx = a->j;
612   v   = a->a;
613   if (usecprow) {
614     mbs  = a->compressedrow.nrows;
615     ii   = a->compressedrow.i;
616     ridx = a->compressedrow.rindex;
617     PetscCall(PetscArrayzero(zarray,7*a->mbs));
618   } else {
619     mbs = a->mbs;
620     ii  = a->i;
621     z   = zarray;
622   }
623 
624   for (i=0; i<mbs; i++) {
625     n  = ii[1] - ii[0];
626     ii++;
627     sum1 = 0.0;
628     sum2 = 0.0;
629     sum3 = 0.0;
630     sum4 = 0.0;
631     sum5 = 0.0;
632     sum6 = 0.0;
633     sum7 = 0.0;
634 
635     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
636     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
637     for (j=0; j<n; j++) {
638       xb    = x + 7*(*idx++);
639       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
640       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
641       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
642       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
643       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
644       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
645       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
646       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
647       v    += 49;
648     }
649     if (usecprow) z = zarray + 7*ridx[i];
650     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
651     if (!usecprow) z += 7;
652   }
653 
654   PetscCall(VecRestoreArrayRead(xx,&x));
655   PetscCall(VecRestoreArrayWrite(zz,&zarray));
656   PetscCall(PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt));
657   PetscFunctionReturn(0);
658 }
659 
660 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
661 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
662 {
663   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
664   PetscScalar       *z = NULL,*work,*workt,*zarray;
665   const PetscScalar *x,*xb;
666   const MatScalar   *v;
667   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
668   const PetscInt    *idx,*ii,*ridx=NULL;
669   PetscInt          k;
670   PetscBool         usecprow=a->compressedrow.use;
671 
672   __m256d a0,a1,a2,a3,a4,a5;
673   __m256d w0,w1,w2,w3;
674   __m256d z0,z1,z2;
675   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
676 
677   PetscFunctionBegin;
678   PetscCall(VecGetArrayRead(xx,&x));
679   PetscCall(VecGetArrayWrite(zz,&zarray));
680 
681   idx = a->j;
682   v   = a->a;
683   if (usecprow) {
684     mbs  = a->compressedrow.nrows;
685     ii   = a->compressedrow.i;
686     ridx = a->compressedrow.rindex;
687     PetscCall(PetscArrayzero(zarray,bs*a->mbs));
688   } else {
689     mbs = a->mbs;
690     ii  = a->i;
691     z   = zarray;
692   }
693 
694   if (!a->mult_work) {
695     k    = PetscMax(A->rmap->n,A->cmap->n);
696     PetscCall(PetscMalloc1(k+1,&a->mult_work));
697   }
698 
699   work = a->mult_work;
700   for (i=0; i<mbs; i++) {
701     n           = ii[1] - ii[0]; ii++;
702     workt       = work;
703     for (j=0; j<n; j++) {
704       xb = x + bs*(*idx++);
705       for (k=0; k<bs; k++) workt[k] = xb[k];
706       workt += bs;
707     }
708     if (usecprow) z = zarray + bs*ridx[i];
709 
710     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
711 
712     for (j=0; j<n; j++) {
713       /* first column of a */
714       w0 = _mm256_set1_pd(work[j*9  ]);
715       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
716       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
717       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
718 
719       /* second column of a */
720       w1 = _mm256_set1_pd(work[j*9+ 1]);
721       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
722       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
723       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
724 
725       /* third column of a */
726       w2 = _mm256_set1_pd(work[j*9 +2]);
727       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
728       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
729       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
730 
731       /* fourth column of a */
732       w3 = _mm256_set1_pd(work[j*9+ 3]);
733       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
734       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
735       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
736 
737       /* fifth column of a */
738       w0 = _mm256_set1_pd(work[j*9+ 4]);
739       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
740       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
741       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
742 
743       /* sixth column of a */
744       w1 = _mm256_set1_pd(work[j*9+ 5]);
745       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
746       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
747       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
748 
749       /* seventh column of a */
750       w2 = _mm256_set1_pd(work[j*9+ 6]);
751       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
752       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
753       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
754 
755       /* eighth column of a */
756       w3 = _mm256_set1_pd(work[j*9+ 7]);
757       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
758       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
759       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
760 
761       /* ninth column of a */
762       w0 = _mm256_set1_pd(work[j*9+ 8]);
763       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
764       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
765       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
766     }
767 
768     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
769 
770     v += n*bs2;
771     if (!usecprow) z += bs;
772   }
773   PetscCall(VecRestoreArrayRead(xx,&x));
774   PetscCall(VecRestoreArrayWrite(zz,&zarray));
775   PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt));
776   PetscFunctionReturn(0);
777 }
778 #endif
779 
780 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
781 {
782   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
783   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
784   const PetscScalar *x,*xb;
785   PetscScalar       *zarray,xv;
786   const MatScalar   *v;
787   const PetscInt    *ii,*ij=a->j,*idx;
788   PetscInt          mbs,i,j,k,n,*ridx=NULL;
789   PetscBool         usecprow=a->compressedrow.use;
790 
791   PetscFunctionBegin;
792   PetscCall(VecGetArrayRead(xx,&x));
793   PetscCall(VecGetArrayWrite(zz,&zarray));
794 
795   v = a->a;
796   if (usecprow) {
797     mbs  = a->compressedrow.nrows;
798     ii   = a->compressedrow.i;
799     ridx = a->compressedrow.rindex;
800     PetscCall(PetscArrayzero(zarray,11*a->mbs));
801   } else {
802     mbs = a->mbs;
803     ii  = a->i;
804     z   = zarray;
805   }
806 
807   for (i=0; i<mbs; i++) {
808     n    = ii[i+1] - ii[i];
809     idx  = ij + ii[i];
810     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
811     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
812 
813     for (j=0; j<n; j++) {
814       xb = x + 11*(idx[j]);
815 
816       for (k=0; k<11; k++) {
817         xv     =  xb[k];
818         sum1  += v[0]*xv;
819         sum2  += v[1]*xv;
820         sum3  += v[2]*xv;
821         sum4  += v[3]*xv;
822         sum5  += v[4]*xv;
823         sum6  += v[5]*xv;
824         sum7  += v[6]*xv;
825         sum8  += v[7]*xv;
826         sum9  += v[8]*xv;
827         sum10 += v[9]*xv;
828         sum11 += v[10]*xv;
829         v     += 11;
830       }
831     }
832     if (usecprow) z = zarray + 11*ridx[i];
833     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
834     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
835 
836     if (!usecprow) z += 11;
837   }
838 
839   PetscCall(VecRestoreArrayRead(xx,&x));
840   PetscCall(VecRestoreArrayWrite(zz,&zarray));
841   PetscCall(PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt));
842   PetscFunctionReturn(0);
843 }
844 
845 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
846 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
847 {
848   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
849   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
850   const PetscScalar *x,*xb;
851   PetscScalar       *zarray,xv;
852   const MatScalar   *v;
853   const PetscInt    *ii,*ij=a->j,*idx;
854   PetscInt          mbs,i,j,k,n,*ridx=NULL;
855   PetscBool         usecprow=a->compressedrow.use;
856 
857   PetscFunctionBegin;
858   PetscCall(VecGetArrayRead(xx,&x));
859   PetscCall(VecGetArrayWrite(zz,&zarray));
860 
861   v = a->a;
862   if (usecprow) {
863     mbs  = a->compressedrow.nrows;
864     ii   = a->compressedrow.i;
865     ridx = a->compressedrow.rindex;
866     PetscCall(PetscArrayzero(zarray,12*a->mbs));
867   } else {
868     mbs = a->mbs;
869     ii  = a->i;
870     z   = zarray;
871   }
872 
873   for (i=0; i<mbs; i++) {
874     n    = ii[i+1] - ii[i];
875     idx  = ij + ii[i];
876     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
877     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;
878 
879     for (j=0; j<n; j++) {
880       xb = x + 12*(idx[j]);
881 
882       for (k=0; k<12; k++) {
883         xv     =  xb[k];
884         sum1  += v[0]*xv;
885         sum2  += v[1]*xv;
886         sum3  += v[2]*xv;
887         sum4  += v[3]*xv;
888         sum5  += v[4]*xv;
889         sum6  += v[5]*xv;
890         sum7  += v[6]*xv;
891         sum8  += v[7]*xv;
892         sum9  += v[8]*xv;
893         sum10 += v[9]*xv;
894         sum11 += v[10]*xv;
895         sum12 += v[11]*xv;
896         v     += 12;
897       }
898     }
899     if (usecprow) z = zarray + 12*ridx[i];
900     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
901     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
902     if (!usecprow) z += 12;
903   }
904   PetscCall(VecRestoreArrayRead(xx,&x));
905   PetscCall(VecRestoreArrayWrite(zz,&zarray));
906   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
907   PetscFunctionReturn(0);
908 }
909 
910 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
911 {
912   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
913   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
914   const PetscScalar *x,*xb;
915   PetscScalar       *zarray,*yarray,xv;
916   const MatScalar   *v;
917   const PetscInt    *ii,*ij=a->j,*idx;
918   PetscInt          mbs = a->mbs,i,j,k,n,*ridx=NULL;
919   PetscBool         usecprow=a->compressedrow.use;
920 
921   PetscFunctionBegin;
922   PetscCall(VecGetArrayRead(xx,&x));
923   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
924 
925   v = a->a;
926   if (usecprow) {
927    if (zz != yy) {
928      PetscCall(PetscArraycpy(zarray,yarray,12*mbs));
929     }
930     mbs  = a->compressedrow.nrows;
931     ii   = a->compressedrow.i;
932     ridx = a->compressedrow.rindex;
933   } else {
934     ii  = a->i;
935     y   = yarray;
936     z   = zarray;
937   }
938 
939   for (i=0; i<mbs; i++) {
940     n    = ii[i+1] - ii[i];
941     idx  = ij + ii[i];
942 
943     if (usecprow) {
944       y = yarray + 12*ridx[i];
945       z = zarray + 12*ridx[i];
946     }
947     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
948     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
949 
950     for (j=0; j<n; j++) {
951       xb = x + 12*(idx[j]);
952 
953       for (k=0; k<12; k++) {
954         xv     =  xb[k];
955         sum1  += v[0]*xv;
956         sum2  += v[1]*xv;
957         sum3  += v[2]*xv;
958         sum4  += v[3]*xv;
959         sum5  += v[4]*xv;
960         sum6  += v[5]*xv;
961         sum7  += v[6]*xv;
962         sum8  += v[7]*xv;
963         sum9  += v[8]*xv;
964         sum10 += v[9]*xv;
965         sum11 += v[10]*xv;
966         sum12 += v[11]*xv;
967         v     += 12;
968       }
969     }
970 
971     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
972     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
973     if (!usecprow) {
974       y += 12;
975       z += 12;
976     }
977   }
978   PetscCall(VecRestoreArrayRead(xx,&x));
979   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
980   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
981   PetscFunctionReturn(0);
982 }
983 
984 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
985 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
986 {
987   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
988   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
989   const PetscScalar *x,*xb;
990   PetscScalar       x1,x2,x3,x4,*zarray;
991   const MatScalar   *v;
992   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
993   PetscInt          mbs,i,j,n;
994   PetscBool         usecprow=a->compressedrow.use;
995 
996   PetscFunctionBegin;
997   PetscCall(VecGetArrayRead(xx,&x));
998   PetscCall(VecGetArrayWrite(zz,&zarray));
999 
1000   v = a->a;
1001   if (usecprow) {
1002     mbs  = a->compressedrow.nrows;
1003     ii   = a->compressedrow.i;
1004     ridx = a->compressedrow.rindex;
1005     PetscCall(PetscArrayzero(zarray,12*a->mbs));
1006   } else {
1007     mbs = a->mbs;
1008     ii  = a->i;
1009     z   = zarray;
1010   }
1011 
1012   for (i=0; i<mbs; i++) {
1013     n    = ii[i+1] - ii[i];
1014     idx  = ij + ii[i];
1015 
1016     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1017     for (j=0; j<n; j++) {
1018       xb = x + 12*(idx[j]);
1019       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1020 
1021       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1022       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1023       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1024       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1025       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1026       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1027       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1028       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1029       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1030       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1031       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1032       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1033       v += 48;
1034 
1035       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1036 
1037       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1038       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1039       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1040       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1041       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1042       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1043       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1044       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1045       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1046       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1047       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1048       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1049       v     += 48;
1050 
1051       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1052       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1053       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1054       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1055       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1056       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1057       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1058       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1059       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1060       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1061       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1062       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1063       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1064       v     += 48;
1065 
1066     }
1067     if (usecprow) z = zarray + 12*ridx[i];
1068     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1069     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1070     if (!usecprow) z += 12;
1071   }
1072   PetscCall(VecRestoreArrayRead(xx,&x));
1073   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1074   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
1075   PetscFunctionReturn(0);
1076 }
1077 
1078 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1079 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1080 {
1081   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1082   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1083   const PetscScalar *x,*xb;
1084   PetscScalar       x1,x2,x3,x4,*zarray,*yarray;
1085   const MatScalar   *v;
1086   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1087   PetscInt          mbs = a->mbs,i,j,n;
1088   PetscBool         usecprow=a->compressedrow.use;
1089 
1090   PetscFunctionBegin;
1091   PetscCall(VecGetArrayRead(xx,&x));
1092   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1093 
1094   v = a->a;
1095   if (usecprow) {
1096     if (zz != yy) {
1097       PetscCall(PetscArraycpy(zarray,yarray,12*mbs));
1098     }
1099     mbs  = a->compressedrow.nrows;
1100     ii   = a->compressedrow.i;
1101     ridx = a->compressedrow.rindex;
1102   } else {
1103     ii  = a->i;
1104     y   = yarray;
1105     z   = zarray;
1106   }
1107 
1108   for (i=0; i<mbs; i++) {
1109     n    = ii[i+1] - ii[i];
1110     idx  = ij + ii[i];
1111 
1112     if (usecprow) {
1113       y = yarray + 12*ridx[i];
1114       z = zarray + 12*ridx[i];
1115     }
1116     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1117     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
1118 
1119     for (j=0; j<n; j++) {
1120       xb = x + 12*(idx[j]);
1121       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1122 
1123       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1124       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1125       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1126       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1127       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1128       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1129       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1130       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1131       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1132       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1133       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1134       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1135       v += 48;
1136 
1137       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1138 
1139       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1140       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1141       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1142       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1143       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1144       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1145       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1146       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1147       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1148       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1149       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1150       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1151       v     += 48;
1152 
1153       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1154       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1155       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1156       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1157       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1158       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1159       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1160       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1161       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1162       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1163       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1164       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1165       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1166       v     += 48;
1167 
1168     }
1169     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1170     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1171     if (!usecprow) {
1172       y += 12;
1173       z += 12;
1174     }
1175   }
1176   PetscCall(VecRestoreArrayRead(xx,&x));
1177   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1178   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
1179   PetscFunctionReturn(0);
1180 }
1181 
1182 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1183 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1184 {
1185   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1186   PetscScalar       *z = NULL,*zarray;
1187   const PetscScalar *x,*work;
1188   const MatScalar   *v = a->a;
1189   PetscInt          mbs,i,j,n;
1190   const PetscInt    *idx = a->j,*ii,*ridx=NULL;
1191   PetscBool         usecprow=a->compressedrow.use;
1192   const PetscInt    bs = 12, bs2 = 144;
1193 
1194   __m256d a0,a1,a2,a3,a4,a5;
1195   __m256d w0,w1,w2,w3;
1196   __m256d z0,z1,z2;
1197 
1198   PetscFunctionBegin;
1199   PetscCall(VecGetArrayRead(xx,&x));
1200   PetscCall(VecGetArrayWrite(zz,&zarray));
1201 
1202   if (usecprow) {
1203     mbs  = a->compressedrow.nrows;
1204     ii   = a->compressedrow.i;
1205     ridx = a->compressedrow.rindex;
1206     PetscCall(PetscArrayzero(zarray,bs*a->mbs));
1207   } else {
1208     mbs = a->mbs;
1209     ii  = a->i;
1210     z   = zarray;
1211   }
1212 
1213   for (i=0; i<mbs; i++) {
1214     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
1215 
1216     n  = ii[1] - ii[0]; ii++;
1217     for (j=0; j<n; j++) {
1218       work = x + bs*(*idx++);
1219 
1220       /* first column of a */
1221       w0 = _mm256_set1_pd(work[0]);
1222       a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1223       a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1224       a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);
1225 
1226       /* second column of a */
1227       w1 = _mm256_set1_pd(work[1]);
1228       a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1229       a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1230       a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);
1231 
1232       /* third column of a */
1233       w2 = _mm256_set1_pd(work[2]);
1234       a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1235       a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1236       a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);
1237 
1238       /* fourth column of a */
1239       w3 = _mm256_set1_pd(work[3]);
1240       a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1241       a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1242       a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);
1243 
1244       /* fifth column of a */
1245       w0 = _mm256_set1_pd(work[4]);
1246       a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1247       a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1248       a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);
1249 
1250       /* sixth column of a */
1251       w1 = _mm256_set1_pd(work[5]);
1252       a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1253       a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1254       a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);
1255 
1256       /* seventh column of a */
1257       w2 = _mm256_set1_pd(work[6]);
1258       a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1259       a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1260       a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);
1261 
1262       /* eighth column of a */
1263       w3 = _mm256_set1_pd(work[7]);
1264       a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1265       a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1266       a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);
1267 
1268       /* ninth column of a */
1269       w0 = _mm256_set1_pd(work[8]);
1270       a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1271       a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1272       a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);
1273 
1274       /* tenth column of a */
1275       w1 = _mm256_set1_pd(work[9]);
1276       a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1277       a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1278       a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);
1279 
1280       /* eleventh column of a */
1281       w2 = _mm256_set1_pd(work[10]);
1282       a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1283       a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1284       a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);
1285 
1286       /* twelveth column of a */
1287       w3 = _mm256_set1_pd(work[11]);
1288       a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1289       a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1290       a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);
1291 
1292       v += bs2;
1293     }
1294     if (usecprow) z = zarray + bs*ridx[i];
1295     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1296     if (!usecprow) z += bs;
1297   }
1298   PetscCall(VecRestoreArrayRead(xx,&x));
1299   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1300   PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt));
1301   PetscFunctionReturn(0);
1302 }
1303 #endif
1304 
1305 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1306 /* Default MatMult for block size 15 */
1307 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1308 {
1309   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1310   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1311   const PetscScalar *x,*xb;
1312   PetscScalar       *zarray,xv;
1313   const MatScalar   *v;
1314   const PetscInt    *ii,*ij=a->j,*idx;
1315   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1316   PetscBool         usecprow=a->compressedrow.use;
1317 
1318   PetscFunctionBegin;
1319   PetscCall(VecGetArrayRead(xx,&x));
1320   PetscCall(VecGetArrayWrite(zz,&zarray));
1321 
1322   v = a->a;
1323   if (usecprow) {
1324     mbs  = a->compressedrow.nrows;
1325     ii   = a->compressedrow.i;
1326     ridx = a->compressedrow.rindex;
1327     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1328   } else {
1329     mbs = a->mbs;
1330     ii  = a->i;
1331     z   = zarray;
1332   }
1333 
1334   for (i=0; i<mbs; i++) {
1335     n    = ii[i+1] - ii[i];
1336     idx  = ij + ii[i];
1337     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1338     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1339 
1340     for (j=0; j<n; j++) {
1341       xb = x + 15*(idx[j]);
1342 
1343       for (k=0; k<15; k++) {
1344         xv     =  xb[k];
1345         sum1  += v[0]*xv;
1346         sum2  += v[1]*xv;
1347         sum3  += v[2]*xv;
1348         sum4  += v[3]*xv;
1349         sum5  += v[4]*xv;
1350         sum6  += v[5]*xv;
1351         sum7  += v[6]*xv;
1352         sum8  += v[7]*xv;
1353         sum9  += v[8]*xv;
1354         sum10 += v[9]*xv;
1355         sum11 += v[10]*xv;
1356         sum12 += v[11]*xv;
1357         sum13 += v[12]*xv;
1358         sum14 += v[13]*xv;
1359         sum15 += v[14]*xv;
1360         v     += 15;
1361       }
1362     }
1363     if (usecprow) z = zarray + 15*ridx[i];
1364     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1365     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1366 
1367     if (!usecprow) z += 15;
1368   }
1369 
1370   PetscCall(VecRestoreArrayRead(xx,&x));
1371   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1372   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1373   PetscFunctionReturn(0);
1374 }
1375 
1376 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1377 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1378 {
1379   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1380   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1381   const PetscScalar *x,*xb;
1382   PetscScalar       x1,x2,x3,x4,*zarray;
1383   const MatScalar   *v;
1384   const PetscInt    *ii,*ij=a->j,*idx;
1385   PetscInt          mbs,i,j,n,*ridx=NULL;
1386   PetscBool         usecprow=a->compressedrow.use;
1387 
1388   PetscFunctionBegin;
1389   PetscCall(VecGetArrayRead(xx,&x));
1390   PetscCall(VecGetArrayWrite(zz,&zarray));
1391 
1392   v = a->a;
1393   if (usecprow) {
1394     mbs  = a->compressedrow.nrows;
1395     ii   = a->compressedrow.i;
1396     ridx = a->compressedrow.rindex;
1397     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1398   } else {
1399     mbs = a->mbs;
1400     ii  = a->i;
1401     z   = zarray;
1402   }
1403 
1404   for (i=0; i<mbs; i++) {
1405     n    = ii[i+1] - ii[i];
1406     idx  = ij + ii[i];
1407     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1408     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1409 
1410     for (j=0; j<n; j++) {
1411       xb = x + 15*(idx[j]);
1412       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1413 
1414       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1415       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1416       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1417       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1418       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1419       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1420       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1421       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1422       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1423       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1424       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1425       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1426       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1427       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1428       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1429 
1430       v += 60;
1431 
1432       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1433 
1434       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1435       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1436       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1437       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1438       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1439       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1440       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1441       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1442       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1443       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1444       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1445       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1446       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1447       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1448       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1449       v     += 60;
1450 
1451       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1452       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1453       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1454       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1455       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1456       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1457       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1458       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1459       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1460       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1461       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1462       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1463       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1464       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1465       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1466       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1467       v     += 60;
1468 
1469       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1470       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1471       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1472       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1473       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1474       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1475       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1476       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1477       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1478       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1479       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1480       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1481       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1482       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1483       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1484       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1485       v     += 45;
1486     }
1487     if (usecprow) z = zarray + 15*ridx[i];
1488     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1489     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1490 
1491     if (!usecprow) z += 15;
1492   }
1493 
1494   PetscCall(VecRestoreArrayRead(xx,&x));
1495   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1496   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1497   PetscFunctionReturn(0);
1498 }
1499 
1500 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1501 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1502 {
1503   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1504   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1505   const PetscScalar *x,*xb;
1506   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1507   const MatScalar   *v;
1508   const PetscInt    *ii,*ij=a->j,*idx;
1509   PetscInt          mbs,i,j,n,*ridx=NULL;
1510   PetscBool         usecprow=a->compressedrow.use;
1511 
1512   PetscFunctionBegin;
1513   PetscCall(VecGetArrayRead(xx,&x));
1514   PetscCall(VecGetArrayWrite(zz,&zarray));
1515 
1516   v = a->a;
1517   if (usecprow) {
1518     mbs  = a->compressedrow.nrows;
1519     ii   = a->compressedrow.i;
1520     ridx = a->compressedrow.rindex;
1521     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1522   } else {
1523     mbs = a->mbs;
1524     ii  = a->i;
1525     z   = zarray;
1526   }
1527 
1528   for (i=0; i<mbs; i++) {
1529     n    = ii[i+1] - ii[i];
1530     idx  = ij + ii[i];
1531     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1532     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1533 
1534     for (j=0; j<n; j++) {
1535       xb = x + 15*(idx[j]);
1536       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1537       x8 = xb[7];
1538 
1539       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1540       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1541       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1542       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1543       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1544       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1545       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1546       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1547       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1548       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1549       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1550       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1551       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1552       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1553       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1554       v     += 120;
1555 
1556       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1557 
1558       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1559       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1560       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1561       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1562       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1563       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1564       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1565       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1566       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1567       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1568       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1569       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1570       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1571       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1572       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1573       v     += 105;
1574     }
1575     if (usecprow) z = zarray + 15*ridx[i];
1576     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1577     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1578 
1579     if (!usecprow) z += 15;
1580   }
1581 
1582   PetscCall(VecRestoreArrayRead(xx,&x));
1583   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1584   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1585   PetscFunctionReturn(0);
1586 }
1587 
1588 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1589 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1590 {
1591   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1592   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1593   const PetscScalar *x,*xb;
1594   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1595   const MatScalar   *v;
1596   const PetscInt    *ii,*ij=a->j,*idx;
1597   PetscInt          mbs,i,j,n,*ridx=NULL;
1598   PetscBool         usecprow=a->compressedrow.use;
1599 
1600   PetscFunctionBegin;
1601   PetscCall(VecGetArrayRead(xx,&x));
1602   PetscCall(VecGetArrayWrite(zz,&zarray));
1603 
1604   v = a->a;
1605   if (usecprow) {
1606     mbs  = a->compressedrow.nrows;
1607     ii   = a->compressedrow.i;
1608     ridx = a->compressedrow.rindex;
1609     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1610   } else {
1611     mbs = a->mbs;
1612     ii  = a->i;
1613     z   = zarray;
1614   }
1615 
1616   for (i=0; i<mbs; i++) {
1617     n    = ii[i+1] - ii[i];
1618     idx  = ij + ii[i];
1619     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1620     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1621 
1622     for (j=0; j<n; j++) {
1623       xb = x + 15*(idx[j]);
1624       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1625       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1626 
1627       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1628       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1629       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1630       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1631       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1632       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1633       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1634       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1635       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1636       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1637       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1638       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1639       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1640       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1641       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1642       v     += 225;
1643     }
1644     if (usecprow) z = zarray + 15*ridx[i];
1645     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1646     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1647 
1648     if (!usecprow) z += 15;
1649   }
1650 
1651   PetscCall(VecRestoreArrayRead(xx,&x));
1652   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1653   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1654   PetscFunctionReturn(0);
1655 }
1656 
1657 /*
1658     This will not work with MatScalar == float because it calls the BLAS
1659 */
1660 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1661 {
1662   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1663   PetscScalar       *z = NULL,*work,*workt,*zarray;
1664   const PetscScalar *x,*xb;
1665   const MatScalar   *v;
1666   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1667   const PetscInt    *idx,*ii,*ridx=NULL;
1668   PetscInt          ncols,k;
1669   PetscBool         usecprow=a->compressedrow.use;
1670 
1671   PetscFunctionBegin;
1672   PetscCall(VecGetArrayRead(xx,&x));
1673   PetscCall(VecGetArrayWrite(zz,&zarray));
1674 
1675   idx = a->j;
1676   v   = a->a;
1677   if (usecprow) {
1678     mbs  = a->compressedrow.nrows;
1679     ii   = a->compressedrow.i;
1680     ridx = a->compressedrow.rindex;
1681     PetscCall(PetscArrayzero(zarray,bs*a->mbs));
1682   } else {
1683     mbs = a->mbs;
1684     ii  = a->i;
1685     z   = zarray;
1686   }
1687 
1688   if (!a->mult_work) {
1689     k    = PetscMax(A->rmap->n,A->cmap->n);
1690     PetscCall(PetscMalloc1(k+1,&a->mult_work));
1691   }
1692   work = a->mult_work;
1693   for (i=0; i<mbs; i++) {
1694     n           = ii[1] - ii[0]; ii++;
1695     ncols       = n*bs;
1696     workt       = work;
1697     for (j=0; j<n; j++) {
1698       xb = x + bs*(*idx++);
1699       for (k=0; k<bs; k++) workt[k] = xb[k];
1700       workt += bs;
1701     }
1702     if (usecprow) z = zarray + bs*ridx[i];
1703     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1704     v += n*bs2;
1705     if (!usecprow) z += bs;
1706   }
1707   PetscCall(VecRestoreArrayRead(xx,&x));
1708   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1709   PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt));
1710   PetscFunctionReturn(0);
1711 }
1712 
1713 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1714 {
1715   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1716   const PetscScalar *x;
1717   PetscScalar       *y,*z,sum;
1718   const MatScalar   *v;
1719   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1720   const PetscInt    *idx,*ii;
1721   PetscBool         usecprow=a->compressedrow.use;
1722 
1723   PetscFunctionBegin;
1724   PetscCall(VecGetArrayRead(xx,&x));
1725   PetscCall(VecGetArrayPair(yy,zz,&y,&z));
1726 
1727   idx = a->j;
1728   v   = a->a;
1729   if (usecprow) {
1730     if (zz != yy) {
1731       PetscCall(PetscArraycpy(z,y,mbs));
1732     }
1733     mbs  = a->compressedrow.nrows;
1734     ii   = a->compressedrow.i;
1735     ridx = a->compressedrow.rindex;
1736   } else {
1737     ii = a->i;
1738   }
1739 
1740   for (i=0; i<mbs; i++) {
1741     n = ii[1] - ii[0];
1742     ii++;
1743     if (!usecprow) {
1744       sum = y[i];
1745     } else {
1746       sum = y[ridx[i]];
1747     }
1748     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1749     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1750     PetscSparseDensePlusDot(sum,x,v,idx,n);
1751     v   += n;
1752     idx += n;
1753     if (usecprow) {
1754       z[ridx[i]] = sum;
1755     } else {
1756       z[i] = sum;
1757     }
1758   }
1759   PetscCall(VecRestoreArrayRead(xx,&x));
1760   PetscCall(VecRestoreArrayPair(yy,zz,&y,&z));
1761   PetscCall(PetscLogFlops(2.0*a->nz));
1762   PetscFunctionReturn(0);
1763 }
1764 
1765 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1766 {
1767   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1768   PetscScalar       *y = NULL,*z = NULL,sum1,sum2;
1769   const PetscScalar *x,*xb;
1770   PetscScalar       x1,x2,*yarray,*zarray;
1771   const MatScalar   *v;
1772   PetscInt          mbs = a->mbs,i,n,j;
1773   const PetscInt    *idx,*ii,*ridx = NULL;
1774   PetscBool         usecprow = a->compressedrow.use;
1775 
1776   PetscFunctionBegin;
1777   PetscCall(VecGetArrayRead(xx,&x));
1778   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1779 
1780   idx = a->j;
1781   v   = a->a;
1782   if (usecprow) {
1783     if (zz != yy) {
1784       PetscCall(PetscArraycpy(zarray,yarray,2*mbs));
1785     }
1786     mbs  = a->compressedrow.nrows;
1787     ii   = a->compressedrow.i;
1788     ridx = a->compressedrow.rindex;
1789   } else {
1790     ii = a->i;
1791     y  = yarray;
1792     z  = zarray;
1793   }
1794 
1795   for (i=0; i<mbs; i++) {
1796     n = ii[1] - ii[0]; ii++;
1797     if (usecprow) {
1798       z = zarray + 2*ridx[i];
1799       y = yarray + 2*ridx[i];
1800     }
1801     sum1 = y[0]; sum2 = y[1];
1802     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1803     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1804     for (j=0; j<n; j++) {
1805       xb = x + 2*(*idx++);
1806       x1 = xb[0];
1807       x2 = xb[1];
1808 
1809       sum1 += v[0]*x1 + v[2]*x2;
1810       sum2 += v[1]*x1 + v[3]*x2;
1811       v    += 4;
1812     }
1813     z[0] = sum1; z[1] = sum2;
1814     if (!usecprow) {
1815       z += 2; y += 2;
1816     }
1817   }
1818   PetscCall(VecRestoreArrayRead(xx,&x));
1819   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1820   PetscCall(PetscLogFlops(4.0*a->nz));
1821   PetscFunctionReturn(0);
1822 }
1823 
1824 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1825 {
1826   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1827   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1828   const PetscScalar *x,*xb;
1829   const MatScalar   *v;
1830   PetscInt          mbs = a->mbs,i,j,n;
1831   const PetscInt    *idx,*ii,*ridx = NULL;
1832   PetscBool         usecprow = a->compressedrow.use;
1833 
1834   PetscFunctionBegin;
1835   PetscCall(VecGetArrayRead(xx,&x));
1836   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1837 
1838   idx = a->j;
1839   v   = a->a;
1840   if (usecprow) {
1841     if (zz != yy) {
1842       PetscCall(PetscArraycpy(zarray,yarray,3*mbs));
1843     }
1844     mbs  = a->compressedrow.nrows;
1845     ii   = a->compressedrow.i;
1846     ridx = a->compressedrow.rindex;
1847   } else {
1848     ii = a->i;
1849     y  = yarray;
1850     z  = zarray;
1851   }
1852 
1853   for (i=0; i<mbs; i++) {
1854     n = ii[1] - ii[0]; ii++;
1855     if (usecprow) {
1856       z = zarray + 3*ridx[i];
1857       y = yarray + 3*ridx[i];
1858     }
1859     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1860     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1861     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1862     for (j=0; j<n; j++) {
1863       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1864       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1865       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1866       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1867       v    += 9;
1868     }
1869     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1870     if (!usecprow) {
1871       z += 3; y += 3;
1872     }
1873   }
1874   PetscCall(VecRestoreArrayRead(xx,&x));
1875   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1876   PetscCall(PetscLogFlops(18.0*a->nz));
1877   PetscFunctionReturn(0);
1878 }
1879 
1880 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1881 {
1882   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1883   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1884   const PetscScalar *x,*xb;
1885   const MatScalar   *v;
1886   PetscInt          mbs = a->mbs,i,j,n;
1887   const PetscInt    *idx,*ii,*ridx=NULL;
1888   PetscBool         usecprow=a->compressedrow.use;
1889 
1890   PetscFunctionBegin;
1891   PetscCall(VecGetArrayRead(xx,&x));
1892   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1893 
1894   idx = a->j;
1895   v   = a->a;
1896   if (usecprow) {
1897     if (zz != yy) {
1898       PetscCall(PetscArraycpy(zarray,yarray,4*mbs));
1899     }
1900     mbs  = a->compressedrow.nrows;
1901     ii   = a->compressedrow.i;
1902     ridx = a->compressedrow.rindex;
1903   } else {
1904     ii = a->i;
1905     y  = yarray;
1906     z  = zarray;
1907   }
1908 
1909   for (i=0; i<mbs; i++) {
1910     n = ii[1] - ii[0]; ii++;
1911     if (usecprow) {
1912       z = zarray + 4*ridx[i];
1913       y = yarray + 4*ridx[i];
1914     }
1915     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1916     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1917     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1918     for (j=0; j<n; j++) {
1919       xb    = x + 4*(*idx++);
1920       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1921       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1922       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1923       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1924       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1925       v    += 16;
1926     }
1927     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1928     if (!usecprow) {
1929       z += 4; y += 4;
1930     }
1931   }
1932   PetscCall(VecRestoreArrayRead(xx,&x));
1933   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1934   PetscCall(PetscLogFlops(32.0*a->nz));
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1939 {
1940   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1941   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1942   const PetscScalar *x,*xb;
1943   PetscScalar       *yarray,*zarray;
1944   const MatScalar   *v;
1945   PetscInt          mbs = a->mbs,i,j,n;
1946   const PetscInt    *idx,*ii,*ridx = NULL;
1947   PetscBool         usecprow=a->compressedrow.use;
1948 
1949   PetscFunctionBegin;
1950   PetscCall(VecGetArrayRead(xx,&x));
1951   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1952 
1953   idx = a->j;
1954   v   = a->a;
1955   if (usecprow) {
1956     if (zz != yy) {
1957       PetscCall(PetscArraycpy(zarray,yarray,5*mbs));
1958     }
1959     mbs  = a->compressedrow.nrows;
1960     ii   = a->compressedrow.i;
1961     ridx = a->compressedrow.rindex;
1962   } else {
1963     ii = a->i;
1964     y  = yarray;
1965     z  = zarray;
1966   }
1967 
1968   for (i=0; i<mbs; i++) {
1969     n = ii[1] - ii[0]; ii++;
1970     if (usecprow) {
1971       z = zarray + 5*ridx[i];
1972       y = yarray + 5*ridx[i];
1973     }
1974     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1975     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1976     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1977     for (j=0; j<n; j++) {
1978       xb    = x + 5*(*idx++);
1979       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1980       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1981       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1982       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1983       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1984       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1985       v    += 25;
1986     }
1987     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1988     if (!usecprow) {
1989       z += 5; y += 5;
1990     }
1991   }
1992   PetscCall(VecRestoreArrayRead(xx,&x));
1993   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1994   PetscCall(PetscLogFlops(50.0*a->nz));
1995   PetscFunctionReturn(0);
1996 }
1997 
1998 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1999 {
2000   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2001   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
2002   const PetscScalar *x,*xb;
2003   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
2004   const MatScalar   *v;
2005   PetscInt          mbs = a->mbs,i,j,n;
2006   const PetscInt    *idx,*ii,*ridx=NULL;
2007   PetscBool         usecprow=a->compressedrow.use;
2008 
2009   PetscFunctionBegin;
2010   PetscCall(VecGetArrayRead(xx,&x));
2011   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
2012 
2013   idx = a->j;
2014   v   = a->a;
2015   if (usecprow) {
2016     if (zz != yy) {
2017       PetscCall(PetscArraycpy(zarray,yarray,6*mbs));
2018     }
2019     mbs  = a->compressedrow.nrows;
2020     ii   = a->compressedrow.i;
2021     ridx = a->compressedrow.rindex;
2022   } else {
2023     ii = a->i;
2024     y  = yarray;
2025     z  = zarray;
2026   }
2027 
2028   for (i=0; i<mbs; i++) {
2029     n = ii[1] - ii[0]; ii++;
2030     if (usecprow) {
2031       z = zarray + 6*ridx[i];
2032       y = yarray + 6*ridx[i];
2033     }
2034     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2035     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2036     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2037     for (j=0; j<n; j++) {
2038       xb    = x + 6*(*idx++);
2039       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2040       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2041       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2042       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2043       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2044       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2045       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2046       v    += 36;
2047     }
2048     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2049     if (!usecprow) {
2050       z += 6; y += 6;
2051     }
2052   }
2053   PetscCall(VecRestoreArrayRead(xx,&x));
2054   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
2055   PetscCall(PetscLogFlops(72.0*a->nz));
2056   PetscFunctionReturn(0);
2057 }
2058 
2059 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2060 {
2061   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2062   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2063   const PetscScalar *x,*xb;
2064   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2065   const MatScalar   *v;
2066   PetscInt          mbs = a->mbs,i,j,n;
2067   const PetscInt    *idx,*ii,*ridx = NULL;
2068   PetscBool         usecprow=a->compressedrow.use;
2069 
2070   PetscFunctionBegin;
2071   PetscCall(VecGetArrayRead(xx,&x));
2072   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
2073 
2074   idx = a->j;
2075   v   = a->a;
2076   if (usecprow) {
2077     if (zz != yy) {
2078       PetscCall(PetscArraycpy(zarray,yarray,7*mbs));
2079     }
2080     mbs  = a->compressedrow.nrows;
2081     ii   = a->compressedrow.i;
2082     ridx = a->compressedrow.rindex;
2083   } else {
2084     ii = a->i;
2085     y  = yarray;
2086     z  = zarray;
2087   }
2088 
2089   for (i=0; i<mbs; i++) {
2090     n = ii[1] - ii[0]; ii++;
2091     if (usecprow) {
2092       z = zarray + 7*ridx[i];
2093       y = yarray + 7*ridx[i];
2094     }
2095     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2096     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2097     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2098     for (j=0; j<n; j++) {
2099       xb    = x + 7*(*idx++);
2100       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2101       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2102       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2103       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2104       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2105       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2106       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2107       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2108       v    += 49;
2109     }
2110     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2111     if (!usecprow) {
2112       z += 7; y += 7;
2113     }
2114   }
2115   PetscCall(VecRestoreArrayRead(xx,&x));
2116   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
2117   PetscCall(PetscLogFlops(98.0*a->nz));
2118   PetscFunctionReturn(0);
2119 }
2120 
2121 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2122 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2123 {
2124   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2125   PetscScalar       *z = NULL,*work,*workt,*zarray;
2126   const PetscScalar *x,*xb;
2127   const MatScalar   *v;
2128   PetscInt          mbs,i,j,n;
2129   PetscInt          k;
2130   PetscBool         usecprow=a->compressedrow.use;
2131   const PetscInt    *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;
2132 
2133   __m256d a0,a1,a2,a3,a4,a5;
2134   __m256d w0,w1,w2,w3;
2135   __m256d z0,z1,z2;
2136   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
2137 
2138   PetscFunctionBegin;
2139   PetscCall(VecCopy(yy,zz));
2140   PetscCall(VecGetArrayRead(xx,&x));
2141   PetscCall(VecGetArray(zz,&zarray));
2142 
2143   idx = a->j;
2144   v   = a->a;
2145   if (usecprow) {
2146     mbs  = a->compressedrow.nrows;
2147     ii   = a->compressedrow.i;
2148     ridx = a->compressedrow.rindex;
2149   } else {
2150     mbs = a->mbs;
2151     ii  = a->i;
2152     z   = zarray;
2153   }
2154 
2155   if (!a->mult_work) {
2156     k    = PetscMax(A->rmap->n,A->cmap->n);
2157     PetscCall(PetscMalloc1(k+1,&a->mult_work));
2158   }
2159 
2160   work = a->mult_work;
2161   for (i=0; i<mbs; i++) {
2162     n           = ii[1] - ii[0]; ii++;
2163     workt       = work;
2164     for (j=0; j<n; j++) {
2165       xb = x + bs*(*idx++);
2166       for (k=0; k<bs; k++) workt[k] = xb[k];
2167       workt += bs;
2168     }
2169     if (usecprow) z = zarray + bs*ridx[i];
2170 
2171     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
2172 
2173     for (j=0; j<n; j++) {
2174       /* first column of a */
2175       w0 = _mm256_set1_pd(work[j*9  ]);
2176       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2177       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2178       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
2179 
2180       /* second column of a */
2181       w1 = _mm256_set1_pd(work[j*9+ 1]);
2182       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2183       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2184       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2185 
2186       /* third column of a */
2187       w2 = _mm256_set1_pd(work[j*9+ 2]);
2188       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2189       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2190       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
2191 
2192       /* fourth column of a */
2193       w3 = _mm256_set1_pd(work[j*9+ 3]);
2194       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2195       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2196       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
2197 
2198       /* fifth column of a */
2199       w0 = _mm256_set1_pd(work[j*9+ 4]);
2200       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2201       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2202       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
2203 
2204       /* sixth column of a */
2205       w1 = _mm256_set1_pd(work[j*9+ 5]);
2206       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2207       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2208       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2209 
2210       /* seventh column of a */
2211       w2 = _mm256_set1_pd(work[j*9+ 6]);
2212       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2213       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2214       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
2215 
2216       /* eighth column of a */
2217       w3 = _mm256_set1_pd(work[j*9+ 7]);
2218       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2219       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2220       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
2221 
2222       /* ninth column of a */
2223       w0 = _mm256_set1_pd(work[j*9+ 8]);
2224       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2225       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2226       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2227     }
2228 
2229     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
2230 
2231     v += n*bs2;
2232     if (!usecprow) z += bs;
2233   }
2234   PetscCall(VecRestoreArrayRead(xx,&x));
2235   PetscCall(VecRestoreArray(zz,&zarray));
2236   PetscCall(PetscLogFlops(162.0*a->nz));
2237   PetscFunctionReturn(0);
2238 }
2239 #endif
2240 
2241 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2242 {
2243   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2244   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2245   const PetscScalar *x,*xb;
2246   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2247   const MatScalar   *v;
2248   PetscInt          mbs = a->mbs,i,j,n;
2249   const PetscInt    *idx,*ii,*ridx = NULL;
2250   PetscBool         usecprow=a->compressedrow.use;
2251 
2252   PetscFunctionBegin;
2253   PetscCall(VecGetArrayRead(xx,&x));
2254   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
2255 
2256   idx = a->j;
2257   v   = a->a;
2258   if (usecprow) {
2259     if (zz != yy) {
2260       PetscCall(PetscArraycpy(zarray,yarray,7*mbs));
2261     }
2262     mbs  = a->compressedrow.nrows;
2263     ii   = a->compressedrow.i;
2264     ridx = a->compressedrow.rindex;
2265   } else {
2266     ii = a->i;
2267     y  = yarray;
2268     z  = zarray;
2269   }
2270 
2271   for (i=0; i<mbs; i++) {
2272     n = ii[1] - ii[0]; ii++;
2273     if (usecprow) {
2274       z = zarray + 11*ridx[i];
2275       y = yarray + 11*ridx[i];
2276     }
2277     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2278     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2279     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2280     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2281     for (j=0; j<n; j++) {
2282       xb    = x + 11*(*idx++);
2283       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2284       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
2285       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
2286       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
2287       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
2288       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
2289       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
2290       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
2291       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
2292       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
2293       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
2294       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
2295       v    += 121;
2296     }
2297     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2298     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
2299     if (!usecprow) {
2300       z += 11; y += 11;
2301     }
2302   }
2303   PetscCall(VecRestoreArrayRead(xx,&x));
2304   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
2305   PetscCall(PetscLogFlops(242.0*a->nz));
2306   PetscFunctionReturn(0);
2307 }
2308 
2309 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2310 {
2311   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2312   PetscScalar       *z = NULL,*work,*workt,*zarray;
2313   const PetscScalar *x,*xb;
2314   const MatScalar   *v;
2315   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2316   PetscInt          ncols,k;
2317   const PetscInt    *ridx = NULL,*idx,*ii;
2318   PetscBool         usecprow = a->compressedrow.use;
2319 
2320   PetscFunctionBegin;
2321   PetscCall(VecCopy(yy,zz));
2322   PetscCall(VecGetArrayRead(xx,&x));
2323   PetscCall(VecGetArray(zz,&zarray));
2324 
2325   idx = a->j;
2326   v   = a->a;
2327   if (usecprow) {
2328     mbs  = a->compressedrow.nrows;
2329     ii   = a->compressedrow.i;
2330     ridx = a->compressedrow.rindex;
2331   } else {
2332     mbs = a->mbs;
2333     ii  = a->i;
2334     z   = zarray;
2335   }
2336 
2337   if (!a->mult_work) {
2338     k    = PetscMax(A->rmap->n,A->cmap->n);
2339     PetscCall(PetscMalloc1(k+1,&a->mult_work));
2340   }
2341   work = a->mult_work;
2342   for (i=0; i<mbs; i++) {
2343     n     = ii[1] - ii[0]; ii++;
2344     ncols = n*bs;
2345     workt = work;
2346     for (j=0; j<n; j++) {
2347       xb = x + bs*(*idx++);
2348       for (k=0; k<bs; k++) workt[k] = xb[k];
2349       workt += bs;
2350     }
2351     if (usecprow) z = zarray + bs*ridx[i];
2352     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2353     v += n*bs2;
2354     if (!usecprow) z += bs;
2355   }
2356   PetscCall(VecRestoreArrayRead(xx,&x));
2357   PetscCall(VecRestoreArray(zz,&zarray));
2358   PetscCall(PetscLogFlops(2.0*a->nz*bs2));
2359   PetscFunctionReturn(0);
2360 }
2361 
2362 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2363 {
2364   PetscScalar    zero = 0.0;
2365 
2366   PetscFunctionBegin;
2367   PetscCall(VecSet(zz,zero));
2368   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz));
2369   PetscFunctionReturn(0);
2370 }
2371 
2372 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2373 {
2374   PetscScalar    zero = 0.0;
2375 
2376   PetscFunctionBegin;
2377   PetscCall(VecSet(zz,zero));
2378   PetscCall(MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz));
2379   PetscFunctionReturn(0);
2380 }
2381 
2382 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2383 {
2384   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2385   PetscScalar       *z,x1,x2,x3,x4,x5;
2386   const PetscScalar *x,*xb = NULL;
2387   const MatScalar   *v;
2388   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
2389   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2390   Mat_CompressedRow cprow = a->compressedrow;
2391   PetscBool         usecprow = cprow.use;
2392 
2393   PetscFunctionBegin;
2394   if (yy != zz) PetscCall(VecCopy(yy,zz));
2395   PetscCall(VecGetArrayRead(xx,&x));
2396   PetscCall(VecGetArray(zz,&z));
2397 
2398   idx = a->j;
2399   v   = a->a;
2400   if (usecprow) {
2401     mbs  = cprow.nrows;
2402     ii   = cprow.i;
2403     ridx = cprow.rindex;
2404   } else {
2405     mbs=a->mbs;
2406     ii = a->i;
2407     xb = x;
2408   }
2409 
2410   switch (bs) {
2411   case 1:
2412     for (i=0; i<mbs; i++) {
2413       if (usecprow) xb = x + ridx[i];
2414       x1 = xb[0];
2415       ib = idx + ii[0];
2416       n  = ii[1] - ii[0]; ii++;
2417       for (j=0; j<n; j++) {
2418         rval     = ib[j];
2419         z[rval] += PetscConj(*v) * x1;
2420         v++;
2421       }
2422       if (!usecprow) xb++;
2423     }
2424     break;
2425   case 2:
2426     for (i=0; i<mbs; i++) {
2427       if (usecprow) xb = x + 2*ridx[i];
2428       x1 = xb[0]; x2 = xb[1];
2429       ib = idx + ii[0];
2430       n  = ii[1] - ii[0]; ii++;
2431       for (j=0; j<n; j++) {
2432         rval       = ib[j]*2;
2433         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2434         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2435         v         += 4;
2436       }
2437       if (!usecprow) xb += 2;
2438     }
2439     break;
2440   case 3:
2441     for (i=0; i<mbs; i++) {
2442       if (usecprow) xb = x + 3*ridx[i];
2443       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2444       ib = idx + ii[0];
2445       n  = ii[1] - ii[0]; ii++;
2446       for (j=0; j<n; j++) {
2447         rval       = ib[j]*3;
2448         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2449         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2450         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2451         v         += 9;
2452       }
2453       if (!usecprow) xb += 3;
2454     }
2455     break;
2456   case 4:
2457     for (i=0; i<mbs; i++) {
2458       if (usecprow) xb = x + 4*ridx[i];
2459       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2460       ib = idx + ii[0];
2461       n  = ii[1] - ii[0]; ii++;
2462       for (j=0; j<n; j++) {
2463         rval       = ib[j]*4;
2464         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2465         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2466         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2467         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2468         v         += 16;
2469       }
2470       if (!usecprow) xb += 4;
2471     }
2472     break;
2473   case 5:
2474     for (i=0; i<mbs; i++) {
2475       if (usecprow) xb = x + 5*ridx[i];
2476       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2477       x4 = xb[3]; x5 = xb[4];
2478       ib = idx + ii[0];
2479       n  = ii[1] - ii[0]; ii++;
2480       for (j=0; j<n; j++) {
2481         rval       = ib[j]*5;
2482         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2483         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2484         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2485         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2486         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2487         v         += 25;
2488       }
2489       if (!usecprow) xb += 5;
2490     }
2491     break;
2492   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2493     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2494 #if 0
2495     {
2496       PetscInt          ncols,k,bs2=a->bs2;
2497       PetscScalar       *work,*workt,zb;
2498       const PetscScalar *xtmp;
2499       if (!a->mult_work) {
2500         k    = PetscMax(A->rmap->n,A->cmap->n);
2501         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2502       }
2503       work = a->mult_work;
2504       xtmp = x;
2505       for (i=0; i<mbs; i++) {
2506         n     = ii[1] - ii[0]; ii++;
2507         ncols = n*bs;
2508         PetscCall(PetscArrayzero(work,ncols));
2509         if (usecprow) xtmp = x + bs*ridx[i];
2510         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2511         v += n*bs2;
2512         if (!usecprow) xtmp += bs;
2513         workt = work;
2514         for (j=0; j<n; j++) {
2515           zb = z + bs*(*idx++);
2516           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2517           workt += bs;
2518         }
2519       }
2520     }
2521 #endif
2522   }
2523   PetscCall(VecRestoreArrayRead(xx,&x));
2524   PetscCall(VecRestoreArray(zz,&z));
2525   PetscCall(PetscLogFlops(2.0*a->nz*a->bs2));
2526   PetscFunctionReturn(0);
2527 }
2528 
2529 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2530 {
2531   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2532   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2533   const PetscScalar *x,*xb = NULL;
2534   const MatScalar   *v;
2535   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2536   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2537   Mat_CompressedRow cprow   = a->compressedrow;
2538   PetscBool         usecprow=cprow.use;
2539 
2540   PetscFunctionBegin;
2541   if (yy != zz) PetscCall(VecCopy(yy,zz));
2542   PetscCall(VecGetArrayRead(xx,&x));
2543   PetscCall(VecGetArray(zz,&z));
2544 
2545   idx = a->j;
2546   v   = a->a;
2547   if (usecprow) {
2548     mbs  = cprow.nrows;
2549     ii   = cprow.i;
2550     ridx = cprow.rindex;
2551   } else {
2552     mbs=a->mbs;
2553     ii = a->i;
2554     xb = x;
2555   }
2556 
2557   switch (bs) {
2558   case 1:
2559     for (i=0; i<mbs; i++) {
2560       if (usecprow) xb = x + ridx[i];
2561       x1 = xb[0];
2562       ib = idx + ii[0];
2563       n  = ii[1] - ii[0]; ii++;
2564       for (j=0; j<n; j++) {
2565         rval     = ib[j];
2566         z[rval] += *v * x1;
2567         v++;
2568       }
2569       if (!usecprow) xb++;
2570     }
2571     break;
2572   case 2:
2573     for (i=0; i<mbs; i++) {
2574       if (usecprow) xb = x + 2*ridx[i];
2575       x1 = xb[0]; x2 = xb[1];
2576       ib = idx + ii[0];
2577       n  = ii[1] - ii[0]; ii++;
2578       for (j=0; j<n; j++) {
2579         rval       = ib[j]*2;
2580         z[rval++] += v[0]*x1 + v[1]*x2;
2581         z[rval++] += v[2]*x1 + v[3]*x2;
2582         v         += 4;
2583       }
2584       if (!usecprow) xb += 2;
2585     }
2586     break;
2587   case 3:
2588     for (i=0; i<mbs; i++) {
2589       if (usecprow) xb = x + 3*ridx[i];
2590       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2591       ib = idx + ii[0];
2592       n  = ii[1] - ii[0]; ii++;
2593       for (j=0; j<n; j++) {
2594         rval       = ib[j]*3;
2595         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2596         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2597         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2598         v         += 9;
2599       }
2600       if (!usecprow) xb += 3;
2601     }
2602     break;
2603   case 4:
2604     for (i=0; i<mbs; i++) {
2605       if (usecprow) xb = x + 4*ridx[i];
2606       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2607       ib = idx + ii[0];
2608       n  = ii[1] - ii[0]; ii++;
2609       for (j=0; j<n; j++) {
2610         rval       = ib[j]*4;
2611         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2612         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2613         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2614         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2615         v         += 16;
2616       }
2617       if (!usecprow) xb += 4;
2618     }
2619     break;
2620   case 5:
2621     for (i=0; i<mbs; i++) {
2622       if (usecprow) xb = x + 5*ridx[i];
2623       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2624       x4 = xb[3]; x5 = xb[4];
2625       ib = idx + ii[0];
2626       n  = ii[1] - ii[0]; ii++;
2627       for (j=0; j<n; j++) {
2628         rval       = ib[j]*5;
2629         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2630         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2631         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2632         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2633         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2634         v         += 25;
2635       }
2636       if (!usecprow) xb += 5;
2637     }
2638     break;
2639   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2640     PetscInt          ncols,k;
2641     PetscScalar       *work,*workt;
2642     const PetscScalar *xtmp;
2643     if (!a->mult_work) {
2644       k    = PetscMax(A->rmap->n,A->cmap->n);
2645       PetscCall(PetscMalloc1(k+1,&a->mult_work));
2646     }
2647     work = a->mult_work;
2648     xtmp = x;
2649     for (i=0; i<mbs; i++) {
2650       n     = ii[1] - ii[0]; ii++;
2651       ncols = n*bs;
2652       PetscCall(PetscArrayzero(work,ncols));
2653       if (usecprow) xtmp = x + bs*ridx[i];
2654       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2655       v += n*bs2;
2656       if (!usecprow) xtmp += bs;
2657       workt = work;
2658       for (j=0; j<n; j++) {
2659         zb = z + bs*(*idx++);
2660         for (k=0; k<bs; k++) zb[k] += workt[k];
2661         workt += bs;
2662       }
2663     }
2664     }
2665   }
2666   PetscCall(VecRestoreArrayRead(xx,&x));
2667   PetscCall(VecRestoreArray(zz,&z));
2668   PetscCall(PetscLogFlops(2.0*a->nz*a->bs2));
2669   PetscFunctionReturn(0);
2670 }
2671 
2672 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2673 {
2674   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2675   PetscInt       totalnz = a->bs2*a->nz;
2676   PetscScalar    oalpha  = alpha;
2677   PetscBLASInt   one = 1,tnz;
2678 
2679   PetscFunctionBegin;
2680   PetscCall(PetscBLASIntCast(totalnz,&tnz));
2681   PetscCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2682   PetscCall(PetscLogFlops(totalnz));
2683   PetscFunctionReturn(0);
2684 }
2685 
2686 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2687 {
2688   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2689   MatScalar      *v  = a->a;
2690   PetscReal      sum = 0.0;
2691   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2692 
2693   PetscFunctionBegin;
2694   if (type == NORM_FROBENIUS) {
2695 #if defined(PETSC_USE_REAL___FP16)
2696     PetscBLASInt one = 1,cnt = bs2*nz;
2697     PetscCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2698 #else
2699     for (i=0; i<bs2*nz; i++) {
2700       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2701     }
2702 #endif
2703     *norm = PetscSqrtReal(sum);
2704     PetscCall(PetscLogFlops(2.0*bs2*nz));
2705   } else if (type == NORM_1) { /* maximum column sum */
2706     PetscReal *tmp;
2707     PetscInt  *bcol = a->j;
2708     PetscCall(PetscCalloc1(A->cmap->n+1,&tmp));
2709     for (i=0; i<nz; i++) {
2710       for (j=0; j<bs; j++) {
2711         k1 = bs*(*bcol) + j; /* column index */
2712         for (k=0; k<bs; k++) {
2713           tmp[k1] += PetscAbsScalar(*v); v++;
2714         }
2715       }
2716       bcol++;
2717     }
2718     *norm = 0.0;
2719     for (j=0; j<A->cmap->n; j++) {
2720       if (tmp[j] > *norm) *norm = tmp[j];
2721     }
2722     PetscCall(PetscFree(tmp));
2723     PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0)));
2724   } else if (type == NORM_INFINITY) { /* maximum row sum */
2725     *norm = 0.0;
2726     for (k=0; k<bs; k++) {
2727       for (j=0; j<a->mbs; j++) {
2728         v   = a->a + bs2*a->i[j] + k;
2729         sum = 0.0;
2730         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2731           for (k1=0; k1<bs; k1++) {
2732             sum += PetscAbsScalar(*v);
2733             v   += bs;
2734           }
2735         }
2736         if (sum > *norm) *norm = sum;
2737       }
2738     }
2739     PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0)));
2740   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2741   PetscFunctionReturn(0);
2742 }
2743 
2744 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2745 {
2746   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2747 
2748   PetscFunctionBegin;
2749   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2750   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2751     *flg = PETSC_FALSE;
2752     PetscFunctionReturn(0);
2753   }
2754 
2755   /* if the a->i are the same */
2756   PetscCall(PetscArraycmp(a->i,b->i,a->mbs+1,flg));
2757   if (!*flg) PetscFunctionReturn(0);
2758 
2759   /* if a->j are the same */
2760   PetscCall(PetscArraycmp(a->j,b->j,a->nz,flg));
2761   if (!*flg) PetscFunctionReturn(0);
2762 
2763   /* if a->a are the same */
2764   PetscCall(PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg));
2765   PetscFunctionReturn(0);
2766 
2767 }
2768 
2769 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2770 {
2771   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2772   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2773   PetscScalar    *x,zero = 0.0;
2774   MatScalar      *aa,*aa_j;
2775 
2776   PetscFunctionBegin;
2777   PetscCheck(!A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2778   bs   = A->rmap->bs;
2779   aa   = a->a;
2780   ai   = a->i;
2781   aj   = a->j;
2782   ambs = a->mbs;
2783   bs2  = a->bs2;
2784 
2785   PetscCall(VecSet(v,zero));
2786   PetscCall(VecGetArray(v,&x));
2787   PetscCall(VecGetLocalSize(v,&n));
2788   PetscCheck(n == A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2789   for (i=0; i<ambs; i++) {
2790     for (j=ai[i]; j<ai[i+1]; j++) {
2791       if (aj[j] == i) {
2792         row  = i*bs;
2793         aa_j = aa+j*bs2;
2794         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2795         break;
2796       }
2797     }
2798   }
2799   PetscCall(VecRestoreArray(v,&x));
2800   PetscFunctionReturn(0);
2801 }
2802 
2803 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2804 {
2805   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2806   const PetscScalar *l,*r,*li,*ri;
2807   PetscScalar       x;
2808   MatScalar         *aa, *v;
2809   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2810   const PetscInt    *ai,*aj;
2811 
2812   PetscFunctionBegin;
2813   ai  = a->i;
2814   aj  = a->j;
2815   aa  = a->a;
2816   m   = A->rmap->n;
2817   n   = A->cmap->n;
2818   bs  = A->rmap->bs;
2819   mbs = a->mbs;
2820   bs2 = a->bs2;
2821   if (ll) {
2822     PetscCall(VecGetArrayRead(ll,&l));
2823     PetscCall(VecGetLocalSize(ll,&lm));
2824     PetscCheck(lm == m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2825     for (i=0; i<mbs; i++) { /* for each block row */
2826       M  = ai[i+1] - ai[i];
2827       li = l + i*bs;
2828       v  = aa + bs2*ai[i];
2829       for (j=0; j<M; j++) { /* for each block */
2830         for (k=0; k<bs2; k++) {
2831           (*v++) *= li[k%bs];
2832         }
2833       }
2834     }
2835     PetscCall(VecRestoreArrayRead(ll,&l));
2836     PetscCall(PetscLogFlops(a->nz));
2837   }
2838 
2839   if (rr) {
2840     PetscCall(VecGetArrayRead(rr,&r));
2841     PetscCall(VecGetLocalSize(rr,&rn));
2842     PetscCheck(rn == n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2843     for (i=0; i<mbs; i++) { /* for each block row */
2844       iai = ai[i];
2845       M   = ai[i+1] - iai;
2846       v   = aa + bs2*iai;
2847       for (j=0; j<M; j++) { /* for each block */
2848         ri = r + bs*aj[iai+j];
2849         for (k=0; k<bs; k++) {
2850           x = ri[k];
2851           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2852           v += bs;
2853         }
2854       }
2855     }
2856     PetscCall(VecRestoreArrayRead(rr,&r));
2857     PetscCall(PetscLogFlops(a->nz));
2858   }
2859   PetscFunctionReturn(0);
2860 }
2861 
2862 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2863 {
2864   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2865 
2866   PetscFunctionBegin;
2867   info->block_size   = a->bs2;
2868   info->nz_allocated = a->bs2*a->maxnz;
2869   info->nz_used      = a->bs2*a->nz;
2870   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2871   info->assemblies   = A->num_ass;
2872   info->mallocs      = A->info.mallocs;
2873   info->memory       = ((PetscObject)A)->mem;
2874   if (A->factortype) {
2875     info->fill_ratio_given  = A->info.fill_ratio_given;
2876     info->fill_ratio_needed = A->info.fill_ratio_needed;
2877     info->factor_mallocs    = A->info.factor_mallocs;
2878   } else {
2879     info->fill_ratio_given  = 0;
2880     info->fill_ratio_needed = 0;
2881     info->factor_mallocs    = 0;
2882   }
2883   PetscFunctionReturn(0);
2884 }
2885 
2886 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2887 {
2888   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2889 
2890   PetscFunctionBegin;
2891   PetscCall(PetscArrayzero(a->a,a->bs2*a->i[a->mbs]));
2892   PetscFunctionReturn(0);
2893 }
2894 
2895 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2896 {
2897   PetscFunctionBegin;
2898   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
2899   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2900   PetscFunctionReturn(0);
2901 }
2902 
2903 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2904 {
2905   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2906   PetscScalar       *z = NULL,sum1;
2907   const PetscScalar *xb;
2908   PetscScalar       x1;
2909   const MatScalar   *v,*vv;
2910   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2911   PetscBool         usecprow=a->compressedrow.use;
2912 
2913   PetscFunctionBegin;
2914   idx = a->j;
2915   v   = a->a;
2916   if (usecprow) {
2917     mbs  = a->compressedrow.nrows;
2918     ii   = a->compressedrow.i;
2919     ridx = a->compressedrow.rindex;
2920   } else {
2921     mbs = a->mbs;
2922     ii  = a->i;
2923     z   = c;
2924   }
2925 
2926   for (i=0; i<mbs; i++) {
2927     n           = ii[1] - ii[0]; ii++;
2928     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2929     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2930     if (usecprow) z = c + ridx[i];
2931     jj = idx;
2932     vv = v;
2933     for (k=0; k<cn; k++) {
2934       idx = jj;
2935       v = vv;
2936       sum1    = 0.0;
2937       for (j=0; j<n; j++) {
2938         xb    = b + (*idx++); x1 = xb[0+k*bm];
2939         sum1 += v[0]*x1;
2940         v    += 1;
2941       }
2942       z[0+k*cm] = sum1;
2943     }
2944     if (!usecprow) z += 1;
2945   }
2946   PetscFunctionReturn(0);
2947 }
2948 
2949 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2950 {
2951   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2952   PetscScalar       *z = NULL,sum1,sum2;
2953   const PetscScalar *xb;
2954   PetscScalar       x1,x2;
2955   const MatScalar   *v,*vv;
2956   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2957   PetscBool         usecprow=a->compressedrow.use;
2958 
2959   PetscFunctionBegin;
2960   idx = a->j;
2961   v   = a->a;
2962   if (usecprow) {
2963     mbs  = a->compressedrow.nrows;
2964     ii   = a->compressedrow.i;
2965     ridx = a->compressedrow.rindex;
2966   } else {
2967     mbs = a->mbs;
2968     ii  = a->i;
2969     z   = c;
2970   }
2971 
2972   for (i=0; i<mbs; i++) {
2973     n           = ii[1] - ii[0]; ii++;
2974     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2975     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2976     if (usecprow) z = c + 2*ridx[i];
2977     jj = idx;
2978     vv = v;
2979     for (k=0; k<cn; k++) {
2980       idx = jj;
2981       v = vv;
2982       sum1    = 0.0; sum2 = 0.0;
2983       for (j=0; j<n; j++) {
2984         xb    = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
2985         sum1 += v[0]*x1 + v[2]*x2;
2986         sum2 += v[1]*x1 + v[3]*x2;
2987         v    += 4;
2988       }
2989       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2990     }
2991     if (!usecprow) z += 2;
2992   }
2993   PetscFunctionReturn(0);
2994 }
2995 
2996 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2997 {
2998   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2999   PetscScalar       *z = NULL,sum1,sum2,sum3;
3000   const PetscScalar *xb;
3001   PetscScalar       x1,x2,x3;
3002   const MatScalar   *v,*vv;
3003   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3004   PetscBool         usecprow=a->compressedrow.use;
3005 
3006   PetscFunctionBegin;
3007   idx = a->j;
3008   v   = a->a;
3009   if (usecprow) {
3010     mbs  = a->compressedrow.nrows;
3011     ii   = a->compressedrow.i;
3012     ridx = a->compressedrow.rindex;
3013   } else {
3014     mbs = a->mbs;
3015     ii  = a->i;
3016     z   = c;
3017   }
3018 
3019   for (i=0; i<mbs; i++) {
3020     n           = ii[1] - ii[0]; ii++;
3021     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3022     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3023     if (usecprow) z = c + 3*ridx[i];
3024     jj = idx;
3025     vv = v;
3026     for (k=0; k<cn; k++) {
3027       idx = jj;
3028       v = vv;
3029       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
3030       for (j=0; j<n; j++) {
3031         xb    = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
3032         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3033         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3034         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3035         v    += 9;
3036       }
3037       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
3038     }
3039     if (!usecprow) z += 3;
3040   }
3041   PetscFunctionReturn(0);
3042 }
3043 
3044 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3045 {
3046   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3047   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4;
3048   const PetscScalar *xb;
3049   PetscScalar       x1,x2,x3,x4;
3050   const MatScalar   *v,*vv;
3051   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3052   PetscBool         usecprow=a->compressedrow.use;
3053 
3054   PetscFunctionBegin;
3055   idx = a->j;
3056   v   = a->a;
3057   if (usecprow) {
3058     mbs  = a->compressedrow.nrows;
3059     ii   = a->compressedrow.i;
3060     ridx = a->compressedrow.rindex;
3061   } else {
3062     mbs = a->mbs;
3063     ii  = a->i;
3064     z   = c;
3065   }
3066 
3067   for (i=0; i<mbs; i++) {
3068     n           = ii[1] - ii[0]; ii++;
3069     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3070     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3071     if (usecprow) z = c + 4*ridx[i];
3072     jj = idx;
3073     vv = v;
3074     for (k=0; k<cn; k++) {
3075       idx = jj;
3076       v = vv;
3077       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3078       for (j=0; j<n; j++) {
3079         xb    = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3080         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3081         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3082         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3083         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3084         v    += 16;
3085       }
3086       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3087     }
3088     if (!usecprow) z += 4;
3089   }
3090   PetscFunctionReturn(0);
3091 }
3092 
3093 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3094 {
3095   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3096   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5;
3097   const PetscScalar *xb;
3098   PetscScalar       x1,x2,x3,x4,x5;
3099   const MatScalar   *v,*vv;
3100   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3101   PetscBool         usecprow=a->compressedrow.use;
3102 
3103   PetscFunctionBegin;
3104   idx = a->j;
3105   v   = a->a;
3106   if (usecprow) {
3107     mbs  = a->compressedrow.nrows;
3108     ii   = a->compressedrow.i;
3109     ridx = a->compressedrow.rindex;
3110   } else {
3111     mbs = a->mbs;
3112     ii  = a->i;
3113     z   = c;
3114   }
3115 
3116   for (i=0; i<mbs; i++) {
3117     n           = ii[1] - ii[0]; ii++;
3118     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3119     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3120     if (usecprow) z = c + 5*ridx[i];
3121     jj = idx;
3122     vv = v;
3123     for (k=0; k<cn; k++) {
3124       idx = jj;
3125       v = vv;
3126       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3127       for (j=0; j<n; j++) {
3128         xb    = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3129         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3130         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3131         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3132         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3133         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3134         v    += 25;
3135       }
3136       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3137     }
3138     if (!usecprow) z += 5;
3139   }
3140   PetscFunctionReturn(0);
3141 }
3142 
3143 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3144 {
3145   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3146   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
3147   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
3148   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3149   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3150   PetscBLASInt      bbs,bcn,bbm,bcm;
3151   PetscScalar       *z = NULL;
3152   PetscScalar       *c,*b;
3153   const MatScalar   *v;
3154   const PetscInt    *idx,*ii,*ridx=NULL;
3155   PetscScalar       _DZero=0.0,_DOne=1.0;
3156   PetscBool         usecprow=a->compressedrow.use;
3157 
3158   PetscFunctionBegin;
3159   if (!cm || !cn) PetscFunctionReturn(0);
3160   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
3161   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
3162   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
3163   b = bd->v;
3164   if (a->nonzerorowcnt != A->rmap->n) {
3165     PetscCall(MatZeroEntries(C));
3166   }
3167   PetscCall(MatDenseGetArray(C,&c));
3168   switch (bs) {
3169   case 1:
3170     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3171     break;
3172   case 2:
3173     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3174     break;
3175   case 3:
3176     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3177     break;
3178   case 4:
3179     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3180     break;
3181   case 5:
3182     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3183     break;
3184   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3185     PetscCall(PetscBLASIntCast(bs,&bbs));
3186     PetscCall(PetscBLASIntCast(cn,&bcn));
3187     PetscCall(PetscBLASIntCast(bm,&bbm));
3188     PetscCall(PetscBLASIntCast(cm,&bcm));
3189     idx = a->j;
3190     v   = a->a;
3191     if (usecprow) {
3192       mbs  = a->compressedrow.nrows;
3193       ii   = a->compressedrow.i;
3194       ridx = a->compressedrow.rindex;
3195     } else {
3196       mbs = a->mbs;
3197       ii  = a->i;
3198       z   = c;
3199     }
3200     for (i=0; i<mbs; i++) {
3201       n = ii[1] - ii[0]; ii++;
3202       if (usecprow) z = c + bs*ridx[i];
3203       if (n) {
3204         PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3205         v += bs2;
3206       }
3207       for (j=1; j<n; j++) {
3208         PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3209         v += bs2;
3210       }
3211       if (!usecprow) z += bs;
3212     }
3213   }
3214   PetscCall(MatDenseRestoreArray(C,&c));
3215   PetscCall(PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn));
3216   PetscFunctionReturn(0);
3217 }
3218