xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 76be6f4ff3bd4e251c19fc00ebbebfd58b6e7589)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8 #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
14   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
15   const PetscInt *idx;
16   PetscInt       start,end,*ai,*aj,bs,*nidx2;
17   PetscBT        table;
18 
19   PetscFunctionBegin;
20   m  = a->mbs;
21   ai = a->i;
22   aj = a->j;
23   bs = A->rmap->bs;
24 
25   PetscCheck(ov >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26 
27   PetscCall(PetscBTCreate(m,&table));
28   PetscCall(PetscMalloc1(m+1,&nidx));
29   PetscCall(PetscMalloc1(A->rmap->N+1,&nidx2));
30 
31   for (i=0; i<is_max; i++) {
32     /* Initialise the two local arrays */
33     isz  = 0;
34     PetscCall(PetscBTMemzero(m,table));
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     PetscCall(ISGetIndices(is[i],&idx));
38     PetscCall(ISGetLocalSize(is[i],&n));
39 
40     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
41     for (j=0; j<n; ++j) {
42       ival = idx[j]/bs; /* convert the indices into block indices */
43       PetscCheck(ival<m,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
45     }
46     PetscCall(ISRestoreIndices(is[i],&idx));
47     PetscCall(ISDestroy(&is[i]));
48 
49     k = 0;
50     for (j=0; j<ov; j++) { /* for each overlap*/
51       n = isz;
52       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
53         row   = nidx[k];
54         start = ai[row];
55         end   = ai[row+1];
56         for (l = start; l<end; l++) {
57           val = aj[l];
58           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
59         }
60       }
61     }
62     /* expand the Index Set */
63     for (j=0; j<isz; j++) {
64       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
65     }
66     PetscCall(ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i));
67   }
68   PetscCall(PetscBTDestroy(&table));
69   PetscCall(PetscFree(nidx));
70   PetscCall(PetscFree(nidx2));
71   PetscFunctionReturn(0);
72 }
73 
74 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
75 {
76   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
77   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
78   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
79   const PetscInt *irow,*icol;
80   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
81   PetscInt       *aj = a->j,*ai = a->i;
82   MatScalar      *mat_a;
83   Mat            C;
84   PetscBool      flag;
85 
86   PetscFunctionBegin;
87   PetscCall(ISGetIndices(isrow,&irow));
88   PetscCall(ISGetIndices(iscol,&icol));
89   PetscCall(ISGetLocalSize(isrow,&nrows));
90   PetscCall(ISGetLocalSize(iscol,&ncols));
91 
92   PetscCall(PetscCalloc1(1+oldcols,&smap));
93   ssmap = smap;
94   PetscCall(PetscMalloc1(1+nrows,&lens));
95   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
96   /* determine lens of each row */
97   for (i=0; i<nrows; i++) {
98     kstart  = ai[irow[i]];
99     kend    = kstart + a->ilen[irow[i]];
100     lens[i] = 0;
101     for (k=kstart; k<kend; k++) {
102       if (ssmap[aj[k]]) lens[i]++;
103     }
104   }
105   /* Create and fill new matrix */
106   if (scall == MAT_REUSE_MATRIX) {
107     c = (Mat_SeqBAIJ*)((*B)->data);
108 
109     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
110     PetscCall(PetscArraycmp(c->ilen,lens,c->mbs,&flag));
111     PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
112     PetscCall(PetscArrayzero(c->ilen,c->mbs));
113     C    = *B;
114   } else {
115     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&C));
116     PetscCall(MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE));
117     PetscCall(MatSetType(C,((PetscObject)A)->type_name));
118     PetscCall(MatSeqBAIJSetPreallocation(C,bs,0,lens));
119   }
120   c = (Mat_SeqBAIJ*)(C->data);
121   for (i=0; i<nrows; i++) {
122     row      = irow[i];
123     kstart   = ai[row];
124     kend     = kstart + a->ilen[row];
125     mat_i    = c->i[i];
126     mat_j    = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */
127     mat_a    = c->a ? c->a + mat_i*bs2 : NULL; /* mustn't add to NULL, that is UB */
128     mat_ilen = c->ilen + i;
129     for (k=kstart; k<kend; k++) {
130       if ((tcol=ssmap[a->j[k]])) {
131         *mat_j++ = tcol - 1;
132         PetscCall(PetscArraycpy(mat_a,a->a+k*bs2,bs2));
133         mat_a   += bs2;
134         (*mat_ilen)++;
135       }
136     }
137   }
138   /* sort */
139   if (c->j && c->a) {
140     MatScalar *work;
141     PetscCall(PetscMalloc1(bs2,&work));
142     for (i=0; i<nrows; i++) {
143       PetscInt ilen;
144       mat_i = c->i[i];
145       mat_j = c->j + mat_i;
146       mat_a = c->a + mat_i*bs2;
147       ilen  = c->ilen[i];
148       PetscCall(PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work));
149     }
150     PetscCall(PetscFree(work));
151   }
152 
153   /* Free work space */
154   PetscCall(ISRestoreIndices(iscol,&icol));
155   PetscCall(PetscFree(smap));
156   PetscCall(PetscFree(lens));
157   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
158   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
159 
160   PetscCall(ISRestoreIndices(isrow,&irow));
161   *B   = C;
162   PetscFunctionReturn(0);
163 }
164 
165 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
166 {
167   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
168   IS             is1,is2;
169   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
170   const PetscInt *irow,*icol;
171 
172   PetscFunctionBegin;
173   PetscCall(ISGetIndices(isrow,&irow));
174   PetscCall(ISGetIndices(iscol,&icol));
175   PetscCall(ISGetLocalSize(isrow,&nrows));
176   PetscCall(ISGetLocalSize(iscol,&ncols));
177 
178   /* Verify if the indices corespond to each element in a block
179    and form the IS with compressed IS */
180   maxmnbs = PetscMax(a->mbs,a->nbs);
181   PetscCall(PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary));
182   PetscCall(PetscArrayzero(vary,a->mbs));
183   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
184   for (i=0; i<a->mbs; i++) {
185     PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
186   }
187   count = 0;
188   for (i=0; i<nrows; i++) {
189     j = irow[i] / bs;
190     if ((vary[j]--)==bs) iary[count++] = j;
191   }
192   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1));
193 
194   PetscCall(PetscArrayzero(vary,a->nbs));
195   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
196   for (i=0; i<a->nbs; i++) {
197     PetscCheck(vary[i] == 0 || vary[i] == bs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
198   }
199   count = 0;
200   for (i=0; i<ncols; i++) {
201     j = icol[i] / bs;
202     if ((vary[j]--)==bs) iary[count++] = j;
203   }
204   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2));
205   PetscCall(ISRestoreIndices(isrow,&irow));
206   PetscCall(ISRestoreIndices(iscol,&icol));
207   PetscCall(PetscFree2(vary,iary));
208 
209   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B));
210   PetscCall(ISDestroy(&is1));
211   PetscCall(ISDestroy(&is2));
212   PetscFunctionReturn(0);
213 }
214 
215 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
216 {
217   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
218   Mat_SubSppt    *submatj = c->submatis1;
219 
220   PetscFunctionBegin;
221   PetscCall((*submatj->destroy)(C));
222   PetscCall(MatDestroySubMatrix_Private(submatj));
223   PetscFunctionReturn(0);
224 }
225 
226 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
227 {
228   PetscInt       i;
229   Mat            C;
230   Mat_SeqBAIJ    *c;
231   Mat_SubSppt    *submatj;
232 
233   PetscFunctionBegin;
234   for (i=0; i<n; i++) {
235     C       = (*mat)[i];
236     c       = (Mat_SeqBAIJ*)C->data;
237     submatj = c->submatis1;
238     if (submatj) {
239       if (--((PetscObject)C)->refct <= 0) {
240         PetscCall((*submatj->destroy)(C));
241         PetscCall(MatDestroySubMatrix_Private(submatj));
242         PetscCall(PetscFree(C->defaultvectype));
243         PetscCall(PetscLayoutDestroy(&C->rmap));
244         PetscCall(PetscLayoutDestroy(&C->cmap));
245         PetscCall(PetscHeaderDestroy(&C));
246       }
247     } else {
248       PetscCall(MatDestroy(&C));
249     }
250   }
251 
252   /* Destroy Dummy submatrices created for reuse */
253   PetscCall(MatDestroySubMatrices_Dummy(n,mat));
254 
255   PetscCall(PetscFree(*mat));
256   PetscFunctionReturn(0);
257 }
258 
259 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260 {
261   PetscInt       i;
262 
263   PetscFunctionBegin;
264   if (scall == MAT_INITIAL_MATRIX) {
265     PetscCall(PetscCalloc1(n+1,B));
266   }
267 
268   for (i=0; i<n; i++) {
269     PetscCall(MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]));
270   }
271   PetscFunctionReturn(0);
272 }
273 
274 /* -------------------------------------------------------*/
275 /* Should check that shapes of vectors and matrices match */
276 /* -------------------------------------------------------*/
277 
278 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
279 {
280   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
281   PetscScalar       *z,sum;
282   const PetscScalar *x;
283   const MatScalar   *v;
284   PetscInt          mbs,i,n;
285   const PetscInt    *idx,*ii,*ridx=NULL;
286   PetscBool         usecprow=a->compressedrow.use;
287 
288   PetscFunctionBegin;
289   PetscCall(VecGetArrayRead(xx,&x));
290   PetscCall(VecGetArrayWrite(zz,&z));
291 
292   if (usecprow) {
293     mbs  = a->compressedrow.nrows;
294     ii   = a->compressedrow.i;
295     ridx = a->compressedrow.rindex;
296     PetscCall(PetscArrayzero(z,a->mbs));
297   } else {
298     mbs = a->mbs;
299     ii  = a->i;
300   }
301 
302   for (i=0; i<mbs; i++) {
303     n   = ii[1] - ii[0];
304     v   = a->a + ii[0];
305     idx = a->j + ii[0];
306     ii++;
307     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
308     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
309     sum = 0.0;
310     PetscSparseDensePlusDot(sum,x,v,idx,n);
311     if (usecprow) {
312       z[ridx[i]] = sum;
313     } else {
314       z[i]       = sum;
315     }
316   }
317   PetscCall(VecRestoreArrayRead(xx,&x));
318   PetscCall(VecRestoreArrayWrite(zz,&z));
319   PetscCall(PetscLogFlops(2.0*a->nz - a->nonzerorowcnt));
320   PetscFunctionReturn(0);
321 }
322 
323 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
324 {
325   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
326   PetscScalar       *z = NULL,sum1,sum2,*zarray;
327   const PetscScalar *x,*xb;
328   PetscScalar       x1,x2;
329   const MatScalar   *v;
330   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
331   PetscBool         usecprow=a->compressedrow.use;
332 
333   PetscFunctionBegin;
334   PetscCall(VecGetArrayRead(xx,&x));
335   PetscCall(VecGetArrayWrite(zz,&zarray));
336 
337   idx = a->j;
338   v   = a->a;
339   if (usecprow) {
340     mbs  = a->compressedrow.nrows;
341     ii   = a->compressedrow.i;
342     ridx = a->compressedrow.rindex;
343     PetscCall(PetscArrayzero(zarray,2*a->mbs));
344   } else {
345     mbs = a->mbs;
346     ii  = a->i;
347     z   = zarray;
348   }
349 
350   for (i=0; i<mbs; i++) {
351     n           = ii[1] - ii[0]; ii++;
352     sum1        = 0.0; sum2 = 0.0;
353     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
354     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
355     for (j=0; j<n; j++) {
356       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
357       sum1 += v[0]*x1 + v[2]*x2;
358       sum2 += v[1]*x1 + v[3]*x2;
359       v    += 4;
360     }
361     if (usecprow) z = zarray + 2*ridx[i];
362     z[0] = sum1; z[1] = sum2;
363     if (!usecprow) z += 2;
364   }
365   PetscCall(VecRestoreArrayRead(xx,&x));
366   PetscCall(VecRestoreArrayWrite(zz,&zarray));
367   PetscCall(PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt));
368   PetscFunctionReturn(0);
369 }
370 
371 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
372 {
373   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
374   PetscScalar       *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
375   const PetscScalar *x,*xb;
376   const MatScalar   *v;
377   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
378   PetscBool         usecprow=a->compressedrow.use;
379 
380 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
381 #pragma disjoint(*v,*z,*xb)
382 #endif
383 
384   PetscFunctionBegin;
385   PetscCall(VecGetArrayRead(xx,&x));
386   PetscCall(VecGetArrayWrite(zz,&zarray));
387 
388   idx = a->j;
389   v   = a->a;
390   if (usecprow) {
391     mbs  = a->compressedrow.nrows;
392     ii   = a->compressedrow.i;
393     ridx = a->compressedrow.rindex;
394     PetscCall(PetscArrayzero(zarray,3*a->mbs));
395   } else {
396     mbs = a->mbs;
397     ii  = a->i;
398     z   = zarray;
399   }
400 
401   for (i=0; i<mbs; i++) {
402     n           = ii[1] - ii[0]; ii++;
403     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
404     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
405     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
406     for (j=0; j<n; j++) {
407       xb = x + 3*(*idx++);
408       x1 = xb[0];
409       x2 = xb[1];
410       x3 = xb[2];
411 
412       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
413       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
414       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
415       v    += 9;
416     }
417     if (usecprow) z = zarray + 3*ridx[i];
418     z[0] = sum1; z[1] = sum2; z[2] = sum3;
419     if (!usecprow) z += 3;
420   }
421   PetscCall(VecRestoreArrayRead(xx,&x));
422   PetscCall(VecRestoreArrayWrite(zz,&zarray));
423   PetscCall(PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt));
424   PetscFunctionReturn(0);
425 }
426 
427 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
428 {
429   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
430   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
431   const PetscScalar *x,*xb;
432   const MatScalar   *v;
433   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
434   PetscBool         usecprow=a->compressedrow.use;
435 
436   PetscFunctionBegin;
437   PetscCall(VecGetArrayRead(xx,&x));
438   PetscCall(VecGetArrayWrite(zz,&zarray));
439 
440   idx = a->j;
441   v   = a->a;
442   if (usecprow) {
443     mbs  = a->compressedrow.nrows;
444     ii   = a->compressedrow.i;
445     ridx = a->compressedrow.rindex;
446     PetscCall(PetscArrayzero(zarray,4*a->mbs));
447   } else {
448     mbs = a->mbs;
449     ii  = a->i;
450     z   = zarray;
451   }
452 
453   for (i=0; i<mbs; i++) {
454     n = ii[1] - ii[0];
455     ii++;
456     sum1 = 0.0;
457     sum2 = 0.0;
458     sum3 = 0.0;
459     sum4 = 0.0;
460 
461     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
462     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
463     for (j=0; j<n; j++) {
464       xb    = x + 4*(*idx++);
465       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
466       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
467       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
468       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
469       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
470       v    += 16;
471     }
472     if (usecprow) z = zarray + 4*ridx[i];
473     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
474     if (!usecprow) z += 4;
475   }
476   PetscCall(VecRestoreArrayRead(xx,&x));
477   PetscCall(VecRestoreArrayWrite(zz,&zarray));
478   PetscCall(PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt));
479   PetscFunctionReturn(0);
480 }
481 
482 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
483 {
484   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
485   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
486   const PetscScalar *xb,*x;
487   const MatScalar   *v;
488   const PetscInt    *idx,*ii,*ridx=NULL;
489   PetscInt          mbs,i,j,n;
490   PetscBool         usecprow=a->compressedrow.use;
491 
492   PetscFunctionBegin;
493   PetscCall(VecGetArrayRead(xx,&x));
494   PetscCall(VecGetArrayWrite(zz,&zarray));
495 
496   idx = a->j;
497   v   = a->a;
498   if (usecprow) {
499     mbs  = a->compressedrow.nrows;
500     ii   = a->compressedrow.i;
501     ridx = a->compressedrow.rindex;
502     PetscCall(PetscArrayzero(zarray,5*a->mbs));
503   } else {
504     mbs = a->mbs;
505     ii  = a->i;
506     z   = zarray;
507   }
508 
509   for (i=0; i<mbs; i++) {
510     n           = ii[1] - ii[0]; ii++;
511     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
512     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
513     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
514     for (j=0; j<n; j++) {
515       xb    = x + 5*(*idx++);
516       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
517       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
518       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
519       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
520       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
521       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
522       v    += 25;
523     }
524     if (usecprow) z = zarray + 5*ridx[i];
525     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
526     if (!usecprow) z += 5;
527   }
528   PetscCall(VecRestoreArrayRead(xx,&x));
529   PetscCall(VecRestoreArrayWrite(zz,&zarray));
530   PetscCall(PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt));
531   PetscFunctionReturn(0);
532 }
533 
534 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
535 {
536   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
537   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
538   const PetscScalar *x,*xb;
539   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
540   const MatScalar   *v;
541   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
542   PetscBool         usecprow=a->compressedrow.use;
543 
544   PetscFunctionBegin;
545   PetscCall(VecGetArrayRead(xx,&x));
546   PetscCall(VecGetArrayWrite(zz,&zarray));
547 
548   idx = a->j;
549   v   = a->a;
550   if (usecprow) {
551     mbs  = a->compressedrow.nrows;
552     ii   = a->compressedrow.i;
553     ridx = a->compressedrow.rindex;
554     PetscCall(PetscArrayzero(zarray,6*a->mbs));
555   } else {
556     mbs = a->mbs;
557     ii  = a->i;
558     z   = zarray;
559   }
560 
561   for (i=0; i<mbs; i++) {
562     n  = ii[1] - ii[0];
563     ii++;
564     sum1 = 0.0;
565     sum2 = 0.0;
566     sum3 = 0.0;
567     sum4 = 0.0;
568     sum5 = 0.0;
569     sum6 = 0.0;
570 
571     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
572     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
573     for (j=0; j<n; j++) {
574       xb    = x + 6*(*idx++);
575       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
576       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
577       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
578       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
579       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
580       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
581       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
582       v    += 36;
583     }
584     if (usecprow) z = zarray + 6*ridx[i];
585     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
586     if (!usecprow) z += 6;
587   }
588 
589   PetscCall(VecRestoreArrayRead(xx,&x));
590   PetscCall(VecRestoreArrayWrite(zz,&zarray));
591   PetscCall(PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt));
592   PetscFunctionReturn(0);
593 }
594 
595 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
596 {
597   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
598   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
599   const PetscScalar *x,*xb;
600   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
601   const MatScalar   *v;
602   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
603   PetscBool         usecprow=a->compressedrow.use;
604 
605   PetscFunctionBegin;
606   PetscCall(VecGetArrayRead(xx,&x));
607   PetscCall(VecGetArrayWrite(zz,&zarray));
608 
609   idx = a->j;
610   v   = a->a;
611   if (usecprow) {
612     mbs  = a->compressedrow.nrows;
613     ii   = a->compressedrow.i;
614     ridx = a->compressedrow.rindex;
615     PetscCall(PetscArrayzero(zarray,7*a->mbs));
616   } else {
617     mbs = a->mbs;
618     ii  = a->i;
619     z   = zarray;
620   }
621 
622   for (i=0; i<mbs; i++) {
623     n  = ii[1] - ii[0];
624     ii++;
625     sum1 = 0.0;
626     sum2 = 0.0;
627     sum3 = 0.0;
628     sum4 = 0.0;
629     sum5 = 0.0;
630     sum6 = 0.0;
631     sum7 = 0.0;
632 
633     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
634     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
635     for (j=0; j<n; j++) {
636       xb    = x + 7*(*idx++);
637       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
638       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
639       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
640       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
641       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
642       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
643       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
644       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
645       v    += 49;
646     }
647     if (usecprow) z = zarray + 7*ridx[i];
648     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
649     if (!usecprow) z += 7;
650   }
651 
652   PetscCall(VecRestoreArrayRead(xx,&x));
653   PetscCall(VecRestoreArrayWrite(zz,&zarray));
654   PetscCall(PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt));
655   PetscFunctionReturn(0);
656 }
657 
658 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
659 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
660 {
661   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
662   PetscScalar       *z = NULL,*work,*workt,*zarray;
663   const PetscScalar *x,*xb;
664   const MatScalar   *v;
665   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
666   const PetscInt    *idx,*ii,*ridx=NULL;
667   PetscInt          k;
668   PetscBool         usecprow=a->compressedrow.use;
669 
670   __m256d a0,a1,a2,a3,a4,a5;
671   __m256d w0,w1,w2,w3;
672   __m256d z0,z1,z2;
673   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
674 
675   PetscFunctionBegin;
676   PetscCall(VecGetArrayRead(xx,&x));
677   PetscCall(VecGetArrayWrite(zz,&zarray));
678 
679   idx = a->j;
680   v   = a->a;
681   if (usecprow) {
682     mbs  = a->compressedrow.nrows;
683     ii   = a->compressedrow.i;
684     ridx = a->compressedrow.rindex;
685     PetscCall(PetscArrayzero(zarray,bs*a->mbs));
686   } else {
687     mbs = a->mbs;
688     ii  = a->i;
689     z   = zarray;
690   }
691 
692   if (!a->mult_work) {
693     k    = PetscMax(A->rmap->n,A->cmap->n);
694     PetscCall(PetscMalloc1(k+1,&a->mult_work));
695   }
696 
697   work = a->mult_work;
698   for (i=0; i<mbs; i++) {
699     n           = ii[1] - ii[0]; ii++;
700     workt       = work;
701     for (j=0; j<n; j++) {
702       xb = x + bs*(*idx++);
703       for (k=0; k<bs; k++) workt[k] = xb[k];
704       workt += bs;
705     }
706     if (usecprow) z = zarray + bs*ridx[i];
707 
708     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
709 
710     for (j=0; j<n; j++) {
711       /* first column of a */
712       w0 = _mm256_set1_pd(work[j*9  ]);
713       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
714       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
715       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
716 
717       /* second column of a */
718       w1 = _mm256_set1_pd(work[j*9+ 1]);
719       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
720       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
721       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
722 
723       /* third column of a */
724       w2 = _mm256_set1_pd(work[j*9 +2]);
725       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
726       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
727       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
728 
729       /* fourth column of a */
730       w3 = _mm256_set1_pd(work[j*9+ 3]);
731       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
732       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
733       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
734 
735       /* fifth column of a */
736       w0 = _mm256_set1_pd(work[j*9+ 4]);
737       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
738       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
739       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
740 
741       /* sixth column of a */
742       w1 = _mm256_set1_pd(work[j*9+ 5]);
743       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
744       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
745       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
746 
747       /* seventh column of a */
748       w2 = _mm256_set1_pd(work[j*9+ 6]);
749       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
750       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
751       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
752 
753       /* eighth column of a */
754       w3 = _mm256_set1_pd(work[j*9+ 7]);
755       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
756       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
757       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
758 
759       /* ninth column of a */
760       w0 = _mm256_set1_pd(work[j*9+ 8]);
761       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
762       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
763       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
764     }
765 
766     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
767 
768     v += n*bs2;
769     if (!usecprow) z += bs;
770   }
771   PetscCall(VecRestoreArrayRead(xx,&x));
772   PetscCall(VecRestoreArrayWrite(zz,&zarray));
773   PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt));
774   PetscFunctionReturn(0);
775 }
776 #endif
777 
778 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
779 {
780   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
781   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
782   const PetscScalar *x,*xb;
783   PetscScalar       *zarray,xv;
784   const MatScalar   *v;
785   const PetscInt    *ii,*ij=a->j,*idx;
786   PetscInt          mbs,i,j,k,n,*ridx=NULL;
787   PetscBool         usecprow=a->compressedrow.use;
788 
789   PetscFunctionBegin;
790   PetscCall(VecGetArrayRead(xx,&x));
791   PetscCall(VecGetArrayWrite(zz,&zarray));
792 
793   v = a->a;
794   if (usecprow) {
795     mbs  = a->compressedrow.nrows;
796     ii   = a->compressedrow.i;
797     ridx = a->compressedrow.rindex;
798     PetscCall(PetscArrayzero(zarray,11*a->mbs));
799   } else {
800     mbs = a->mbs;
801     ii  = a->i;
802     z   = zarray;
803   }
804 
805   for (i=0; i<mbs; i++) {
806     n    = ii[i+1] - ii[i];
807     idx  = ij + ii[i];
808     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
809     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
810 
811     for (j=0; j<n; j++) {
812       xb = x + 11*(idx[j]);
813 
814       for (k=0; k<11; k++) {
815         xv     =  xb[k];
816         sum1  += v[0]*xv;
817         sum2  += v[1]*xv;
818         sum3  += v[2]*xv;
819         sum4  += v[3]*xv;
820         sum5  += v[4]*xv;
821         sum6  += v[5]*xv;
822         sum7  += v[6]*xv;
823         sum8  += v[7]*xv;
824         sum9  += v[8]*xv;
825         sum10 += v[9]*xv;
826         sum11 += v[10]*xv;
827         v     += 11;
828       }
829     }
830     if (usecprow) z = zarray + 11*ridx[i];
831     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
832     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
833 
834     if (!usecprow) z += 11;
835   }
836 
837   PetscCall(VecRestoreArrayRead(xx,&x));
838   PetscCall(VecRestoreArrayWrite(zz,&zarray));
839   PetscCall(PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt));
840   PetscFunctionReturn(0);
841 }
842 
843 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
844 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
845 {
846   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
847   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
848   const PetscScalar *x,*xb;
849   PetscScalar       *zarray,xv;
850   const MatScalar   *v;
851   const PetscInt    *ii,*ij=a->j,*idx;
852   PetscInt          mbs,i,j,k,n,*ridx=NULL;
853   PetscBool         usecprow=a->compressedrow.use;
854 
855   PetscFunctionBegin;
856   PetscCall(VecGetArrayRead(xx,&x));
857   PetscCall(VecGetArrayWrite(zz,&zarray));
858 
859   v = a->a;
860   if (usecprow) {
861     mbs  = a->compressedrow.nrows;
862     ii   = a->compressedrow.i;
863     ridx = a->compressedrow.rindex;
864     PetscCall(PetscArrayzero(zarray,12*a->mbs));
865   } else {
866     mbs = a->mbs;
867     ii  = a->i;
868     z   = zarray;
869   }
870 
871   for (i=0; i<mbs; i++) {
872     n    = ii[i+1] - ii[i];
873     idx  = ij + ii[i];
874     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
875     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;
876 
877     for (j=0; j<n; j++) {
878       xb = x + 12*(idx[j]);
879 
880       for (k=0; k<12; k++) {
881         xv     =  xb[k];
882         sum1  += v[0]*xv;
883         sum2  += v[1]*xv;
884         sum3  += v[2]*xv;
885         sum4  += v[3]*xv;
886         sum5  += v[4]*xv;
887         sum6  += v[5]*xv;
888         sum7  += v[6]*xv;
889         sum8  += v[7]*xv;
890         sum9  += v[8]*xv;
891         sum10 += v[9]*xv;
892         sum11 += v[10]*xv;
893         sum12 += v[11]*xv;
894         v     += 12;
895       }
896     }
897     if (usecprow) z = zarray + 12*ridx[i];
898     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
899     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
900     if (!usecprow) z += 12;
901   }
902   PetscCall(VecRestoreArrayRead(xx,&x));
903   PetscCall(VecRestoreArrayWrite(zz,&zarray));
904   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
905   PetscFunctionReturn(0);
906 }
907 
908 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
909 {
910   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
911   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
912   const PetscScalar *x,*xb;
913   PetscScalar       *zarray,*yarray,xv;
914   const MatScalar   *v;
915   const PetscInt    *ii,*ij=a->j,*idx;
916   PetscInt          mbs = a->mbs,i,j,k,n,*ridx=NULL;
917   PetscBool         usecprow=a->compressedrow.use;
918 
919   PetscFunctionBegin;
920   PetscCall(VecGetArrayRead(xx,&x));
921   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
922 
923   v = a->a;
924   if (usecprow) {
925    if (zz != yy) {
926      PetscCall(PetscArraycpy(zarray,yarray,12*mbs));
927     }
928     mbs  = a->compressedrow.nrows;
929     ii   = a->compressedrow.i;
930     ridx = a->compressedrow.rindex;
931   } else {
932     ii  = a->i;
933     y   = yarray;
934     z   = zarray;
935   }
936 
937   for (i=0; i<mbs; i++) {
938     n    = ii[i+1] - ii[i];
939     idx  = ij + ii[i];
940 
941     if (usecprow) {
942       y = yarray + 12*ridx[i];
943       z = zarray + 12*ridx[i];
944     }
945     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
946     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
947 
948     for (j=0; j<n; j++) {
949       xb = x + 12*(idx[j]);
950 
951       for (k=0; k<12; k++) {
952         xv     =  xb[k];
953         sum1  += v[0]*xv;
954         sum2  += v[1]*xv;
955         sum3  += v[2]*xv;
956         sum4  += v[3]*xv;
957         sum5  += v[4]*xv;
958         sum6  += v[5]*xv;
959         sum7  += v[6]*xv;
960         sum8  += v[7]*xv;
961         sum9  += v[8]*xv;
962         sum10 += v[9]*xv;
963         sum11 += v[10]*xv;
964         sum12 += v[11]*xv;
965         v     += 12;
966       }
967     }
968 
969     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
970     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
971     if (!usecprow) {
972       y += 12;
973       z += 12;
974     }
975   }
976   PetscCall(VecRestoreArrayRead(xx,&x));
977   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
978   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
979   PetscFunctionReturn(0);
980 }
981 
982 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
983 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
984 {
985   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
986   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
987   const PetscScalar *x,*xb;
988   PetscScalar       x1,x2,x3,x4,*zarray;
989   const MatScalar   *v;
990   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
991   PetscInt          mbs,i,j,n;
992   PetscBool         usecprow=a->compressedrow.use;
993 
994   PetscFunctionBegin;
995   PetscCall(VecGetArrayRead(xx,&x));
996   PetscCall(VecGetArrayWrite(zz,&zarray));
997 
998   v = a->a;
999   if (usecprow) {
1000     mbs  = a->compressedrow.nrows;
1001     ii   = a->compressedrow.i;
1002     ridx = a->compressedrow.rindex;
1003     PetscCall(PetscArrayzero(zarray,12*a->mbs));
1004   } else {
1005     mbs = a->mbs;
1006     ii  = a->i;
1007     z   = zarray;
1008   }
1009 
1010   for (i=0; i<mbs; i++) {
1011     n    = ii[i+1] - ii[i];
1012     idx  = ij + ii[i];
1013 
1014     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1015     for (j=0; j<n; j++) {
1016       xb = x + 12*(idx[j]);
1017       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1018 
1019       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1020       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1021       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1022       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1023       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1024       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1025       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1026       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1027       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1028       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1029       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1030       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1031       v += 48;
1032 
1033       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1034 
1035       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1036       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1037       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1038       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1039       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1040       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1041       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1042       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1043       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1044       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1045       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1046       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1047       v     += 48;
1048 
1049       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1050       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1051       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1052       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1053       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1054       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1055       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1056       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1057       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1058       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1059       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1060       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1061       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1062       v     += 48;
1063 
1064     }
1065     if (usecprow) z = zarray + 12*ridx[i];
1066     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1067     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1068     if (!usecprow) z += 12;
1069   }
1070   PetscCall(VecRestoreArrayRead(xx,&x));
1071   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1072   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
1073   PetscFunctionReturn(0);
1074 }
1075 
1076 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1077 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1078 {
1079   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1080   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1081   const PetscScalar *x,*xb;
1082   PetscScalar       x1,x2,x3,x4,*zarray,*yarray;
1083   const MatScalar   *v;
1084   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1085   PetscInt          mbs = a->mbs,i,j,n;
1086   PetscBool         usecprow=a->compressedrow.use;
1087 
1088   PetscFunctionBegin;
1089   PetscCall(VecGetArrayRead(xx,&x));
1090   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1091 
1092   v = a->a;
1093   if (usecprow) {
1094     if (zz != yy) {
1095       PetscCall(PetscArraycpy(zarray,yarray,12*mbs));
1096     }
1097     mbs  = a->compressedrow.nrows;
1098     ii   = a->compressedrow.i;
1099     ridx = a->compressedrow.rindex;
1100   } else {
1101     ii  = a->i;
1102     y   = yarray;
1103     z   = zarray;
1104   }
1105 
1106   for (i=0; i<mbs; i++) {
1107     n    = ii[i+1] - ii[i];
1108     idx  = ij + ii[i];
1109 
1110     if (usecprow) {
1111       y = yarray + 12*ridx[i];
1112       z = zarray + 12*ridx[i];
1113     }
1114     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1115     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
1116 
1117     for (j=0; j<n; j++) {
1118       xb = x + 12*(idx[j]);
1119       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1120 
1121       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1122       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1123       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1124       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1125       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1126       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1127       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1128       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1129       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1130       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1131       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1132       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1133       v += 48;
1134 
1135       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1136 
1137       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1138       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1139       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1140       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1141       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1142       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1143       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1144       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1145       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1146       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1147       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1148       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1149       v     += 48;
1150 
1151       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1152       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1153       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1154       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1155       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1156       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1157       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1158       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1159       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1160       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1161       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1162       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1163       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1164       v     += 48;
1165 
1166     }
1167     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1168     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1169     if (!usecprow) {
1170       y += 12;
1171       z += 12;
1172     }
1173   }
1174   PetscCall(VecRestoreArrayRead(xx,&x));
1175   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1176   PetscCall(PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt));
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1181 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1182 {
1183   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1184   PetscScalar       *z = NULL,*zarray;
1185   const PetscScalar *x,*work;
1186   const MatScalar   *v = a->a;
1187   PetscInt          mbs,i,j,n;
1188   const PetscInt    *idx = a->j,*ii,*ridx=NULL;
1189   PetscBool         usecprow=a->compressedrow.use;
1190   const PetscInt    bs = 12, bs2 = 144;
1191 
1192   __m256d a0,a1,a2,a3,a4,a5;
1193   __m256d w0,w1,w2,w3;
1194   __m256d z0,z1,z2;
1195 
1196   PetscFunctionBegin;
1197   PetscCall(VecGetArrayRead(xx,&x));
1198   PetscCall(VecGetArrayWrite(zz,&zarray));
1199 
1200   if (usecprow) {
1201     mbs  = a->compressedrow.nrows;
1202     ii   = a->compressedrow.i;
1203     ridx = a->compressedrow.rindex;
1204     PetscCall(PetscArrayzero(zarray,bs*a->mbs));
1205   } else {
1206     mbs = a->mbs;
1207     ii  = a->i;
1208     z   = zarray;
1209   }
1210 
1211   for (i=0; i<mbs; i++) {
1212     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
1213 
1214     n  = ii[1] - ii[0]; ii++;
1215     for (j=0; j<n; j++) {
1216       work = x + bs*(*idx++);
1217 
1218       /* first column of a */
1219       w0 = _mm256_set1_pd(work[0]);
1220       a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1221       a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1222       a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);
1223 
1224       /* second column of a */
1225       w1 = _mm256_set1_pd(work[1]);
1226       a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1227       a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1228       a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);
1229 
1230       /* third column of a */
1231       w2 = _mm256_set1_pd(work[2]);
1232       a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1233       a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1234       a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);
1235 
1236       /* fourth column of a */
1237       w3 = _mm256_set1_pd(work[3]);
1238       a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1239       a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1240       a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);
1241 
1242       /* fifth column of a */
1243       w0 = _mm256_set1_pd(work[4]);
1244       a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1245       a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1246       a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);
1247 
1248       /* sixth column of a */
1249       w1 = _mm256_set1_pd(work[5]);
1250       a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1251       a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1252       a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);
1253 
1254       /* seventh column of a */
1255       w2 = _mm256_set1_pd(work[6]);
1256       a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1257       a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1258       a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);
1259 
1260       /* eighth column of a */
1261       w3 = _mm256_set1_pd(work[7]);
1262       a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1263       a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1264       a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);
1265 
1266       /* ninth column of a */
1267       w0 = _mm256_set1_pd(work[8]);
1268       a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1269       a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1270       a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);
1271 
1272       /* tenth column of a */
1273       w1 = _mm256_set1_pd(work[9]);
1274       a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1275       a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1276       a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);
1277 
1278       /* eleventh column of a */
1279       w2 = _mm256_set1_pd(work[10]);
1280       a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1281       a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1282       a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);
1283 
1284       /* twelveth column of a */
1285       w3 = _mm256_set1_pd(work[11]);
1286       a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1287       a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1288       a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);
1289 
1290       v += bs2;
1291     }
1292     if (usecprow) z = zarray + bs*ridx[i];
1293     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1294     if (!usecprow) z += bs;
1295   }
1296   PetscCall(VecRestoreArrayRead(xx,&x));
1297   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1298   PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt));
1299   PetscFunctionReturn(0);
1300 }
1301 #endif
1302 
1303 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1304 /* Default MatMult for block size 15 */
1305 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1306 {
1307   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1308   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1309   const PetscScalar *x,*xb;
1310   PetscScalar       *zarray,xv;
1311   const MatScalar   *v;
1312   const PetscInt    *ii,*ij=a->j,*idx;
1313   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1314   PetscBool         usecprow=a->compressedrow.use;
1315 
1316   PetscFunctionBegin;
1317   PetscCall(VecGetArrayRead(xx,&x));
1318   PetscCall(VecGetArrayWrite(zz,&zarray));
1319 
1320   v = a->a;
1321   if (usecprow) {
1322     mbs  = a->compressedrow.nrows;
1323     ii   = a->compressedrow.i;
1324     ridx = a->compressedrow.rindex;
1325     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1326   } else {
1327     mbs = a->mbs;
1328     ii  = a->i;
1329     z   = zarray;
1330   }
1331 
1332   for (i=0; i<mbs; i++) {
1333     n    = ii[i+1] - ii[i];
1334     idx  = ij + ii[i];
1335     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1336     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1337 
1338     for (j=0; j<n; j++) {
1339       xb = x + 15*(idx[j]);
1340 
1341       for (k=0; k<15; k++) {
1342         xv     =  xb[k];
1343         sum1  += v[0]*xv;
1344         sum2  += v[1]*xv;
1345         sum3  += v[2]*xv;
1346         sum4  += v[3]*xv;
1347         sum5  += v[4]*xv;
1348         sum6  += v[5]*xv;
1349         sum7  += v[6]*xv;
1350         sum8  += v[7]*xv;
1351         sum9  += v[8]*xv;
1352         sum10 += v[9]*xv;
1353         sum11 += v[10]*xv;
1354         sum12 += v[11]*xv;
1355         sum13 += v[12]*xv;
1356         sum14 += v[13]*xv;
1357         sum15 += v[14]*xv;
1358         v     += 15;
1359       }
1360     }
1361     if (usecprow) z = zarray + 15*ridx[i];
1362     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1363     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1364 
1365     if (!usecprow) z += 15;
1366   }
1367 
1368   PetscCall(VecRestoreArrayRead(xx,&x));
1369   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1370   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1371   PetscFunctionReturn(0);
1372 }
1373 
1374 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1375 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1376 {
1377   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1378   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1379   const PetscScalar *x,*xb;
1380   PetscScalar       x1,x2,x3,x4,*zarray;
1381   const MatScalar   *v;
1382   const PetscInt    *ii,*ij=a->j,*idx;
1383   PetscInt          mbs,i,j,n,*ridx=NULL;
1384   PetscBool         usecprow=a->compressedrow.use;
1385 
1386   PetscFunctionBegin;
1387   PetscCall(VecGetArrayRead(xx,&x));
1388   PetscCall(VecGetArrayWrite(zz,&zarray));
1389 
1390   v = a->a;
1391   if (usecprow) {
1392     mbs  = a->compressedrow.nrows;
1393     ii   = a->compressedrow.i;
1394     ridx = a->compressedrow.rindex;
1395     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1396   } else {
1397     mbs = a->mbs;
1398     ii  = a->i;
1399     z   = zarray;
1400   }
1401 
1402   for (i=0; i<mbs; i++) {
1403     n    = ii[i+1] - ii[i];
1404     idx  = ij + ii[i];
1405     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1406     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1407 
1408     for (j=0; j<n; j++) {
1409       xb = x + 15*(idx[j]);
1410       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1411 
1412       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1413       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1414       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1415       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1416       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1417       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1418       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1419       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1420       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1421       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1422       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1423       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1424       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1425       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1426       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1427 
1428       v += 60;
1429 
1430       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1431 
1432       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1433       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1434       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1435       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1436       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1437       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1438       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1439       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1440       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1441       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1442       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1443       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1444       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1445       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1446       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1447       v     += 60;
1448 
1449       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1450       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1451       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1452       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1453       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1454       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1455       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1456       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1457       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1458       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1459       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1460       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1461       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1462       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1463       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1464       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1465       v     += 60;
1466 
1467       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1468       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1469       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1470       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1471       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1472       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1473       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1474       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1475       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1476       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1477       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1478       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1479       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1480       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1481       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1482       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1483       v     += 45;
1484     }
1485     if (usecprow) z = zarray + 15*ridx[i];
1486     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1487     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1488 
1489     if (!usecprow) z += 15;
1490   }
1491 
1492   PetscCall(VecRestoreArrayRead(xx,&x));
1493   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1494   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1495   PetscFunctionReturn(0);
1496 }
1497 
1498 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1499 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1500 {
1501   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1502   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1503   const PetscScalar *x,*xb;
1504   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1505   const MatScalar   *v;
1506   const PetscInt    *ii,*ij=a->j,*idx;
1507   PetscInt          mbs,i,j,n,*ridx=NULL;
1508   PetscBool         usecprow=a->compressedrow.use;
1509 
1510   PetscFunctionBegin;
1511   PetscCall(VecGetArrayRead(xx,&x));
1512   PetscCall(VecGetArrayWrite(zz,&zarray));
1513 
1514   v = a->a;
1515   if (usecprow) {
1516     mbs  = a->compressedrow.nrows;
1517     ii   = a->compressedrow.i;
1518     ridx = a->compressedrow.rindex;
1519     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1520   } else {
1521     mbs = a->mbs;
1522     ii  = a->i;
1523     z   = zarray;
1524   }
1525 
1526   for (i=0; i<mbs; i++) {
1527     n    = ii[i+1] - ii[i];
1528     idx  = ij + ii[i];
1529     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1530     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1531 
1532     for (j=0; j<n; j++) {
1533       xb = x + 15*(idx[j]);
1534       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1535       x8 = xb[7];
1536 
1537       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1538       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1539       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1540       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1541       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1542       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1543       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1544       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1545       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1546       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1547       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1548       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1549       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1550       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1551       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1552       v     += 120;
1553 
1554       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1555 
1556       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1557       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1558       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1559       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1560       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1561       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1562       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1563       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1564       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1565       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1566       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1567       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1568       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1569       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1570       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1571       v     += 105;
1572     }
1573     if (usecprow) z = zarray + 15*ridx[i];
1574     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1575     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1576 
1577     if (!usecprow) z += 15;
1578   }
1579 
1580   PetscCall(VecRestoreArrayRead(xx,&x));
1581   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1582   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1583   PetscFunctionReturn(0);
1584 }
1585 
1586 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1587 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1588 {
1589   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1590   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1591   const PetscScalar *x,*xb;
1592   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1593   const MatScalar   *v;
1594   const PetscInt    *ii,*ij=a->j,*idx;
1595   PetscInt          mbs,i,j,n,*ridx=NULL;
1596   PetscBool         usecprow=a->compressedrow.use;
1597 
1598   PetscFunctionBegin;
1599   PetscCall(VecGetArrayRead(xx,&x));
1600   PetscCall(VecGetArrayWrite(zz,&zarray));
1601 
1602   v = a->a;
1603   if (usecprow) {
1604     mbs  = a->compressedrow.nrows;
1605     ii   = a->compressedrow.i;
1606     ridx = a->compressedrow.rindex;
1607     PetscCall(PetscArrayzero(zarray,15*a->mbs));
1608   } else {
1609     mbs = a->mbs;
1610     ii  = a->i;
1611     z   = zarray;
1612   }
1613 
1614   for (i=0; i<mbs; i++) {
1615     n    = ii[i+1] - ii[i];
1616     idx  = ij + ii[i];
1617     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1618     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1619 
1620     for (j=0; j<n; j++) {
1621       xb = x + 15*(idx[j]);
1622       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1623       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1624 
1625       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1626       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1627       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1628       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1629       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1630       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1631       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1632       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1633       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1634       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1635       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1636       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1637       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1638       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1639       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1640       v     += 225;
1641     }
1642     if (usecprow) z = zarray + 15*ridx[i];
1643     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1644     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1645 
1646     if (!usecprow) z += 15;
1647   }
1648 
1649   PetscCall(VecRestoreArrayRead(xx,&x));
1650   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1651   PetscCall(PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt));
1652   PetscFunctionReturn(0);
1653 }
1654 
1655 /*
1656     This will not work with MatScalar == float because it calls the BLAS
1657 */
1658 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1659 {
1660   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1661   PetscScalar       *z = NULL,*work,*workt,*zarray;
1662   const PetscScalar *x,*xb;
1663   const MatScalar   *v;
1664   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1665   const PetscInt    *idx,*ii,*ridx=NULL;
1666   PetscInt          ncols,k;
1667   PetscBool         usecprow=a->compressedrow.use;
1668 
1669   PetscFunctionBegin;
1670   PetscCall(VecGetArrayRead(xx,&x));
1671   PetscCall(VecGetArrayWrite(zz,&zarray));
1672 
1673   idx = a->j;
1674   v   = a->a;
1675   if (usecprow) {
1676     mbs  = a->compressedrow.nrows;
1677     ii   = a->compressedrow.i;
1678     ridx = a->compressedrow.rindex;
1679     PetscCall(PetscArrayzero(zarray,bs*a->mbs));
1680   } else {
1681     mbs = a->mbs;
1682     ii  = a->i;
1683     z   = zarray;
1684   }
1685 
1686   if (!a->mult_work) {
1687     k    = PetscMax(A->rmap->n,A->cmap->n);
1688     PetscCall(PetscMalloc1(k+1,&a->mult_work));
1689   }
1690   work = a->mult_work;
1691   for (i=0; i<mbs; i++) {
1692     n           = ii[1] - ii[0]; ii++;
1693     ncols       = n*bs;
1694     workt       = work;
1695     for (j=0; j<n; j++) {
1696       xb = x + bs*(*idx++);
1697       for (k=0; k<bs; k++) workt[k] = xb[k];
1698       workt += bs;
1699     }
1700     if (usecprow) z = zarray + bs*ridx[i];
1701     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1702     v += n*bs2;
1703     if (!usecprow) z += bs;
1704   }
1705   PetscCall(VecRestoreArrayRead(xx,&x));
1706   PetscCall(VecRestoreArrayWrite(zz,&zarray));
1707   PetscCall(PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt));
1708   PetscFunctionReturn(0);
1709 }
1710 
1711 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1712 {
1713   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1714   const PetscScalar *x;
1715   PetscScalar       *y,*z,sum;
1716   const MatScalar   *v;
1717   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1718   const PetscInt    *idx,*ii;
1719   PetscBool         usecprow=a->compressedrow.use;
1720 
1721   PetscFunctionBegin;
1722   PetscCall(VecGetArrayRead(xx,&x));
1723   PetscCall(VecGetArrayPair(yy,zz,&y,&z));
1724 
1725   idx = a->j;
1726   v   = a->a;
1727   if (usecprow) {
1728     if (zz != yy) {
1729       PetscCall(PetscArraycpy(z,y,mbs));
1730     }
1731     mbs  = a->compressedrow.nrows;
1732     ii   = a->compressedrow.i;
1733     ridx = a->compressedrow.rindex;
1734   } else {
1735     ii = a->i;
1736   }
1737 
1738   for (i=0; i<mbs; i++) {
1739     n = ii[1] - ii[0];
1740     ii++;
1741     if (!usecprow) {
1742       sum = y[i];
1743     } else {
1744       sum = y[ridx[i]];
1745     }
1746     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1747     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1748     PetscSparseDensePlusDot(sum,x,v,idx,n);
1749     v   += n;
1750     idx += n;
1751     if (usecprow) {
1752       z[ridx[i]] = sum;
1753     } else {
1754       z[i] = sum;
1755     }
1756   }
1757   PetscCall(VecRestoreArrayRead(xx,&x));
1758   PetscCall(VecRestoreArrayPair(yy,zz,&y,&z));
1759   PetscCall(PetscLogFlops(2.0*a->nz));
1760   PetscFunctionReturn(0);
1761 }
1762 
1763 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1764 {
1765   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1766   PetscScalar       *y = NULL,*z = NULL,sum1,sum2;
1767   const PetscScalar *x,*xb;
1768   PetscScalar       x1,x2,*yarray,*zarray;
1769   const MatScalar   *v;
1770   PetscInt          mbs = a->mbs,i,n,j;
1771   const PetscInt    *idx,*ii,*ridx = NULL;
1772   PetscBool         usecprow = a->compressedrow.use;
1773 
1774   PetscFunctionBegin;
1775   PetscCall(VecGetArrayRead(xx,&x));
1776   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1777 
1778   idx = a->j;
1779   v   = a->a;
1780   if (usecprow) {
1781     if (zz != yy) {
1782       PetscCall(PetscArraycpy(zarray,yarray,2*mbs));
1783     }
1784     mbs  = a->compressedrow.nrows;
1785     ii   = a->compressedrow.i;
1786     ridx = a->compressedrow.rindex;
1787   } else {
1788     ii = a->i;
1789     y  = yarray;
1790     z  = zarray;
1791   }
1792 
1793   for (i=0; i<mbs; i++) {
1794     n = ii[1] - ii[0]; ii++;
1795     if (usecprow) {
1796       z = zarray + 2*ridx[i];
1797       y = yarray + 2*ridx[i];
1798     }
1799     sum1 = y[0]; sum2 = y[1];
1800     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1801     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1802     for (j=0; j<n; j++) {
1803       xb = x + 2*(*idx++);
1804       x1 = xb[0];
1805       x2 = xb[1];
1806 
1807       sum1 += v[0]*x1 + v[2]*x2;
1808       sum2 += v[1]*x1 + v[3]*x2;
1809       v    += 4;
1810     }
1811     z[0] = sum1; z[1] = sum2;
1812     if (!usecprow) {
1813       z += 2; y += 2;
1814     }
1815   }
1816   PetscCall(VecRestoreArrayRead(xx,&x));
1817   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1818   PetscCall(PetscLogFlops(4.0*a->nz));
1819   PetscFunctionReturn(0);
1820 }
1821 
1822 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1823 {
1824   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1825   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1826   const PetscScalar *x,*xb;
1827   const MatScalar   *v;
1828   PetscInt          mbs = a->mbs,i,j,n;
1829   const PetscInt    *idx,*ii,*ridx = NULL;
1830   PetscBool         usecprow = a->compressedrow.use;
1831 
1832   PetscFunctionBegin;
1833   PetscCall(VecGetArrayRead(xx,&x));
1834   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1835 
1836   idx = a->j;
1837   v   = a->a;
1838   if (usecprow) {
1839     if (zz != yy) {
1840       PetscCall(PetscArraycpy(zarray,yarray,3*mbs));
1841     }
1842     mbs  = a->compressedrow.nrows;
1843     ii   = a->compressedrow.i;
1844     ridx = a->compressedrow.rindex;
1845   } else {
1846     ii = a->i;
1847     y  = yarray;
1848     z  = zarray;
1849   }
1850 
1851   for (i=0; i<mbs; i++) {
1852     n = ii[1] - ii[0]; ii++;
1853     if (usecprow) {
1854       z = zarray + 3*ridx[i];
1855       y = yarray + 3*ridx[i];
1856     }
1857     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1858     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1859     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1860     for (j=0; j<n; j++) {
1861       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1862       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1863       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1864       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1865       v    += 9;
1866     }
1867     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1868     if (!usecprow) {
1869       z += 3; y += 3;
1870     }
1871   }
1872   PetscCall(VecRestoreArrayRead(xx,&x));
1873   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1874   PetscCall(PetscLogFlops(18.0*a->nz));
1875   PetscFunctionReturn(0);
1876 }
1877 
1878 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1879 {
1880   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1881   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1882   const PetscScalar *x,*xb;
1883   const MatScalar   *v;
1884   PetscInt          mbs = a->mbs,i,j,n;
1885   const PetscInt    *idx,*ii,*ridx=NULL;
1886   PetscBool         usecprow=a->compressedrow.use;
1887 
1888   PetscFunctionBegin;
1889   PetscCall(VecGetArrayRead(xx,&x));
1890   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1891 
1892   idx = a->j;
1893   v   = a->a;
1894   if (usecprow) {
1895     if (zz != yy) {
1896       PetscCall(PetscArraycpy(zarray,yarray,4*mbs));
1897     }
1898     mbs  = a->compressedrow.nrows;
1899     ii   = a->compressedrow.i;
1900     ridx = a->compressedrow.rindex;
1901   } else {
1902     ii = a->i;
1903     y  = yarray;
1904     z  = zarray;
1905   }
1906 
1907   for (i=0; i<mbs; i++) {
1908     n = ii[1] - ii[0]; ii++;
1909     if (usecprow) {
1910       z = zarray + 4*ridx[i];
1911       y = yarray + 4*ridx[i];
1912     }
1913     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1914     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1915     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1916     for (j=0; j<n; j++) {
1917       xb    = x + 4*(*idx++);
1918       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1919       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1920       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1921       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1922       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1923       v    += 16;
1924     }
1925     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1926     if (!usecprow) {
1927       z += 4; y += 4;
1928     }
1929   }
1930   PetscCall(VecRestoreArrayRead(xx,&x));
1931   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1932   PetscCall(PetscLogFlops(32.0*a->nz));
1933   PetscFunctionReturn(0);
1934 }
1935 
1936 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1937 {
1938   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1939   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1940   const PetscScalar *x,*xb;
1941   PetscScalar       *yarray,*zarray;
1942   const MatScalar   *v;
1943   PetscInt          mbs = a->mbs,i,j,n;
1944   const PetscInt    *idx,*ii,*ridx = NULL;
1945   PetscBool         usecprow=a->compressedrow.use;
1946 
1947   PetscFunctionBegin;
1948   PetscCall(VecGetArrayRead(xx,&x));
1949   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
1950 
1951   idx = a->j;
1952   v   = a->a;
1953   if (usecprow) {
1954     if (zz != yy) {
1955       PetscCall(PetscArraycpy(zarray,yarray,5*mbs));
1956     }
1957     mbs  = a->compressedrow.nrows;
1958     ii   = a->compressedrow.i;
1959     ridx = a->compressedrow.rindex;
1960   } else {
1961     ii = a->i;
1962     y  = yarray;
1963     z  = zarray;
1964   }
1965 
1966   for (i=0; i<mbs; i++) {
1967     n = ii[1] - ii[0]; ii++;
1968     if (usecprow) {
1969       z = zarray + 5*ridx[i];
1970       y = yarray + 5*ridx[i];
1971     }
1972     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1973     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1974     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1975     for (j=0; j<n; j++) {
1976       xb    = x + 5*(*idx++);
1977       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1978       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1979       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1980       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1981       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1982       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1983       v    += 25;
1984     }
1985     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1986     if (!usecprow) {
1987       z += 5; y += 5;
1988     }
1989   }
1990   PetscCall(VecRestoreArrayRead(xx,&x));
1991   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
1992   PetscCall(PetscLogFlops(50.0*a->nz));
1993   PetscFunctionReturn(0);
1994 }
1995 
1996 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1997 {
1998   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1999   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
2000   const PetscScalar *x,*xb;
2001   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
2002   const MatScalar   *v;
2003   PetscInt          mbs = a->mbs,i,j,n;
2004   const PetscInt    *idx,*ii,*ridx=NULL;
2005   PetscBool         usecprow=a->compressedrow.use;
2006 
2007   PetscFunctionBegin;
2008   PetscCall(VecGetArrayRead(xx,&x));
2009   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
2010 
2011   idx = a->j;
2012   v   = a->a;
2013   if (usecprow) {
2014     if (zz != yy) {
2015       PetscCall(PetscArraycpy(zarray,yarray,6*mbs));
2016     }
2017     mbs  = a->compressedrow.nrows;
2018     ii   = a->compressedrow.i;
2019     ridx = a->compressedrow.rindex;
2020   } else {
2021     ii = a->i;
2022     y  = yarray;
2023     z  = zarray;
2024   }
2025 
2026   for (i=0; i<mbs; i++) {
2027     n = ii[1] - ii[0]; ii++;
2028     if (usecprow) {
2029       z = zarray + 6*ridx[i];
2030       y = yarray + 6*ridx[i];
2031     }
2032     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2033     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2034     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2035     for (j=0; j<n; j++) {
2036       xb    = x + 6*(*idx++);
2037       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2038       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2039       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2040       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2041       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2042       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2043       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2044       v    += 36;
2045     }
2046     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2047     if (!usecprow) {
2048       z += 6; y += 6;
2049     }
2050   }
2051   PetscCall(VecRestoreArrayRead(xx,&x));
2052   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
2053   PetscCall(PetscLogFlops(72.0*a->nz));
2054   PetscFunctionReturn(0);
2055 }
2056 
2057 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2058 {
2059   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2060   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2061   const PetscScalar *x,*xb;
2062   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2063   const MatScalar   *v;
2064   PetscInt          mbs = a->mbs,i,j,n;
2065   const PetscInt    *idx,*ii,*ridx = NULL;
2066   PetscBool         usecprow=a->compressedrow.use;
2067 
2068   PetscFunctionBegin;
2069   PetscCall(VecGetArrayRead(xx,&x));
2070   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
2071 
2072   idx = a->j;
2073   v   = a->a;
2074   if (usecprow) {
2075     if (zz != yy) {
2076       PetscCall(PetscArraycpy(zarray,yarray,7*mbs));
2077     }
2078     mbs  = a->compressedrow.nrows;
2079     ii   = a->compressedrow.i;
2080     ridx = a->compressedrow.rindex;
2081   } else {
2082     ii = a->i;
2083     y  = yarray;
2084     z  = zarray;
2085   }
2086 
2087   for (i=0; i<mbs; i++) {
2088     n = ii[1] - ii[0]; ii++;
2089     if (usecprow) {
2090       z = zarray + 7*ridx[i];
2091       y = yarray + 7*ridx[i];
2092     }
2093     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2094     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2095     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2096     for (j=0; j<n; j++) {
2097       xb    = x + 7*(*idx++);
2098       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2099       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2100       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2101       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2102       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2103       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2104       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2105       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2106       v    += 49;
2107     }
2108     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2109     if (!usecprow) {
2110       z += 7; y += 7;
2111     }
2112   }
2113   PetscCall(VecRestoreArrayRead(xx,&x));
2114   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
2115   PetscCall(PetscLogFlops(98.0*a->nz));
2116   PetscFunctionReturn(0);
2117 }
2118 
2119 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2120 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2121 {
2122   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2123   PetscScalar       *z = NULL,*work,*workt,*zarray;
2124   const PetscScalar *x,*xb;
2125   const MatScalar   *v;
2126   PetscInt          mbs,i,j,n;
2127   PetscInt          k;
2128   PetscBool         usecprow=a->compressedrow.use;
2129   const PetscInt    *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;
2130 
2131   __m256d a0,a1,a2,a3,a4,a5;
2132   __m256d w0,w1,w2,w3;
2133   __m256d z0,z1,z2;
2134   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
2135 
2136   PetscFunctionBegin;
2137   PetscCall(VecCopy(yy,zz));
2138   PetscCall(VecGetArrayRead(xx,&x));
2139   PetscCall(VecGetArray(zz,&zarray));
2140 
2141   idx = a->j;
2142   v   = a->a;
2143   if (usecprow) {
2144     mbs  = a->compressedrow.nrows;
2145     ii   = a->compressedrow.i;
2146     ridx = a->compressedrow.rindex;
2147   } else {
2148     mbs = a->mbs;
2149     ii  = a->i;
2150     z   = zarray;
2151   }
2152 
2153   if (!a->mult_work) {
2154     k    = PetscMax(A->rmap->n,A->cmap->n);
2155     PetscCall(PetscMalloc1(k+1,&a->mult_work));
2156   }
2157 
2158   work = a->mult_work;
2159   for (i=0; i<mbs; i++) {
2160     n           = ii[1] - ii[0]; ii++;
2161     workt       = work;
2162     for (j=0; j<n; j++) {
2163       xb = x + bs*(*idx++);
2164       for (k=0; k<bs; k++) workt[k] = xb[k];
2165       workt += bs;
2166     }
2167     if (usecprow) z = zarray + bs*ridx[i];
2168 
2169     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
2170 
2171     for (j=0; j<n; j++) {
2172       /* first column of a */
2173       w0 = _mm256_set1_pd(work[j*9  ]);
2174       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2175       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2176       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
2177 
2178       /* second column of a */
2179       w1 = _mm256_set1_pd(work[j*9+ 1]);
2180       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2181       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2182       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2183 
2184       /* third column of a */
2185       w2 = _mm256_set1_pd(work[j*9+ 2]);
2186       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2187       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2188       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
2189 
2190       /* fourth column of a */
2191       w3 = _mm256_set1_pd(work[j*9+ 3]);
2192       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2193       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2194       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
2195 
2196       /* fifth column of a */
2197       w0 = _mm256_set1_pd(work[j*9+ 4]);
2198       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2199       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2200       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
2201 
2202       /* sixth column of a */
2203       w1 = _mm256_set1_pd(work[j*9+ 5]);
2204       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2205       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2206       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2207 
2208       /* seventh column of a */
2209       w2 = _mm256_set1_pd(work[j*9+ 6]);
2210       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2211       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2212       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
2213 
2214       /* eighth column of a */
2215       w3 = _mm256_set1_pd(work[j*9+ 7]);
2216       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2217       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2218       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
2219 
2220       /* ninth column of a */
2221       w0 = _mm256_set1_pd(work[j*9+ 8]);
2222       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2223       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2224       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2225     }
2226 
2227     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
2228 
2229     v += n*bs2;
2230     if (!usecprow) z += bs;
2231   }
2232   PetscCall(VecRestoreArrayRead(xx,&x));
2233   PetscCall(VecRestoreArray(zz,&zarray));
2234   PetscCall(PetscLogFlops(162.0*a->nz));
2235   PetscFunctionReturn(0);
2236 }
2237 #endif
2238 
2239 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2240 {
2241   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2242   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2243   const PetscScalar *x,*xb;
2244   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2245   const MatScalar   *v;
2246   PetscInt          mbs = a->mbs,i,j,n;
2247   const PetscInt    *idx,*ii,*ridx = NULL;
2248   PetscBool         usecprow=a->compressedrow.use;
2249 
2250   PetscFunctionBegin;
2251   PetscCall(VecGetArrayRead(xx,&x));
2252   PetscCall(VecGetArrayPair(yy,zz,&yarray,&zarray));
2253 
2254   idx = a->j;
2255   v   = a->a;
2256   if (usecprow) {
2257     if (zz != yy) {
2258       PetscCall(PetscArraycpy(zarray,yarray,7*mbs));
2259     }
2260     mbs  = a->compressedrow.nrows;
2261     ii   = a->compressedrow.i;
2262     ridx = a->compressedrow.rindex;
2263   } else {
2264     ii = a->i;
2265     y  = yarray;
2266     z  = zarray;
2267   }
2268 
2269   for (i=0; i<mbs; i++) {
2270     n = ii[1] - ii[0]; ii++;
2271     if (usecprow) {
2272       z = zarray + 11*ridx[i];
2273       y = yarray + 11*ridx[i];
2274     }
2275     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2276     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2277     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2278     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2279     for (j=0; j<n; j++) {
2280       xb    = x + 11*(*idx++);
2281       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2282       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
2283       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
2284       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
2285       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
2286       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
2287       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
2288       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
2289       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
2290       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
2291       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
2292       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
2293       v    += 121;
2294     }
2295     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2296     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
2297     if (!usecprow) {
2298       z += 11; y += 11;
2299     }
2300   }
2301   PetscCall(VecRestoreArrayRead(xx,&x));
2302   PetscCall(VecRestoreArrayPair(yy,zz,&yarray,&zarray));
2303   PetscCall(PetscLogFlops(242.0*a->nz));
2304   PetscFunctionReturn(0);
2305 }
2306 
2307 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2308 {
2309   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2310   PetscScalar       *z = NULL,*work,*workt,*zarray;
2311   const PetscScalar *x,*xb;
2312   const MatScalar   *v;
2313   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2314   PetscInt          ncols,k;
2315   const PetscInt    *ridx = NULL,*idx,*ii;
2316   PetscBool         usecprow = a->compressedrow.use;
2317 
2318   PetscFunctionBegin;
2319   PetscCall(VecCopy(yy,zz));
2320   PetscCall(VecGetArrayRead(xx,&x));
2321   PetscCall(VecGetArray(zz,&zarray));
2322 
2323   idx = a->j;
2324   v   = a->a;
2325   if (usecprow) {
2326     mbs  = a->compressedrow.nrows;
2327     ii   = a->compressedrow.i;
2328     ridx = a->compressedrow.rindex;
2329   } else {
2330     mbs = a->mbs;
2331     ii  = a->i;
2332     z   = zarray;
2333   }
2334 
2335   if (!a->mult_work) {
2336     k    = PetscMax(A->rmap->n,A->cmap->n);
2337     PetscCall(PetscMalloc1(k+1,&a->mult_work));
2338   }
2339   work = a->mult_work;
2340   for (i=0; i<mbs; i++) {
2341     n     = ii[1] - ii[0]; ii++;
2342     ncols = n*bs;
2343     workt = work;
2344     for (j=0; j<n; j++) {
2345       xb = x + bs*(*idx++);
2346       for (k=0; k<bs; k++) workt[k] = xb[k];
2347       workt += bs;
2348     }
2349     if (usecprow) z = zarray + bs*ridx[i];
2350     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2351     v += n*bs2;
2352     if (!usecprow) z += bs;
2353   }
2354   PetscCall(VecRestoreArrayRead(xx,&x));
2355   PetscCall(VecRestoreArray(zz,&zarray));
2356   PetscCall(PetscLogFlops(2.0*a->nz*bs2));
2357   PetscFunctionReturn(0);
2358 }
2359 
2360 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2361 {
2362   PetscScalar    zero = 0.0;
2363 
2364   PetscFunctionBegin;
2365   PetscCall(VecSet(zz,zero));
2366   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz));
2367   PetscFunctionReturn(0);
2368 }
2369 
2370 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2371 {
2372   PetscScalar    zero = 0.0;
2373 
2374   PetscFunctionBegin;
2375   PetscCall(VecSet(zz,zero));
2376   PetscCall(MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz));
2377   PetscFunctionReturn(0);
2378 }
2379 
2380 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2381 {
2382   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2383   PetscScalar       *z,x1,x2,x3,x4,x5;
2384   const PetscScalar *x,*xb = NULL;
2385   const MatScalar   *v;
2386   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
2387   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2388   Mat_CompressedRow cprow = a->compressedrow;
2389   PetscBool         usecprow = cprow.use;
2390 
2391   PetscFunctionBegin;
2392   if (yy != zz) PetscCall(VecCopy(yy,zz));
2393   PetscCall(VecGetArrayRead(xx,&x));
2394   PetscCall(VecGetArray(zz,&z));
2395 
2396   idx = a->j;
2397   v   = a->a;
2398   if (usecprow) {
2399     mbs  = cprow.nrows;
2400     ii   = cprow.i;
2401     ridx = cprow.rindex;
2402   } else {
2403     mbs=a->mbs;
2404     ii = a->i;
2405     xb = x;
2406   }
2407 
2408   switch (bs) {
2409   case 1:
2410     for (i=0; i<mbs; i++) {
2411       if (usecprow) xb = x + ridx[i];
2412       x1 = xb[0];
2413       ib = idx + ii[0];
2414       n  = ii[1] - ii[0]; ii++;
2415       for (j=0; j<n; j++) {
2416         rval     = ib[j];
2417         z[rval] += PetscConj(*v) * x1;
2418         v++;
2419       }
2420       if (!usecprow) xb++;
2421     }
2422     break;
2423   case 2:
2424     for (i=0; i<mbs; i++) {
2425       if (usecprow) xb = x + 2*ridx[i];
2426       x1 = xb[0]; x2 = xb[1];
2427       ib = idx + ii[0];
2428       n  = ii[1] - ii[0]; ii++;
2429       for (j=0; j<n; j++) {
2430         rval       = ib[j]*2;
2431         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2432         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2433         v         += 4;
2434       }
2435       if (!usecprow) xb += 2;
2436     }
2437     break;
2438   case 3:
2439     for (i=0; i<mbs; i++) {
2440       if (usecprow) xb = x + 3*ridx[i];
2441       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2442       ib = idx + ii[0];
2443       n  = ii[1] - ii[0]; ii++;
2444       for (j=0; j<n; j++) {
2445         rval       = ib[j]*3;
2446         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2447         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2448         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2449         v         += 9;
2450       }
2451       if (!usecprow) xb += 3;
2452     }
2453     break;
2454   case 4:
2455     for (i=0; i<mbs; i++) {
2456       if (usecprow) xb = x + 4*ridx[i];
2457       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2458       ib = idx + ii[0];
2459       n  = ii[1] - ii[0]; ii++;
2460       for (j=0; j<n; j++) {
2461         rval       = ib[j]*4;
2462         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2463         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2464         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2465         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2466         v         += 16;
2467       }
2468       if (!usecprow) xb += 4;
2469     }
2470     break;
2471   case 5:
2472     for (i=0; i<mbs; i++) {
2473       if (usecprow) xb = x + 5*ridx[i];
2474       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2475       x4 = xb[3]; x5 = xb[4];
2476       ib = idx + ii[0];
2477       n  = ii[1] - ii[0]; ii++;
2478       for (j=0; j<n; j++) {
2479         rval       = ib[j]*5;
2480         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2481         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2482         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2483         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2484         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2485         v         += 25;
2486       }
2487       if (!usecprow) xb += 5;
2488     }
2489     break;
2490   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2491     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2492 #if 0
2493     {
2494       PetscInt          ncols,k,bs2=a->bs2;
2495       PetscScalar       *work,*workt,zb;
2496       const PetscScalar *xtmp;
2497       if (!a->mult_work) {
2498         k    = PetscMax(A->rmap->n,A->cmap->n);
2499         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2500       }
2501       work = a->mult_work;
2502       xtmp = x;
2503       for (i=0; i<mbs; i++) {
2504         n     = ii[1] - ii[0]; ii++;
2505         ncols = n*bs;
2506         PetscCall(PetscArrayzero(work,ncols));
2507         if (usecprow) xtmp = x + bs*ridx[i];
2508         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2509         v += n*bs2;
2510         if (!usecprow) xtmp += bs;
2511         workt = work;
2512         for (j=0; j<n; j++) {
2513           zb = z + bs*(*idx++);
2514           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2515           workt += bs;
2516         }
2517       }
2518     }
2519 #endif
2520   }
2521   PetscCall(VecRestoreArrayRead(xx,&x));
2522   PetscCall(VecRestoreArray(zz,&z));
2523   PetscCall(PetscLogFlops(2.0*a->nz*a->bs2));
2524   PetscFunctionReturn(0);
2525 }
2526 
2527 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2528 {
2529   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2530   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2531   const PetscScalar *x,*xb = NULL;
2532   const MatScalar   *v;
2533   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2534   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2535   Mat_CompressedRow cprow   = a->compressedrow;
2536   PetscBool         usecprow=cprow.use;
2537 
2538   PetscFunctionBegin;
2539   if (yy != zz) PetscCall(VecCopy(yy,zz));
2540   PetscCall(VecGetArrayRead(xx,&x));
2541   PetscCall(VecGetArray(zz,&z));
2542 
2543   idx = a->j;
2544   v   = a->a;
2545   if (usecprow) {
2546     mbs  = cprow.nrows;
2547     ii   = cprow.i;
2548     ridx = cprow.rindex;
2549   } else {
2550     mbs=a->mbs;
2551     ii = a->i;
2552     xb = x;
2553   }
2554 
2555   switch (bs) {
2556   case 1:
2557     for (i=0; i<mbs; i++) {
2558       if (usecprow) xb = x + ridx[i];
2559       x1 = xb[0];
2560       ib = idx + ii[0];
2561       n  = ii[1] - ii[0]; ii++;
2562       for (j=0; j<n; j++) {
2563         rval     = ib[j];
2564         z[rval] += *v * x1;
2565         v++;
2566       }
2567       if (!usecprow) xb++;
2568     }
2569     break;
2570   case 2:
2571     for (i=0; i<mbs; i++) {
2572       if (usecprow) xb = x + 2*ridx[i];
2573       x1 = xb[0]; x2 = xb[1];
2574       ib = idx + ii[0];
2575       n  = ii[1] - ii[0]; ii++;
2576       for (j=0; j<n; j++) {
2577         rval       = ib[j]*2;
2578         z[rval++] += v[0]*x1 + v[1]*x2;
2579         z[rval++] += v[2]*x1 + v[3]*x2;
2580         v         += 4;
2581       }
2582       if (!usecprow) xb += 2;
2583     }
2584     break;
2585   case 3:
2586     for (i=0; i<mbs; i++) {
2587       if (usecprow) xb = x + 3*ridx[i];
2588       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2589       ib = idx + ii[0];
2590       n  = ii[1] - ii[0]; ii++;
2591       for (j=0; j<n; j++) {
2592         rval       = ib[j]*3;
2593         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2594         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2595         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2596         v         += 9;
2597       }
2598       if (!usecprow) xb += 3;
2599     }
2600     break;
2601   case 4:
2602     for (i=0; i<mbs; i++) {
2603       if (usecprow) xb = x + 4*ridx[i];
2604       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2605       ib = idx + ii[0];
2606       n  = ii[1] - ii[0]; ii++;
2607       for (j=0; j<n; j++) {
2608         rval       = ib[j]*4;
2609         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2610         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2611         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2612         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2613         v         += 16;
2614       }
2615       if (!usecprow) xb += 4;
2616     }
2617     break;
2618   case 5:
2619     for (i=0; i<mbs; i++) {
2620       if (usecprow) xb = x + 5*ridx[i];
2621       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2622       x4 = xb[3]; x5 = xb[4];
2623       ib = idx + ii[0];
2624       n  = ii[1] - ii[0]; ii++;
2625       for (j=0; j<n; j++) {
2626         rval       = ib[j]*5;
2627         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2628         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2629         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2630         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2631         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2632         v         += 25;
2633       }
2634       if (!usecprow) xb += 5;
2635     }
2636     break;
2637   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2638     PetscInt          ncols,k;
2639     PetscScalar       *work,*workt;
2640     const PetscScalar *xtmp;
2641     if (!a->mult_work) {
2642       k    = PetscMax(A->rmap->n,A->cmap->n);
2643       PetscCall(PetscMalloc1(k+1,&a->mult_work));
2644     }
2645     work = a->mult_work;
2646     xtmp = x;
2647     for (i=0; i<mbs; i++) {
2648       n     = ii[1] - ii[0]; ii++;
2649       ncols = n*bs;
2650       PetscCall(PetscArrayzero(work,ncols));
2651       if (usecprow) xtmp = x + bs*ridx[i];
2652       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2653       v += n*bs2;
2654       if (!usecprow) xtmp += bs;
2655       workt = work;
2656       for (j=0; j<n; j++) {
2657         zb = z + bs*(*idx++);
2658         for (k=0; k<bs; k++) zb[k] += workt[k];
2659         workt += bs;
2660       }
2661     }
2662     }
2663   }
2664   PetscCall(VecRestoreArrayRead(xx,&x));
2665   PetscCall(VecRestoreArray(zz,&z));
2666   PetscCall(PetscLogFlops(2.0*a->nz*a->bs2));
2667   PetscFunctionReturn(0);
2668 }
2669 
2670 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2671 {
2672   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2673   PetscInt       totalnz = a->bs2*a->nz;
2674   PetscScalar    oalpha  = alpha;
2675   PetscBLASInt   one = 1,tnz;
2676 
2677   PetscFunctionBegin;
2678   PetscCall(PetscBLASIntCast(totalnz,&tnz));
2679   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2680   PetscCall(PetscLogFlops(totalnz));
2681   PetscFunctionReturn(0);
2682 }
2683 
2684 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2685 {
2686   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2687   MatScalar      *v  = a->a;
2688   PetscReal      sum = 0.0;
2689   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2690 
2691   PetscFunctionBegin;
2692   if (type == NORM_FROBENIUS) {
2693 #if defined(PETSC_USE_REAL___FP16)
2694     PetscBLASInt one = 1,cnt = bs2*nz;
2695     PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2696 #else
2697     for (i=0; i<bs2*nz; i++) {
2698       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2699     }
2700 #endif
2701     *norm = PetscSqrtReal(sum);
2702     PetscCall(PetscLogFlops(2.0*bs2*nz));
2703   } else if (type == NORM_1) { /* maximum column sum */
2704     PetscReal *tmp;
2705     PetscInt  *bcol = a->j;
2706     PetscCall(PetscCalloc1(A->cmap->n+1,&tmp));
2707     for (i=0; i<nz; i++) {
2708       for (j=0; j<bs; j++) {
2709         k1 = bs*(*bcol) + j; /* column index */
2710         for (k=0; k<bs; k++) {
2711           tmp[k1] += PetscAbsScalar(*v); v++;
2712         }
2713       }
2714       bcol++;
2715     }
2716     *norm = 0.0;
2717     for (j=0; j<A->cmap->n; j++) {
2718       if (tmp[j] > *norm) *norm = tmp[j];
2719     }
2720     PetscCall(PetscFree(tmp));
2721     PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0)));
2722   } else if (type == NORM_INFINITY) { /* maximum row sum */
2723     *norm = 0.0;
2724     for (k=0; k<bs; k++) {
2725       for (j=0; j<a->mbs; j++) {
2726         v   = a->a + bs2*a->i[j] + k;
2727         sum = 0.0;
2728         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2729           for (k1=0; k1<bs; k1++) {
2730             sum += PetscAbsScalar(*v);
2731             v   += bs;
2732           }
2733         }
2734         if (sum > *norm) *norm = sum;
2735       }
2736     }
2737     PetscCall(PetscLogFlops(PetscMax(bs2*nz-1,0)));
2738   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2739   PetscFunctionReturn(0);
2740 }
2741 
2742 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2743 {
2744   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2745 
2746   PetscFunctionBegin;
2747   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2748   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2749     *flg = PETSC_FALSE;
2750     PetscFunctionReturn(0);
2751   }
2752 
2753   /* if the a->i are the same */
2754   PetscCall(PetscArraycmp(a->i,b->i,a->mbs+1,flg));
2755   if (!*flg) PetscFunctionReturn(0);
2756 
2757   /* if a->j are the same */
2758   PetscCall(PetscArraycmp(a->j,b->j,a->nz,flg));
2759   if (!*flg) PetscFunctionReturn(0);
2760 
2761   /* if a->a are the same */
2762   PetscCall(PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg));
2763   PetscFunctionReturn(0);
2764 
2765 }
2766 
2767 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2768 {
2769   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2770   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2771   PetscScalar    *x,zero = 0.0;
2772   MatScalar      *aa,*aa_j;
2773 
2774   PetscFunctionBegin;
2775   PetscCheck(!A->factortype,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2776   bs   = A->rmap->bs;
2777   aa   = a->a;
2778   ai   = a->i;
2779   aj   = a->j;
2780   ambs = a->mbs;
2781   bs2  = a->bs2;
2782 
2783   PetscCall(VecSet(v,zero));
2784   PetscCall(VecGetArray(v,&x));
2785   PetscCall(VecGetLocalSize(v,&n));
2786   PetscCheck(n == A->rmap->N,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2787   for (i=0; i<ambs; i++) {
2788     for (j=ai[i]; j<ai[i+1]; j++) {
2789       if (aj[j] == i) {
2790         row  = i*bs;
2791         aa_j = aa+j*bs2;
2792         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2793         break;
2794       }
2795     }
2796   }
2797   PetscCall(VecRestoreArray(v,&x));
2798   PetscFunctionReturn(0);
2799 }
2800 
2801 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2802 {
2803   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2804   const PetscScalar *l,*r,*li,*ri;
2805   PetscScalar       x;
2806   MatScalar         *aa, *v;
2807   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2808   const PetscInt    *ai,*aj;
2809 
2810   PetscFunctionBegin;
2811   ai  = a->i;
2812   aj  = a->j;
2813   aa  = a->a;
2814   m   = A->rmap->n;
2815   n   = A->cmap->n;
2816   bs  = A->rmap->bs;
2817   mbs = a->mbs;
2818   bs2 = a->bs2;
2819   if (ll) {
2820     PetscCall(VecGetArrayRead(ll,&l));
2821     PetscCall(VecGetLocalSize(ll,&lm));
2822     PetscCheck(lm == m,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2823     for (i=0; i<mbs; i++) { /* for each block row */
2824       M  = ai[i+1] - ai[i];
2825       li = l + i*bs;
2826       v  = aa + bs2*ai[i];
2827       for (j=0; j<M; j++) { /* for each block */
2828         for (k=0; k<bs2; k++) {
2829           (*v++) *= li[k%bs];
2830         }
2831       }
2832     }
2833     PetscCall(VecRestoreArrayRead(ll,&l));
2834     PetscCall(PetscLogFlops(a->nz));
2835   }
2836 
2837   if (rr) {
2838     PetscCall(VecGetArrayRead(rr,&r));
2839     PetscCall(VecGetLocalSize(rr,&rn));
2840     PetscCheck(rn == n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2841     for (i=0; i<mbs; i++) { /* for each block row */
2842       iai = ai[i];
2843       M   = ai[i+1] - iai;
2844       v   = aa + bs2*iai;
2845       for (j=0; j<M; j++) { /* for each block */
2846         ri = r + bs*aj[iai+j];
2847         for (k=0; k<bs; k++) {
2848           x = ri[k];
2849           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2850           v += bs;
2851         }
2852       }
2853     }
2854     PetscCall(VecRestoreArrayRead(rr,&r));
2855     PetscCall(PetscLogFlops(a->nz));
2856   }
2857   PetscFunctionReturn(0);
2858 }
2859 
2860 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2861 {
2862   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2863 
2864   PetscFunctionBegin;
2865   info->block_size   = a->bs2;
2866   info->nz_allocated = a->bs2*a->maxnz;
2867   info->nz_used      = a->bs2*a->nz;
2868   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2869   info->assemblies   = A->num_ass;
2870   info->mallocs      = A->info.mallocs;
2871   info->memory       = ((PetscObject)A)->mem;
2872   if (A->factortype) {
2873     info->fill_ratio_given  = A->info.fill_ratio_given;
2874     info->fill_ratio_needed = A->info.fill_ratio_needed;
2875     info->factor_mallocs    = A->info.factor_mallocs;
2876   } else {
2877     info->fill_ratio_given  = 0;
2878     info->fill_ratio_needed = 0;
2879     info->factor_mallocs    = 0;
2880   }
2881   PetscFunctionReturn(0);
2882 }
2883 
2884 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2885 {
2886   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2887 
2888   PetscFunctionBegin;
2889   PetscCall(PetscArrayzero(a->a,a->bs2*a->i[a->mbs]));
2890   PetscFunctionReturn(0);
2891 }
2892 
2893 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2894 {
2895   PetscFunctionBegin;
2896   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
2897   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2898   PetscFunctionReturn(0);
2899 }
2900 
2901 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2902 {
2903   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2904   PetscScalar       *z = NULL,sum1;
2905   const PetscScalar *xb;
2906   PetscScalar       x1;
2907   const MatScalar   *v,*vv;
2908   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2909   PetscBool         usecprow=a->compressedrow.use;
2910 
2911   PetscFunctionBegin;
2912   idx = a->j;
2913   v   = a->a;
2914   if (usecprow) {
2915     mbs  = a->compressedrow.nrows;
2916     ii   = a->compressedrow.i;
2917     ridx = a->compressedrow.rindex;
2918   } else {
2919     mbs = a->mbs;
2920     ii  = a->i;
2921     z   = c;
2922   }
2923 
2924   for (i=0; i<mbs; i++) {
2925     n           = ii[1] - ii[0]; ii++;
2926     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2927     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2928     if (usecprow) z = c + ridx[i];
2929     jj = idx;
2930     vv = v;
2931     for (k=0; k<cn; k++) {
2932       idx = jj;
2933       v = vv;
2934       sum1    = 0.0;
2935       for (j=0; j<n; j++) {
2936         xb    = b + (*idx++); x1 = xb[0+k*bm];
2937         sum1 += v[0]*x1;
2938         v    += 1;
2939       }
2940       z[0+k*cm] = sum1;
2941     }
2942     if (!usecprow) z += 1;
2943   }
2944   PetscFunctionReturn(0);
2945 }
2946 
2947 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2948 {
2949   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2950   PetscScalar       *z = NULL,sum1,sum2;
2951   const PetscScalar *xb;
2952   PetscScalar       x1,x2;
2953   const MatScalar   *v,*vv;
2954   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2955   PetscBool         usecprow=a->compressedrow.use;
2956 
2957   PetscFunctionBegin;
2958   idx = a->j;
2959   v   = a->a;
2960   if (usecprow) {
2961     mbs  = a->compressedrow.nrows;
2962     ii   = a->compressedrow.i;
2963     ridx = a->compressedrow.rindex;
2964   } else {
2965     mbs = a->mbs;
2966     ii  = a->i;
2967     z   = c;
2968   }
2969 
2970   for (i=0; i<mbs; i++) {
2971     n           = ii[1] - ii[0]; ii++;
2972     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2973     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2974     if (usecprow) z = c + 2*ridx[i];
2975     jj = idx;
2976     vv = v;
2977     for (k=0; k<cn; k++) {
2978       idx = jj;
2979       v = vv;
2980       sum1    = 0.0; sum2 = 0.0;
2981       for (j=0; j<n; j++) {
2982         xb    = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
2983         sum1 += v[0]*x1 + v[2]*x2;
2984         sum2 += v[1]*x1 + v[3]*x2;
2985         v    += 4;
2986       }
2987       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2988     }
2989     if (!usecprow) z += 2;
2990   }
2991   PetscFunctionReturn(0);
2992 }
2993 
2994 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2995 {
2996   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2997   PetscScalar       *z = NULL,sum1,sum2,sum3;
2998   const PetscScalar *xb;
2999   PetscScalar       x1,x2,x3;
3000   const MatScalar   *v,*vv;
3001   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3002   PetscBool         usecprow=a->compressedrow.use;
3003 
3004   PetscFunctionBegin;
3005   idx = a->j;
3006   v   = a->a;
3007   if (usecprow) {
3008     mbs  = a->compressedrow.nrows;
3009     ii   = a->compressedrow.i;
3010     ridx = a->compressedrow.rindex;
3011   } else {
3012     mbs = a->mbs;
3013     ii  = a->i;
3014     z   = c;
3015   }
3016 
3017   for (i=0; i<mbs; i++) {
3018     n           = ii[1] - ii[0]; ii++;
3019     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3020     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3021     if (usecprow) z = c + 3*ridx[i];
3022     jj = idx;
3023     vv = v;
3024     for (k=0; k<cn; k++) {
3025       idx = jj;
3026       v = vv;
3027       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
3028       for (j=0; j<n; j++) {
3029         xb    = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
3030         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3031         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3032         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3033         v    += 9;
3034       }
3035       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
3036     }
3037     if (!usecprow) z += 3;
3038   }
3039   PetscFunctionReturn(0);
3040 }
3041 
3042 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3043 {
3044   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3045   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4;
3046   const PetscScalar *xb;
3047   PetscScalar       x1,x2,x3,x4;
3048   const MatScalar   *v,*vv;
3049   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3050   PetscBool         usecprow=a->compressedrow.use;
3051 
3052   PetscFunctionBegin;
3053   idx = a->j;
3054   v   = a->a;
3055   if (usecprow) {
3056     mbs  = a->compressedrow.nrows;
3057     ii   = a->compressedrow.i;
3058     ridx = a->compressedrow.rindex;
3059   } else {
3060     mbs = a->mbs;
3061     ii  = a->i;
3062     z   = c;
3063   }
3064 
3065   for (i=0; i<mbs; i++) {
3066     n           = ii[1] - ii[0]; ii++;
3067     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3068     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3069     if (usecprow) z = c + 4*ridx[i];
3070     jj = idx;
3071     vv = v;
3072     for (k=0; k<cn; k++) {
3073       idx = jj;
3074       v = vv;
3075       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3076       for (j=0; j<n; j++) {
3077         xb    = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3078         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3079         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3080         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3081         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3082         v    += 16;
3083       }
3084       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3085     }
3086     if (!usecprow) z += 4;
3087   }
3088   PetscFunctionReturn(0);
3089 }
3090 
3091 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3092 {
3093   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3094   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5;
3095   const PetscScalar *xb;
3096   PetscScalar       x1,x2,x3,x4,x5;
3097   const MatScalar   *v,*vv;
3098   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3099   PetscBool         usecprow=a->compressedrow.use;
3100 
3101   PetscFunctionBegin;
3102   idx = a->j;
3103   v   = a->a;
3104   if (usecprow) {
3105     mbs  = a->compressedrow.nrows;
3106     ii   = a->compressedrow.i;
3107     ridx = a->compressedrow.rindex;
3108   } else {
3109     mbs = a->mbs;
3110     ii  = a->i;
3111     z   = c;
3112   }
3113 
3114   for (i=0; i<mbs; i++) {
3115     n           = ii[1] - ii[0]; ii++;
3116     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3117     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3118     if (usecprow) z = c + 5*ridx[i];
3119     jj = idx;
3120     vv = v;
3121     for (k=0; k<cn; k++) {
3122       idx = jj;
3123       v = vv;
3124       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3125       for (j=0; j<n; j++) {
3126         xb    = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3127         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3128         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3129         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3130         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3131         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3132         v    += 25;
3133       }
3134       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3135     }
3136     if (!usecprow) z += 5;
3137   }
3138   PetscFunctionReturn(0);
3139 }
3140 
3141 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3142 {
3143   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3144   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
3145   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
3146   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3147   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3148   PetscBLASInt      bbs,bcn,bbm,bcm;
3149   PetscScalar       *z = NULL;
3150   PetscScalar       *c,*b;
3151   const MatScalar   *v;
3152   const PetscInt    *idx,*ii,*ridx=NULL;
3153   PetscScalar       _DZero=0.0,_DOne=1.0;
3154   PetscBool         usecprow=a->compressedrow.use;
3155 
3156   PetscFunctionBegin;
3157   if (!cm || !cn) PetscFunctionReturn(0);
3158   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
3159   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
3160   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
3161   b = bd->v;
3162   if (a->nonzerorowcnt != A->rmap->n) {
3163     PetscCall(MatZeroEntries(C));
3164   }
3165   PetscCall(MatDenseGetArray(C,&c));
3166   switch (bs) {
3167   case 1:
3168     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3169     break;
3170   case 2:
3171     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3172     break;
3173   case 3:
3174     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3175     break;
3176   case 4:
3177     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3178     break;
3179   case 5:
3180     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3181     break;
3182   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3183     PetscCall(PetscBLASIntCast(bs,&bbs));
3184     PetscCall(PetscBLASIntCast(cn,&bcn));
3185     PetscCall(PetscBLASIntCast(bm,&bbm));
3186     PetscCall(PetscBLASIntCast(cm,&bcm));
3187     idx = a->j;
3188     v   = a->a;
3189     if (usecprow) {
3190       mbs  = a->compressedrow.nrows;
3191       ii   = a->compressedrow.i;
3192       ridx = a->compressedrow.rindex;
3193     } else {
3194       mbs = a->mbs;
3195       ii  = a->i;
3196       z   = c;
3197     }
3198     for (i=0; i<mbs; i++) {
3199       n = ii[1] - ii[0]; ii++;
3200       if (usecprow) z = c + bs*ridx[i];
3201       if (n) {
3202         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3203         v += bs2;
3204       }
3205       for (j=1; j<n; j++) {
3206         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3207         v += bs2;
3208       }
3209       if (!usecprow) z += bs;
3210     }
3211   }
3212   PetscCall(MatDenseRestoreArray(C,&c));
3213   PetscCall(PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn));
3214   PetscFunctionReturn(0);
3215 }
3216