xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision cedd07cade5cbdfdad435c8172b7ec8972d9cd8d)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8   #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
12 {
13   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
14   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
15   const PetscInt *idx;
16   PetscInt        start, end, *ai, *aj, bs;
17   PetscBT         table;
18 
19   PetscFunctionBegin;
20   m  = a->mbs;
21   ai = a->i;
22   aj = a->j;
23   bs = A->rmap->bs;
24 
25   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
26 
27   PetscCall(PetscBTCreate(m, &table));
28   PetscCall(PetscMalloc1(m + 1, &nidx));
29 
30   for (i = 0; i < is_max; i++) {
31     /* Initialise the two local arrays */
32     isz = 0;
33     PetscCall(PetscBTMemzero(m, table));
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     PetscCall(ISGetIndices(is[i], &idx));
37     PetscCall(ISGetLocalSize(is[i], &n));
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j = 0; j < n; ++j) {
41       ival = idx[j] / bs; /* convert the indices into block indices */
42       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
43       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44     }
45     PetscCall(ISRestoreIndices(is[i], &idx));
46     PetscCall(ISDestroy(&is[i]));
47 
48     k = 0;
49     for (j = 0; j < ov; j++) { /* for each overlap*/
50       n = isz;
51       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row + 1];
55         for (l = start; l < end; l++) {
56           val = aj[l];
57           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58         }
59       }
60     }
61     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
62   }
63   PetscCall(PetscBTDestroy(&table));
64   PetscCall(PetscFree(nidx));
65   PetscFunctionReturn(0);
66 }
67 
68 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
69 {
70   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
71   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
72   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
73   const PetscInt *irow, *icol;
74   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
75   PetscInt       *aj = a->j, *ai = a->i;
76   MatScalar      *mat_a;
77   Mat             C;
78   PetscBool       flag;
79 
80   PetscFunctionBegin;
81   PetscCall(ISGetIndices(isrow, &irow));
82   PetscCall(ISGetIndices(iscol, &icol));
83   PetscCall(ISGetLocalSize(isrow, &nrows));
84   PetscCall(ISGetLocalSize(iscol, &ncols));
85 
86   PetscCall(PetscCalloc1(1 + oldcols, &smap));
87   ssmap = smap;
88   PetscCall(PetscMalloc1(1 + nrows, &lens));
89   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
90   /* determine lens of each row */
91   for (i = 0; i < nrows; i++) {
92     kstart  = ai[irow[i]];
93     kend    = kstart + a->ilen[irow[i]];
94     lens[i] = 0;
95     for (k = kstart; k < kend; k++) {
96       if (ssmap[aj[k]]) lens[i]++;
97     }
98   }
99   /* Create and fill new matrix */
100   if (scall == MAT_REUSE_MATRIX) {
101     c = (Mat_SeqBAIJ *)((*B)->data);
102 
103     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
104     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
105     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
106     PetscCall(PetscArrayzero(c->ilen, c->mbs));
107     C = *B;
108   } else {
109     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
110     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
111     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
112     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
113   }
114   c = (Mat_SeqBAIJ *)(C->data);
115   for (i = 0; i < nrows; i++) {
116     row      = irow[i];
117     kstart   = ai[row];
118     kend     = kstart + a->ilen[row];
119     mat_i    = c->i[i];
120     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
121     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
122     mat_ilen = c->ilen + i;
123     for (k = kstart; k < kend; k++) {
124       if ((tcol = ssmap[a->j[k]])) {
125         *mat_j++ = tcol - 1;
126         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
127         mat_a += bs2;
128         (*mat_ilen)++;
129       }
130     }
131   }
132   /* sort */
133   if (c->j && c->a) {
134     MatScalar *work;
135     PetscCall(PetscMalloc1(bs2, &work));
136     for (i = 0; i < nrows; i++) {
137       PetscInt ilen;
138       mat_i = c->i[i];
139       mat_j = c->j + mat_i;
140       mat_a = c->a + mat_i * bs2;
141       ilen  = c->ilen[i];
142       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
143     }
144     PetscCall(PetscFree(work));
145   }
146 
147   /* Free work space */
148   PetscCall(ISRestoreIndices(iscol, &icol));
149   PetscCall(PetscFree(smap));
150   PetscCall(PetscFree(lens));
151   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
152   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
153 
154   PetscCall(ISRestoreIndices(isrow, &irow));
155   *B = C;
156   PetscFunctionReturn(0);
157 }
158 
159 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
160 {
161   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
162   IS              is1, is2;
163   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
164   const PetscInt *irow, *icol;
165 
166   PetscFunctionBegin;
167   PetscCall(ISGetIndices(isrow, &irow));
168   PetscCall(ISGetIndices(iscol, &icol));
169   PetscCall(ISGetLocalSize(isrow, &nrows));
170   PetscCall(ISGetLocalSize(iscol, &ncols));
171 
172   /* Verify if the indices corespond to each element in a block
173    and form the IS with compressed IS */
174   maxmnbs = PetscMax(a->mbs, a->nbs);
175   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
176   PetscCall(PetscArrayzero(vary, a->mbs));
177   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
178   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
179   count = 0;
180   for (i = 0; i < nrows; i++) {
181     j = irow[i] / bs;
182     if ((vary[j]--) == bs) iary[count++] = j;
183   }
184   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
185 
186   PetscCall(PetscArrayzero(vary, a->nbs));
187   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
188   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
189   count = 0;
190   for (i = 0; i < ncols; i++) {
191     j = icol[i] / bs;
192     if ((vary[j]--) == bs) iary[count++] = j;
193   }
194   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
195   PetscCall(ISRestoreIndices(isrow, &irow));
196   PetscCall(ISRestoreIndices(iscol, &icol));
197   PetscCall(PetscFree2(vary, iary));
198 
199   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
200   PetscCall(ISDestroy(&is1));
201   PetscCall(ISDestroy(&is2));
202   PetscFunctionReturn(0);
203 }
204 
205 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
206 {
207   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
208   Mat_SubSppt *submatj = c->submatis1;
209 
210   PetscFunctionBegin;
211   PetscCall((*submatj->destroy)(C));
212   PetscCall(MatDestroySubMatrix_Private(submatj));
213   PetscFunctionReturn(0);
214 }
215 
216 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
217 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
218 {
219   PetscInt     i;
220   Mat          C;
221   Mat_SeqBAIJ *c;
222   Mat_SubSppt *submatj;
223 
224   PetscFunctionBegin;
225   for (i = 0; i < n; i++) {
226     C       = (*mat)[i];
227     c       = (Mat_SeqBAIJ *)C->data;
228     submatj = c->submatis1;
229     if (submatj) {
230       if (--((PetscObject)C)->refct <= 0) {
231         PetscCall(PetscFree(C->factorprefix));
232         PetscCall((*submatj->destroy)(C));
233         PetscCall(MatDestroySubMatrix_Private(submatj));
234         PetscCall(PetscFree(C->defaultvectype));
235         PetscCall(PetscFree(C->defaultrandtype));
236         PetscCall(PetscLayoutDestroy(&C->rmap));
237         PetscCall(PetscLayoutDestroy(&C->cmap));
238         PetscCall(PetscHeaderDestroy(&C));
239       }
240     } else {
241       PetscCall(MatDestroy(&C));
242     }
243   }
244 
245   /* Destroy Dummy submatrices created for reuse */
246   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
247 
248   PetscCall(PetscFree(*mat));
249   PetscFunctionReturn(0);
250 }
251 
252 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
253 {
254   PetscInt i;
255 
256   PetscFunctionBegin;
257   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
258 
259   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
260   PetscFunctionReturn(0);
261 }
262 
263 /* -------------------------------------------------------*/
264 /* Should check that shapes of vectors and matrices match */
265 /* -------------------------------------------------------*/
266 
267 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
268 {
269   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
270   PetscScalar       *z, sum;
271   const PetscScalar *x;
272   const MatScalar   *v;
273   PetscInt           mbs, i, n;
274   const PetscInt    *idx, *ii, *ridx = NULL;
275   PetscBool          usecprow = a->compressedrow.use;
276 
277   PetscFunctionBegin;
278   PetscCall(VecGetArrayRead(xx, &x));
279   PetscCall(VecGetArrayWrite(zz, &z));
280 
281   if (usecprow) {
282     mbs  = a->compressedrow.nrows;
283     ii   = a->compressedrow.i;
284     ridx = a->compressedrow.rindex;
285     PetscCall(PetscArrayzero(z, a->mbs));
286   } else {
287     mbs = a->mbs;
288     ii  = a->i;
289   }
290 
291   for (i = 0; i < mbs; i++) {
292     n   = ii[1] - ii[0];
293     v   = a->a + ii[0];
294     idx = a->j + ii[0];
295     ii++;
296     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
297     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
298     sum = 0.0;
299     PetscSparseDensePlusDot(sum, x, v, idx, n);
300     if (usecprow) {
301       z[ridx[i]] = sum;
302     } else {
303       z[i] = sum;
304     }
305   }
306   PetscCall(VecRestoreArrayRead(xx, &x));
307   PetscCall(VecRestoreArrayWrite(zz, &z));
308   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
309   PetscFunctionReturn(0);
310 }
311 
312 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
313 {
314   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
315   PetscScalar       *z = NULL, sum1, sum2, *zarray;
316   const PetscScalar *x, *xb;
317   PetscScalar        x1, x2;
318   const MatScalar   *v;
319   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
320   PetscBool          usecprow = a->compressedrow.use;
321 
322   PetscFunctionBegin;
323   PetscCall(VecGetArrayRead(xx, &x));
324   PetscCall(VecGetArrayWrite(zz, &zarray));
325 
326   idx = a->j;
327   v   = a->a;
328   if (usecprow) {
329     mbs  = a->compressedrow.nrows;
330     ii   = a->compressedrow.i;
331     ridx = a->compressedrow.rindex;
332     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
333   } else {
334     mbs = a->mbs;
335     ii  = a->i;
336     z   = zarray;
337   }
338 
339   for (i = 0; i < mbs; i++) {
340     n = ii[1] - ii[0];
341     ii++;
342     sum1 = 0.0;
343     sum2 = 0.0;
344     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
345     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
346     for (j = 0; j < n; j++) {
347       xb = x + 2 * (*idx++);
348       x1 = xb[0];
349       x2 = xb[1];
350       sum1 += v[0] * x1 + v[2] * x2;
351       sum2 += v[1] * x1 + v[3] * x2;
352       v += 4;
353     }
354     if (usecprow) z = zarray + 2 * ridx[i];
355     z[0] = sum1;
356     z[1] = sum2;
357     if (!usecprow) z += 2;
358   }
359   PetscCall(VecRestoreArrayRead(xx, &x));
360   PetscCall(VecRestoreArrayWrite(zz, &zarray));
361   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
362   PetscFunctionReturn(0);
363 }
364 
365 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
366 {
367   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
368   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
369   const PetscScalar *x, *xb;
370   const MatScalar   *v;
371   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
372   PetscBool          usecprow = a->compressedrow.use;
373 
374 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
375   #pragma disjoint(*v, *z, *xb)
376 #endif
377 
378   PetscFunctionBegin;
379   PetscCall(VecGetArrayRead(xx, &x));
380   PetscCall(VecGetArrayWrite(zz, &zarray));
381 
382   idx = a->j;
383   v   = a->a;
384   if (usecprow) {
385     mbs  = a->compressedrow.nrows;
386     ii   = a->compressedrow.i;
387     ridx = a->compressedrow.rindex;
388     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
389   } else {
390     mbs = a->mbs;
391     ii  = a->i;
392     z   = zarray;
393   }
394 
395   for (i = 0; i < mbs; i++) {
396     n = ii[1] - ii[0];
397     ii++;
398     sum1 = 0.0;
399     sum2 = 0.0;
400     sum3 = 0.0;
401     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
402     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
403     for (j = 0; j < n; j++) {
404       xb = x + 3 * (*idx++);
405       x1 = xb[0];
406       x2 = xb[1];
407       x3 = xb[2];
408 
409       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
410       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
411       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
412       v += 9;
413     }
414     if (usecprow) z = zarray + 3 * ridx[i];
415     z[0] = sum1;
416     z[1] = sum2;
417     z[2] = sum3;
418     if (!usecprow) z += 3;
419   }
420   PetscCall(VecRestoreArrayRead(xx, &x));
421   PetscCall(VecRestoreArrayWrite(zz, &zarray));
422   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
423   PetscFunctionReturn(0);
424 }
425 
426 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
427 {
428   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
429   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
430   const PetscScalar *x, *xb;
431   const MatScalar   *v;
432   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
433   PetscBool          usecprow = a->compressedrow.use;
434 
435   PetscFunctionBegin;
436   PetscCall(VecGetArrayRead(xx, &x));
437   PetscCall(VecGetArrayWrite(zz, &zarray));
438 
439   idx = a->j;
440   v   = a->a;
441   if (usecprow) {
442     mbs  = a->compressedrow.nrows;
443     ii   = a->compressedrow.i;
444     ridx = a->compressedrow.rindex;
445     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
446   } else {
447     mbs = a->mbs;
448     ii  = a->i;
449     z   = zarray;
450   }
451 
452   for (i = 0; i < mbs; i++) {
453     n = ii[1] - ii[0];
454     ii++;
455     sum1 = 0.0;
456     sum2 = 0.0;
457     sum3 = 0.0;
458     sum4 = 0.0;
459 
460     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
461     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
462     for (j = 0; j < n; j++) {
463       xb = x + 4 * (*idx++);
464       x1 = xb[0];
465       x2 = xb[1];
466       x3 = xb[2];
467       x4 = xb[3];
468       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
469       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
470       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
471       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
472       v += 16;
473     }
474     if (usecprow) z = zarray + 4 * ridx[i];
475     z[0] = sum1;
476     z[1] = sum2;
477     z[2] = sum3;
478     z[3] = sum4;
479     if (!usecprow) z += 4;
480   }
481   PetscCall(VecRestoreArrayRead(xx, &x));
482   PetscCall(VecRestoreArrayWrite(zz, &zarray));
483   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
484   PetscFunctionReturn(0);
485 }
486 
487 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
488 {
489   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
490   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
491   const PetscScalar *xb, *x;
492   const MatScalar   *v;
493   const PetscInt    *idx, *ii, *ridx = NULL;
494   PetscInt           mbs, i, j, n;
495   PetscBool          usecprow = a->compressedrow.use;
496 
497   PetscFunctionBegin;
498   PetscCall(VecGetArrayRead(xx, &x));
499   PetscCall(VecGetArrayWrite(zz, &zarray));
500 
501   idx = a->j;
502   v   = a->a;
503   if (usecprow) {
504     mbs  = a->compressedrow.nrows;
505     ii   = a->compressedrow.i;
506     ridx = a->compressedrow.rindex;
507     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
508   } else {
509     mbs = a->mbs;
510     ii  = a->i;
511     z   = zarray;
512   }
513 
514   for (i = 0; i < mbs; i++) {
515     n = ii[1] - ii[0];
516     ii++;
517     sum1 = 0.0;
518     sum2 = 0.0;
519     sum3 = 0.0;
520     sum4 = 0.0;
521     sum5 = 0.0;
522     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
523     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
524     for (j = 0; j < n; j++) {
525       xb = x + 5 * (*idx++);
526       x1 = xb[0];
527       x2 = xb[1];
528       x3 = xb[2];
529       x4 = xb[3];
530       x5 = xb[4];
531       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
532       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
533       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
534       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
535       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
536       v += 25;
537     }
538     if (usecprow) z = zarray + 5 * ridx[i];
539     z[0] = sum1;
540     z[1] = sum2;
541     z[2] = sum3;
542     z[3] = sum4;
543     z[4] = sum5;
544     if (!usecprow) z += 5;
545   }
546   PetscCall(VecRestoreArrayRead(xx, &x));
547   PetscCall(VecRestoreArrayWrite(zz, &zarray));
548   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
549   PetscFunctionReturn(0);
550 }
551 
552 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
553 {
554   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
555   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
556   const PetscScalar *x, *xb;
557   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
558   const MatScalar   *v;
559   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
560   PetscBool          usecprow = a->compressedrow.use;
561 
562   PetscFunctionBegin;
563   PetscCall(VecGetArrayRead(xx, &x));
564   PetscCall(VecGetArrayWrite(zz, &zarray));
565 
566   idx = a->j;
567   v   = a->a;
568   if (usecprow) {
569     mbs  = a->compressedrow.nrows;
570     ii   = a->compressedrow.i;
571     ridx = a->compressedrow.rindex;
572     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
573   } else {
574     mbs = a->mbs;
575     ii  = a->i;
576     z   = zarray;
577   }
578 
579   for (i = 0; i < mbs; i++) {
580     n = ii[1] - ii[0];
581     ii++;
582     sum1 = 0.0;
583     sum2 = 0.0;
584     sum3 = 0.0;
585     sum4 = 0.0;
586     sum5 = 0.0;
587     sum6 = 0.0;
588 
589     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
590     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
591     for (j = 0; j < n; j++) {
592       xb = x + 6 * (*idx++);
593       x1 = xb[0];
594       x2 = xb[1];
595       x3 = xb[2];
596       x4 = xb[3];
597       x5 = xb[4];
598       x6 = xb[5];
599       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
600       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
601       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
602       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
603       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
604       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
605       v += 36;
606     }
607     if (usecprow) z = zarray + 6 * ridx[i];
608     z[0] = sum1;
609     z[1] = sum2;
610     z[2] = sum3;
611     z[3] = sum4;
612     z[4] = sum5;
613     z[5] = sum6;
614     if (!usecprow) z += 6;
615   }
616 
617   PetscCall(VecRestoreArrayRead(xx, &x));
618   PetscCall(VecRestoreArrayWrite(zz, &zarray));
619   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
620   PetscFunctionReturn(0);
621 }
622 
623 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
624 {
625   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
626   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
627   const PetscScalar *x, *xb;
628   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
629   const MatScalar   *v;
630   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
631   PetscBool          usecprow = a->compressedrow.use;
632 
633   PetscFunctionBegin;
634   PetscCall(VecGetArrayRead(xx, &x));
635   PetscCall(VecGetArrayWrite(zz, &zarray));
636 
637   idx = a->j;
638   v   = a->a;
639   if (usecprow) {
640     mbs  = a->compressedrow.nrows;
641     ii   = a->compressedrow.i;
642     ridx = a->compressedrow.rindex;
643     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
644   } else {
645     mbs = a->mbs;
646     ii  = a->i;
647     z   = zarray;
648   }
649 
650   for (i = 0; i < mbs; i++) {
651     n = ii[1] - ii[0];
652     ii++;
653     sum1 = 0.0;
654     sum2 = 0.0;
655     sum3 = 0.0;
656     sum4 = 0.0;
657     sum5 = 0.0;
658     sum6 = 0.0;
659     sum7 = 0.0;
660 
661     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
662     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
663     for (j = 0; j < n; j++) {
664       xb = x + 7 * (*idx++);
665       x1 = xb[0];
666       x2 = xb[1];
667       x3 = xb[2];
668       x4 = xb[3];
669       x5 = xb[4];
670       x6 = xb[5];
671       x7 = xb[6];
672       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
673       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
674       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
675       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
676       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
677       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
678       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
679       v += 49;
680     }
681     if (usecprow) z = zarray + 7 * ridx[i];
682     z[0] = sum1;
683     z[1] = sum2;
684     z[2] = sum3;
685     z[3] = sum4;
686     z[4] = sum5;
687     z[5] = sum6;
688     z[6] = sum7;
689     if (!usecprow) z += 7;
690   }
691 
692   PetscCall(VecRestoreArrayRead(xx, &x));
693   PetscCall(VecRestoreArrayWrite(zz, &zarray));
694   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
695   PetscFunctionReturn(0);
696 }
697 
698 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
699 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
700 {
701   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
702   PetscScalar       *z = NULL, *work, *workt, *zarray;
703   const PetscScalar *x, *xb;
704   const MatScalar   *v;
705   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
706   const PetscInt    *idx, *ii, *ridx = NULL;
707   PetscInt           k;
708   PetscBool          usecprow = a->compressedrow.use;
709 
710   __m256d a0, a1, a2, a3, a4, a5;
711   __m256d w0, w1, w2, w3;
712   __m256d z0, z1, z2;
713   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
714 
715   PetscFunctionBegin;
716   PetscCall(VecGetArrayRead(xx, &x));
717   PetscCall(VecGetArrayWrite(zz, &zarray));
718 
719   idx = a->j;
720   v   = a->a;
721   if (usecprow) {
722     mbs  = a->compressedrow.nrows;
723     ii   = a->compressedrow.i;
724     ridx = a->compressedrow.rindex;
725     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
726   } else {
727     mbs = a->mbs;
728     ii  = a->i;
729     z   = zarray;
730   }
731 
732   if (!a->mult_work) {
733     k = PetscMax(A->rmap->n, A->cmap->n);
734     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
735   }
736 
737   work = a->mult_work;
738   for (i = 0; i < mbs; i++) {
739     n = ii[1] - ii[0];
740     ii++;
741     workt = work;
742     for (j = 0; j < n; j++) {
743       xb = x + bs * (*idx++);
744       for (k = 0; k < bs; k++) workt[k] = xb[k];
745       workt += bs;
746     }
747     if (usecprow) z = zarray + bs * ridx[i];
748 
749     z0 = _mm256_setzero_pd();
750     z1 = _mm256_setzero_pd();
751     z2 = _mm256_setzero_pd();
752 
753     for (j = 0; j < n; j++) {
754       /* first column of a */
755       w0 = _mm256_set1_pd(work[j * 9]);
756       a0 = _mm256_loadu_pd(&v[j * 81]);
757       z0 = _mm256_fmadd_pd(a0, w0, z0);
758       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
759       z1 = _mm256_fmadd_pd(a1, w0, z1);
760       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
761       z2 = _mm256_fmadd_pd(a2, w0, z2);
762 
763       /* second column of a */
764       w1 = _mm256_set1_pd(work[j * 9 + 1]);
765       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
766       z0 = _mm256_fmadd_pd(a0, w1, z0);
767       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
768       z1 = _mm256_fmadd_pd(a1, w1, z1);
769       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
770       z2 = _mm256_fmadd_pd(a2, w1, z2);
771 
772       /* third column of a */
773       w2 = _mm256_set1_pd(work[j * 9 + 2]);
774       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
775       z0 = _mm256_fmadd_pd(a3, w2, z0);
776       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
777       z1 = _mm256_fmadd_pd(a4, w2, z1);
778       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
779       z2 = _mm256_fmadd_pd(a5, w2, z2);
780 
781       /* fourth column of a */
782       w3 = _mm256_set1_pd(work[j * 9 + 3]);
783       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
784       z0 = _mm256_fmadd_pd(a0, w3, z0);
785       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
786       z1 = _mm256_fmadd_pd(a1, w3, z1);
787       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
788       z2 = _mm256_fmadd_pd(a2, w3, z2);
789 
790       /* fifth column of a */
791       w0 = _mm256_set1_pd(work[j * 9 + 4]);
792       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
793       z0 = _mm256_fmadd_pd(a3, w0, z0);
794       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
795       z1 = _mm256_fmadd_pd(a4, w0, z1);
796       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
797       z2 = _mm256_fmadd_pd(a5, w0, z2);
798 
799       /* sixth column of a */
800       w1 = _mm256_set1_pd(work[j * 9 + 5]);
801       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
802       z0 = _mm256_fmadd_pd(a0, w1, z0);
803       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
804       z1 = _mm256_fmadd_pd(a1, w1, z1);
805       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
806       z2 = _mm256_fmadd_pd(a2, w1, z2);
807 
808       /* seventh column of a */
809       w2 = _mm256_set1_pd(work[j * 9 + 6]);
810       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
811       z0 = _mm256_fmadd_pd(a0, w2, z0);
812       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
813       z1 = _mm256_fmadd_pd(a1, w2, z1);
814       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
815       z2 = _mm256_fmadd_pd(a2, w2, z2);
816 
817       /* eighth column of a */
818       w3 = _mm256_set1_pd(work[j * 9 + 7]);
819       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
820       z0 = _mm256_fmadd_pd(a3, w3, z0);
821       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
822       z1 = _mm256_fmadd_pd(a4, w3, z1);
823       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
824       z2 = _mm256_fmadd_pd(a5, w3, z2);
825 
826       /* ninth column of a */
827       w0 = _mm256_set1_pd(work[j * 9 + 8]);
828       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
829       z0 = _mm256_fmadd_pd(a0, w0, z0);
830       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
831       z1 = _mm256_fmadd_pd(a1, w0, z1);
832       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
833       z2 = _mm256_fmadd_pd(a2, w0, z2);
834     }
835 
836     _mm256_storeu_pd(&z[0], z0);
837     _mm256_storeu_pd(&z[4], z1);
838     _mm256_maskstore_pd(&z[8], mask1, z2);
839 
840     v += n * bs2;
841     if (!usecprow) z += bs;
842   }
843   PetscCall(VecRestoreArrayRead(xx, &x));
844   PetscCall(VecRestoreArrayWrite(zz, &zarray));
845   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
846   PetscFunctionReturn(0);
847 }
848 #endif
849 
850 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
851 {
852   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
853   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
854   const PetscScalar *x, *xb;
855   PetscScalar       *zarray, xv;
856   const MatScalar   *v;
857   const PetscInt    *ii, *ij = a->j, *idx;
858   PetscInt           mbs, i, j, k, n, *ridx = NULL;
859   PetscBool          usecprow = a->compressedrow.use;
860 
861   PetscFunctionBegin;
862   PetscCall(VecGetArrayRead(xx, &x));
863   PetscCall(VecGetArrayWrite(zz, &zarray));
864 
865   v = a->a;
866   if (usecprow) {
867     mbs  = a->compressedrow.nrows;
868     ii   = a->compressedrow.i;
869     ridx = a->compressedrow.rindex;
870     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
871   } else {
872     mbs = a->mbs;
873     ii  = a->i;
874     z   = zarray;
875   }
876 
877   for (i = 0; i < mbs; i++) {
878     n     = ii[i + 1] - ii[i];
879     idx   = ij + ii[i];
880     sum1  = 0.0;
881     sum2  = 0.0;
882     sum3  = 0.0;
883     sum4  = 0.0;
884     sum5  = 0.0;
885     sum6  = 0.0;
886     sum7  = 0.0;
887     sum8  = 0.0;
888     sum9  = 0.0;
889     sum10 = 0.0;
890     sum11 = 0.0;
891 
892     for (j = 0; j < n; j++) {
893       xb = x + 11 * (idx[j]);
894 
895       for (k = 0; k < 11; k++) {
896         xv = xb[k];
897         sum1 += v[0] * xv;
898         sum2 += v[1] * xv;
899         sum3 += v[2] * xv;
900         sum4 += v[3] * xv;
901         sum5 += v[4] * xv;
902         sum6 += v[5] * xv;
903         sum7 += v[6] * xv;
904         sum8 += v[7] * xv;
905         sum9 += v[8] * xv;
906         sum10 += v[9] * xv;
907         sum11 += v[10] * xv;
908         v += 11;
909       }
910     }
911     if (usecprow) z = zarray + 11 * ridx[i];
912     z[0]  = sum1;
913     z[1]  = sum2;
914     z[2]  = sum3;
915     z[3]  = sum4;
916     z[4]  = sum5;
917     z[5]  = sum6;
918     z[6]  = sum7;
919     z[7]  = sum8;
920     z[8]  = sum9;
921     z[9]  = sum10;
922     z[10] = sum11;
923 
924     if (!usecprow) z += 11;
925   }
926 
927   PetscCall(VecRestoreArrayRead(xx, &x));
928   PetscCall(VecRestoreArrayWrite(zz, &zarray));
929   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
930   PetscFunctionReturn(0);
931 }
932 
933 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
934 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
935 {
936   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
937   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
938   const PetscScalar *x, *xb;
939   PetscScalar       *zarray, xv;
940   const MatScalar   *v;
941   const PetscInt    *ii, *ij = a->j, *idx;
942   PetscInt           mbs, i, j, k, n, *ridx = NULL;
943   PetscBool          usecprow = a->compressedrow.use;
944 
945   PetscFunctionBegin;
946   PetscCall(VecGetArrayRead(xx, &x));
947   PetscCall(VecGetArrayWrite(zz, &zarray));
948 
949   v = a->a;
950   if (usecprow) {
951     mbs  = a->compressedrow.nrows;
952     ii   = a->compressedrow.i;
953     ridx = a->compressedrow.rindex;
954     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
955   } else {
956     mbs = a->mbs;
957     ii  = a->i;
958     z   = zarray;
959   }
960 
961   for (i = 0; i < mbs; i++) {
962     n     = ii[i + 1] - ii[i];
963     idx   = ij + ii[i];
964     sum1  = 0.0;
965     sum2  = 0.0;
966     sum3  = 0.0;
967     sum4  = 0.0;
968     sum5  = 0.0;
969     sum6  = 0.0;
970     sum7  = 0.0;
971     sum8  = 0.0;
972     sum9  = 0.0;
973     sum10 = 0.0;
974     sum11 = 0.0;
975     sum12 = 0.0;
976 
977     for (j = 0; j < n; j++) {
978       xb = x + 12 * (idx[j]);
979 
980       for (k = 0; k < 12; k++) {
981         xv = xb[k];
982         sum1 += v[0] * xv;
983         sum2 += v[1] * xv;
984         sum3 += v[2] * xv;
985         sum4 += v[3] * xv;
986         sum5 += v[4] * xv;
987         sum6 += v[5] * xv;
988         sum7 += v[6] * xv;
989         sum8 += v[7] * xv;
990         sum9 += v[8] * xv;
991         sum10 += v[9] * xv;
992         sum11 += v[10] * xv;
993         sum12 += v[11] * xv;
994         v += 12;
995       }
996     }
997     if (usecprow) z = zarray + 12 * ridx[i];
998     z[0]  = sum1;
999     z[1]  = sum2;
1000     z[2]  = sum3;
1001     z[3]  = sum4;
1002     z[4]  = sum5;
1003     z[5]  = sum6;
1004     z[6]  = sum7;
1005     z[7]  = sum8;
1006     z[8]  = sum9;
1007     z[9]  = sum10;
1008     z[10] = sum11;
1009     z[11] = sum12;
1010     if (!usecprow) z += 12;
1011   }
1012   PetscCall(VecRestoreArrayRead(xx, &x));
1013   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1014   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1015   PetscFunctionReturn(0);
1016 }
1017 
1018 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
1019 {
1020   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1021   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1022   const PetscScalar *x, *xb;
1023   PetscScalar       *zarray, *yarray, xv;
1024   const MatScalar   *v;
1025   const PetscInt    *ii, *ij = a->j, *idx;
1026   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
1027   PetscBool          usecprow = a->compressedrow.use;
1028 
1029   PetscFunctionBegin;
1030   PetscCall(VecGetArrayRead(xx, &x));
1031   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1032 
1033   v = a->a;
1034   if (usecprow) {
1035     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1036     mbs  = a->compressedrow.nrows;
1037     ii   = a->compressedrow.i;
1038     ridx = a->compressedrow.rindex;
1039   } else {
1040     ii = a->i;
1041     y  = yarray;
1042     z  = zarray;
1043   }
1044 
1045   for (i = 0; i < mbs; i++) {
1046     n   = ii[i + 1] - ii[i];
1047     idx = ij + ii[i];
1048 
1049     if (usecprow) {
1050       y = yarray + 12 * ridx[i];
1051       z = zarray + 12 * ridx[i];
1052     }
1053     sum1  = y[0];
1054     sum2  = y[1];
1055     sum3  = y[2];
1056     sum4  = y[3];
1057     sum5  = y[4];
1058     sum6  = y[5];
1059     sum7  = y[6];
1060     sum8  = y[7];
1061     sum9  = y[8];
1062     sum10 = y[9];
1063     sum11 = y[10];
1064     sum12 = y[11];
1065 
1066     for (j = 0; j < n; j++) {
1067       xb = x + 12 * (idx[j]);
1068 
1069       for (k = 0; k < 12; k++) {
1070         xv = xb[k];
1071         sum1 += v[0] * xv;
1072         sum2 += v[1] * xv;
1073         sum3 += v[2] * xv;
1074         sum4 += v[3] * xv;
1075         sum5 += v[4] * xv;
1076         sum6 += v[5] * xv;
1077         sum7 += v[6] * xv;
1078         sum8 += v[7] * xv;
1079         sum9 += v[8] * xv;
1080         sum10 += v[9] * xv;
1081         sum11 += v[10] * xv;
1082         sum12 += v[11] * xv;
1083         v += 12;
1084       }
1085     }
1086 
1087     z[0]  = sum1;
1088     z[1]  = sum2;
1089     z[2]  = sum3;
1090     z[3]  = sum4;
1091     z[4]  = sum5;
1092     z[5]  = sum6;
1093     z[6]  = sum7;
1094     z[7]  = sum8;
1095     z[8]  = sum9;
1096     z[9]  = sum10;
1097     z[10] = sum11;
1098     z[11] = sum12;
1099     if (!usecprow) {
1100       y += 12;
1101       z += 12;
1102     }
1103   }
1104   PetscCall(VecRestoreArrayRead(xx, &x));
1105   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1106   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1107   PetscFunctionReturn(0);
1108 }
1109 
1110 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1111 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1112 {
1113   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1114   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1115   const PetscScalar *x, *xb;
1116   PetscScalar        x1, x2, x3, x4, *zarray;
1117   const MatScalar   *v;
1118   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1119   PetscInt           mbs, i, j, n;
1120   PetscBool          usecprow = a->compressedrow.use;
1121 
1122   PetscFunctionBegin;
1123   PetscCall(VecGetArrayRead(xx, &x));
1124   PetscCall(VecGetArrayWrite(zz, &zarray));
1125 
1126   v = a->a;
1127   if (usecprow) {
1128     mbs  = a->compressedrow.nrows;
1129     ii   = a->compressedrow.i;
1130     ridx = a->compressedrow.rindex;
1131     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1132   } else {
1133     mbs = a->mbs;
1134     ii  = a->i;
1135     z   = zarray;
1136   }
1137 
1138   for (i = 0; i < mbs; i++) {
1139     n   = ii[i + 1] - ii[i];
1140     idx = ij + ii[i];
1141 
1142     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1143     for (j = 0; j < n; j++) {
1144       xb = x + 12 * (idx[j]);
1145       x1 = xb[0];
1146       x2 = xb[1];
1147       x3 = xb[2];
1148       x4 = xb[3];
1149 
1150       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1151       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1152       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1153       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1154       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1155       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1156       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1157       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1158       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1159       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1160       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1161       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1162       v += 48;
1163 
1164       x1 = xb[4];
1165       x2 = xb[5];
1166       x3 = xb[6];
1167       x4 = xb[7];
1168 
1169       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1170       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1171       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1172       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1173       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1174       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1175       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1176       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1177       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1178       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1179       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1180       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1181       v += 48;
1182 
1183       x1 = xb[8];
1184       x2 = xb[9];
1185       x3 = xb[10];
1186       x4 = xb[11];
1187       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1188       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1189       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1190       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1191       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1192       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1193       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1194       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1195       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1196       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1197       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1198       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1199       v += 48;
1200     }
1201     if (usecprow) z = zarray + 12 * ridx[i];
1202     z[0]  = sum1;
1203     z[1]  = sum2;
1204     z[2]  = sum3;
1205     z[3]  = sum4;
1206     z[4]  = sum5;
1207     z[5]  = sum6;
1208     z[6]  = sum7;
1209     z[7]  = sum8;
1210     z[8]  = sum9;
1211     z[9]  = sum10;
1212     z[10] = sum11;
1213     z[11] = sum12;
1214     if (!usecprow) z += 12;
1215   }
1216   PetscCall(VecRestoreArrayRead(xx, &x));
1217   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1218   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1219   PetscFunctionReturn(0);
1220 }
1221 
1222 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1223 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1224 {
1225   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1226   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1227   const PetscScalar *x, *xb;
1228   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1229   const MatScalar   *v;
1230   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1231   PetscInt           mbs      = a->mbs, i, j, n;
1232   PetscBool          usecprow = a->compressedrow.use;
1233 
1234   PetscFunctionBegin;
1235   PetscCall(VecGetArrayRead(xx, &x));
1236   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1237 
1238   v = a->a;
1239   if (usecprow) {
1240     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1241     mbs  = a->compressedrow.nrows;
1242     ii   = a->compressedrow.i;
1243     ridx = a->compressedrow.rindex;
1244   } else {
1245     ii = a->i;
1246     y  = yarray;
1247     z  = zarray;
1248   }
1249 
1250   for (i = 0; i < mbs; i++) {
1251     n   = ii[i + 1] - ii[i];
1252     idx = ij + ii[i];
1253 
1254     if (usecprow) {
1255       y = yarray + 12 * ridx[i];
1256       z = zarray + 12 * ridx[i];
1257     }
1258     sum1  = y[0];
1259     sum2  = y[1];
1260     sum3  = y[2];
1261     sum4  = y[3];
1262     sum5  = y[4];
1263     sum6  = y[5];
1264     sum7  = y[6];
1265     sum8  = y[7];
1266     sum9  = y[8];
1267     sum10 = y[9];
1268     sum11 = y[10];
1269     sum12 = y[11];
1270 
1271     for (j = 0; j < n; j++) {
1272       xb = x + 12 * (idx[j]);
1273       x1 = xb[0];
1274       x2 = xb[1];
1275       x3 = xb[2];
1276       x4 = xb[3];
1277 
1278       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1279       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1280       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1281       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1282       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1283       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1284       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1285       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1286       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1287       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1288       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1289       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1290       v += 48;
1291 
1292       x1 = xb[4];
1293       x2 = xb[5];
1294       x3 = xb[6];
1295       x4 = xb[7];
1296 
1297       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1298       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1299       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1300       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1301       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1302       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1303       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1304       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1305       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1306       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1307       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1308       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1309       v += 48;
1310 
1311       x1 = xb[8];
1312       x2 = xb[9];
1313       x3 = xb[10];
1314       x4 = xb[11];
1315       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1316       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1317       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1318       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1319       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1320       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1321       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1322       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1323       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1324       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1325       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1326       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1327       v += 48;
1328     }
1329     z[0]  = sum1;
1330     z[1]  = sum2;
1331     z[2]  = sum3;
1332     z[3]  = sum4;
1333     z[4]  = sum5;
1334     z[5]  = sum6;
1335     z[6]  = sum7;
1336     z[7]  = sum8;
1337     z[8]  = sum9;
1338     z[9]  = sum10;
1339     z[10] = sum11;
1340     z[11] = sum12;
1341     if (!usecprow) {
1342       y += 12;
1343       z += 12;
1344     }
1345   }
1346   PetscCall(VecRestoreArrayRead(xx, &x));
1347   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1348   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1349   PetscFunctionReturn(0);
1350 }
1351 
1352 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1353 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1354 {
1355   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1356   PetscScalar       *z = NULL, *zarray;
1357   const PetscScalar *x, *work;
1358   const MatScalar   *v = a->a;
1359   PetscInt           mbs, i, j, n;
1360   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1361   PetscBool          usecprow = a->compressedrow.use;
1362   const PetscInt     bs = 12, bs2 = 144;
1363 
1364   __m256d a0, a1, a2, a3, a4, a5;
1365   __m256d w0, w1, w2, w3;
1366   __m256d z0, z1, z2;
1367 
1368   PetscFunctionBegin;
1369   PetscCall(VecGetArrayRead(xx, &x));
1370   PetscCall(VecGetArrayWrite(zz, &zarray));
1371 
1372   if (usecprow) {
1373     mbs  = a->compressedrow.nrows;
1374     ii   = a->compressedrow.i;
1375     ridx = a->compressedrow.rindex;
1376     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1377   } else {
1378     mbs = a->mbs;
1379     ii  = a->i;
1380     z   = zarray;
1381   }
1382 
1383   for (i = 0; i < mbs; i++) {
1384     z0 = _mm256_setzero_pd();
1385     z1 = _mm256_setzero_pd();
1386     z2 = _mm256_setzero_pd();
1387 
1388     n = ii[1] - ii[0];
1389     ii++;
1390     for (j = 0; j < n; j++) {
1391       work = x + bs * (*idx++);
1392 
1393       /* first column of a */
1394       w0 = _mm256_set1_pd(work[0]);
1395       a0 = _mm256_loadu_pd(v + 0);
1396       z0 = _mm256_fmadd_pd(a0, w0, z0);
1397       a1 = _mm256_loadu_pd(v + 4);
1398       z1 = _mm256_fmadd_pd(a1, w0, z1);
1399       a2 = _mm256_loadu_pd(v + 8);
1400       z2 = _mm256_fmadd_pd(a2, w0, z2);
1401 
1402       /* second column of a */
1403       w1 = _mm256_set1_pd(work[1]);
1404       a3 = _mm256_loadu_pd(v + 12);
1405       z0 = _mm256_fmadd_pd(a3, w1, z0);
1406       a4 = _mm256_loadu_pd(v + 16);
1407       z1 = _mm256_fmadd_pd(a4, w1, z1);
1408       a5 = _mm256_loadu_pd(v + 20);
1409       z2 = _mm256_fmadd_pd(a5, w1, z2);
1410 
1411       /* third column of a */
1412       w2 = _mm256_set1_pd(work[2]);
1413       a0 = _mm256_loadu_pd(v + 24);
1414       z0 = _mm256_fmadd_pd(a0, w2, z0);
1415       a1 = _mm256_loadu_pd(v + 28);
1416       z1 = _mm256_fmadd_pd(a1, w2, z1);
1417       a2 = _mm256_loadu_pd(v + 32);
1418       z2 = _mm256_fmadd_pd(a2, w2, z2);
1419 
1420       /* fourth column of a */
1421       w3 = _mm256_set1_pd(work[3]);
1422       a3 = _mm256_loadu_pd(v + 36);
1423       z0 = _mm256_fmadd_pd(a3, w3, z0);
1424       a4 = _mm256_loadu_pd(v + 40);
1425       z1 = _mm256_fmadd_pd(a4, w3, z1);
1426       a5 = _mm256_loadu_pd(v + 44);
1427       z2 = _mm256_fmadd_pd(a5, w3, z2);
1428 
1429       /* fifth column of a */
1430       w0 = _mm256_set1_pd(work[4]);
1431       a0 = _mm256_loadu_pd(v + 48);
1432       z0 = _mm256_fmadd_pd(a0, w0, z0);
1433       a1 = _mm256_loadu_pd(v + 52);
1434       z1 = _mm256_fmadd_pd(a1, w0, z1);
1435       a2 = _mm256_loadu_pd(v + 56);
1436       z2 = _mm256_fmadd_pd(a2, w0, z2);
1437 
1438       /* sixth column of a */
1439       w1 = _mm256_set1_pd(work[5]);
1440       a3 = _mm256_loadu_pd(v + 60);
1441       z0 = _mm256_fmadd_pd(a3, w1, z0);
1442       a4 = _mm256_loadu_pd(v + 64);
1443       z1 = _mm256_fmadd_pd(a4, w1, z1);
1444       a5 = _mm256_loadu_pd(v + 68);
1445       z2 = _mm256_fmadd_pd(a5, w1, z2);
1446 
1447       /* seventh column of a */
1448       w2 = _mm256_set1_pd(work[6]);
1449       a0 = _mm256_loadu_pd(v + 72);
1450       z0 = _mm256_fmadd_pd(a0, w2, z0);
1451       a1 = _mm256_loadu_pd(v + 76);
1452       z1 = _mm256_fmadd_pd(a1, w2, z1);
1453       a2 = _mm256_loadu_pd(v + 80);
1454       z2 = _mm256_fmadd_pd(a2, w2, z2);
1455 
1456       /* eighth column of a */
1457       w3 = _mm256_set1_pd(work[7]);
1458       a3 = _mm256_loadu_pd(v + 84);
1459       z0 = _mm256_fmadd_pd(a3, w3, z0);
1460       a4 = _mm256_loadu_pd(v + 88);
1461       z1 = _mm256_fmadd_pd(a4, w3, z1);
1462       a5 = _mm256_loadu_pd(v + 92);
1463       z2 = _mm256_fmadd_pd(a5, w3, z2);
1464 
1465       /* ninth column of a */
1466       w0 = _mm256_set1_pd(work[8]);
1467       a0 = _mm256_loadu_pd(v + 96);
1468       z0 = _mm256_fmadd_pd(a0, w0, z0);
1469       a1 = _mm256_loadu_pd(v + 100);
1470       z1 = _mm256_fmadd_pd(a1, w0, z1);
1471       a2 = _mm256_loadu_pd(v + 104);
1472       z2 = _mm256_fmadd_pd(a2, w0, z2);
1473 
1474       /* tenth column of a */
1475       w1 = _mm256_set1_pd(work[9]);
1476       a3 = _mm256_loadu_pd(v + 108);
1477       z0 = _mm256_fmadd_pd(a3, w1, z0);
1478       a4 = _mm256_loadu_pd(v + 112);
1479       z1 = _mm256_fmadd_pd(a4, w1, z1);
1480       a5 = _mm256_loadu_pd(v + 116);
1481       z2 = _mm256_fmadd_pd(a5, w1, z2);
1482 
1483       /* eleventh column of a */
1484       w2 = _mm256_set1_pd(work[10]);
1485       a0 = _mm256_loadu_pd(v + 120);
1486       z0 = _mm256_fmadd_pd(a0, w2, z0);
1487       a1 = _mm256_loadu_pd(v + 124);
1488       z1 = _mm256_fmadd_pd(a1, w2, z1);
1489       a2 = _mm256_loadu_pd(v + 128);
1490       z2 = _mm256_fmadd_pd(a2, w2, z2);
1491 
1492       /* twelveth column of a */
1493       w3 = _mm256_set1_pd(work[11]);
1494       a3 = _mm256_loadu_pd(v + 132);
1495       z0 = _mm256_fmadd_pd(a3, w3, z0);
1496       a4 = _mm256_loadu_pd(v + 136);
1497       z1 = _mm256_fmadd_pd(a4, w3, z1);
1498       a5 = _mm256_loadu_pd(v + 140);
1499       z2 = _mm256_fmadd_pd(a5, w3, z2);
1500 
1501       v += bs2;
1502     }
1503     if (usecprow) z = zarray + bs * ridx[i];
1504     _mm256_storeu_pd(&z[0], z0);
1505     _mm256_storeu_pd(&z[4], z1);
1506     _mm256_storeu_pd(&z[8], z2);
1507     if (!usecprow) z += bs;
1508   }
1509   PetscCall(VecRestoreArrayRead(xx, &x));
1510   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1511   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1512   PetscFunctionReturn(0);
1513 }
1514 #endif
1515 
1516 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1517 /* Default MatMult for block size 15 */
1518 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1519 {
1520   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1521   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1522   const PetscScalar *x, *xb;
1523   PetscScalar       *zarray, xv;
1524   const MatScalar   *v;
1525   const PetscInt    *ii, *ij = a->j, *idx;
1526   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1527   PetscBool          usecprow = a->compressedrow.use;
1528 
1529   PetscFunctionBegin;
1530   PetscCall(VecGetArrayRead(xx, &x));
1531   PetscCall(VecGetArrayWrite(zz, &zarray));
1532 
1533   v = a->a;
1534   if (usecprow) {
1535     mbs  = a->compressedrow.nrows;
1536     ii   = a->compressedrow.i;
1537     ridx = a->compressedrow.rindex;
1538     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1539   } else {
1540     mbs = a->mbs;
1541     ii  = a->i;
1542     z   = zarray;
1543   }
1544 
1545   for (i = 0; i < mbs; i++) {
1546     n     = ii[i + 1] - ii[i];
1547     idx   = ij + ii[i];
1548     sum1  = 0.0;
1549     sum2  = 0.0;
1550     sum3  = 0.0;
1551     sum4  = 0.0;
1552     sum5  = 0.0;
1553     sum6  = 0.0;
1554     sum7  = 0.0;
1555     sum8  = 0.0;
1556     sum9  = 0.0;
1557     sum10 = 0.0;
1558     sum11 = 0.0;
1559     sum12 = 0.0;
1560     sum13 = 0.0;
1561     sum14 = 0.0;
1562     sum15 = 0.0;
1563 
1564     for (j = 0; j < n; j++) {
1565       xb = x + 15 * (idx[j]);
1566 
1567       for (k = 0; k < 15; k++) {
1568         xv = xb[k];
1569         sum1 += v[0] * xv;
1570         sum2 += v[1] * xv;
1571         sum3 += v[2] * xv;
1572         sum4 += v[3] * xv;
1573         sum5 += v[4] * xv;
1574         sum6 += v[5] * xv;
1575         sum7 += v[6] * xv;
1576         sum8 += v[7] * xv;
1577         sum9 += v[8] * xv;
1578         sum10 += v[9] * xv;
1579         sum11 += v[10] * xv;
1580         sum12 += v[11] * xv;
1581         sum13 += v[12] * xv;
1582         sum14 += v[13] * xv;
1583         sum15 += v[14] * xv;
1584         v += 15;
1585       }
1586     }
1587     if (usecprow) z = zarray + 15 * ridx[i];
1588     z[0]  = sum1;
1589     z[1]  = sum2;
1590     z[2]  = sum3;
1591     z[3]  = sum4;
1592     z[4]  = sum5;
1593     z[5]  = sum6;
1594     z[6]  = sum7;
1595     z[7]  = sum8;
1596     z[8]  = sum9;
1597     z[9]  = sum10;
1598     z[10] = sum11;
1599     z[11] = sum12;
1600     z[12] = sum13;
1601     z[13] = sum14;
1602     z[14] = sum15;
1603 
1604     if (!usecprow) z += 15;
1605   }
1606 
1607   PetscCall(VecRestoreArrayRead(xx, &x));
1608   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1609   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1610   PetscFunctionReturn(0);
1611 }
1612 
1613 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1614 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1615 {
1616   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1617   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1618   const PetscScalar *x, *xb;
1619   PetscScalar        x1, x2, x3, x4, *zarray;
1620   const MatScalar   *v;
1621   const PetscInt    *ii, *ij = a->j, *idx;
1622   PetscInt           mbs, i, j, n, *ridx = NULL;
1623   PetscBool          usecprow = a->compressedrow.use;
1624 
1625   PetscFunctionBegin;
1626   PetscCall(VecGetArrayRead(xx, &x));
1627   PetscCall(VecGetArrayWrite(zz, &zarray));
1628 
1629   v = a->a;
1630   if (usecprow) {
1631     mbs  = a->compressedrow.nrows;
1632     ii   = a->compressedrow.i;
1633     ridx = a->compressedrow.rindex;
1634     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1635   } else {
1636     mbs = a->mbs;
1637     ii  = a->i;
1638     z   = zarray;
1639   }
1640 
1641   for (i = 0; i < mbs; i++) {
1642     n     = ii[i + 1] - ii[i];
1643     idx   = ij + ii[i];
1644     sum1  = 0.0;
1645     sum2  = 0.0;
1646     sum3  = 0.0;
1647     sum4  = 0.0;
1648     sum5  = 0.0;
1649     sum6  = 0.0;
1650     sum7  = 0.0;
1651     sum8  = 0.0;
1652     sum9  = 0.0;
1653     sum10 = 0.0;
1654     sum11 = 0.0;
1655     sum12 = 0.0;
1656     sum13 = 0.0;
1657     sum14 = 0.0;
1658     sum15 = 0.0;
1659 
1660     for (j = 0; j < n; j++) {
1661       xb = x + 15 * (idx[j]);
1662       x1 = xb[0];
1663       x2 = xb[1];
1664       x3 = xb[2];
1665       x4 = xb[3];
1666 
1667       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1668       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1669       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1670       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1671       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1672       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1673       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1674       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1675       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1676       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1677       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1678       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1679       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1680       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1681       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1682 
1683       v += 60;
1684 
1685       x1 = xb[4];
1686       x2 = xb[5];
1687       x3 = xb[6];
1688       x4 = xb[7];
1689 
1690       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1691       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1692       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1693       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1694       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1695       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1696       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1697       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1698       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1699       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1700       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1701       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1702       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1703       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1704       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1705       v += 60;
1706 
1707       x1 = xb[8];
1708       x2 = xb[9];
1709       x3 = xb[10];
1710       x4 = xb[11];
1711       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1712       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1713       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1714       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1715       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1716       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1717       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1718       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1719       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1720       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1721       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1722       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1723       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1724       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1725       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1726       v += 60;
1727 
1728       x1 = xb[12];
1729       x2 = xb[13];
1730       x3 = xb[14];
1731       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1732       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1733       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1734       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1735       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1736       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1737       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1738       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1739       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1740       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1741       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1742       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1743       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1744       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1745       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1746       v += 45;
1747     }
1748     if (usecprow) z = zarray + 15 * ridx[i];
1749     z[0]  = sum1;
1750     z[1]  = sum2;
1751     z[2]  = sum3;
1752     z[3]  = sum4;
1753     z[4]  = sum5;
1754     z[5]  = sum6;
1755     z[6]  = sum7;
1756     z[7]  = sum8;
1757     z[8]  = sum9;
1758     z[9]  = sum10;
1759     z[10] = sum11;
1760     z[11] = sum12;
1761     z[12] = sum13;
1762     z[13] = sum14;
1763     z[14] = sum15;
1764 
1765     if (!usecprow) z += 15;
1766   }
1767 
1768   PetscCall(VecRestoreArrayRead(xx, &x));
1769   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1770   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1771   PetscFunctionReturn(0);
1772 }
1773 
1774 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1775 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1776 {
1777   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1778   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1779   const PetscScalar *x, *xb;
1780   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1781   const MatScalar   *v;
1782   const PetscInt    *ii, *ij = a->j, *idx;
1783   PetscInt           mbs, i, j, n, *ridx = NULL;
1784   PetscBool          usecprow = a->compressedrow.use;
1785 
1786   PetscFunctionBegin;
1787   PetscCall(VecGetArrayRead(xx, &x));
1788   PetscCall(VecGetArrayWrite(zz, &zarray));
1789 
1790   v = a->a;
1791   if (usecprow) {
1792     mbs  = a->compressedrow.nrows;
1793     ii   = a->compressedrow.i;
1794     ridx = a->compressedrow.rindex;
1795     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1796   } else {
1797     mbs = a->mbs;
1798     ii  = a->i;
1799     z   = zarray;
1800   }
1801 
1802   for (i = 0; i < mbs; i++) {
1803     n     = ii[i + 1] - ii[i];
1804     idx   = ij + ii[i];
1805     sum1  = 0.0;
1806     sum2  = 0.0;
1807     sum3  = 0.0;
1808     sum4  = 0.0;
1809     sum5  = 0.0;
1810     sum6  = 0.0;
1811     sum7  = 0.0;
1812     sum8  = 0.0;
1813     sum9  = 0.0;
1814     sum10 = 0.0;
1815     sum11 = 0.0;
1816     sum12 = 0.0;
1817     sum13 = 0.0;
1818     sum14 = 0.0;
1819     sum15 = 0.0;
1820 
1821     for (j = 0; j < n; j++) {
1822       xb = x + 15 * (idx[j]);
1823       x1 = xb[0];
1824       x2 = xb[1];
1825       x3 = xb[2];
1826       x4 = xb[3];
1827       x5 = xb[4];
1828       x6 = xb[5];
1829       x7 = xb[6];
1830       x8 = xb[7];
1831 
1832       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1833       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1834       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1835       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1836       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1837       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1838       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1839       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1840       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1841       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1842       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1843       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1844       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1845       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1846       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1847       v += 120;
1848 
1849       x1 = xb[8];
1850       x2 = xb[9];
1851       x3 = xb[10];
1852       x4 = xb[11];
1853       x5 = xb[12];
1854       x6 = xb[13];
1855       x7 = xb[14];
1856 
1857       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1858       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1859       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1860       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1861       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1862       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1863       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1864       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1865       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1866       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1867       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1868       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1869       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1870       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1871       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1872       v += 105;
1873     }
1874     if (usecprow) z = zarray + 15 * ridx[i];
1875     z[0]  = sum1;
1876     z[1]  = sum2;
1877     z[2]  = sum3;
1878     z[3]  = sum4;
1879     z[4]  = sum5;
1880     z[5]  = sum6;
1881     z[6]  = sum7;
1882     z[7]  = sum8;
1883     z[8]  = sum9;
1884     z[9]  = sum10;
1885     z[10] = sum11;
1886     z[11] = sum12;
1887     z[12] = sum13;
1888     z[13] = sum14;
1889     z[14] = sum15;
1890 
1891     if (!usecprow) z += 15;
1892   }
1893 
1894   PetscCall(VecRestoreArrayRead(xx, &x));
1895   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1896   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1897   PetscFunctionReturn(0);
1898 }
1899 
1900 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1901 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1902 {
1903   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1904   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1905   const PetscScalar *x, *xb;
1906   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1907   const MatScalar   *v;
1908   const PetscInt    *ii, *ij = a->j, *idx;
1909   PetscInt           mbs, i, j, n, *ridx = NULL;
1910   PetscBool          usecprow = a->compressedrow.use;
1911 
1912   PetscFunctionBegin;
1913   PetscCall(VecGetArrayRead(xx, &x));
1914   PetscCall(VecGetArrayWrite(zz, &zarray));
1915 
1916   v = a->a;
1917   if (usecprow) {
1918     mbs  = a->compressedrow.nrows;
1919     ii   = a->compressedrow.i;
1920     ridx = a->compressedrow.rindex;
1921     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1922   } else {
1923     mbs = a->mbs;
1924     ii  = a->i;
1925     z   = zarray;
1926   }
1927 
1928   for (i = 0; i < mbs; i++) {
1929     n     = ii[i + 1] - ii[i];
1930     idx   = ij + ii[i];
1931     sum1  = 0.0;
1932     sum2  = 0.0;
1933     sum3  = 0.0;
1934     sum4  = 0.0;
1935     sum5  = 0.0;
1936     sum6  = 0.0;
1937     sum7  = 0.0;
1938     sum8  = 0.0;
1939     sum9  = 0.0;
1940     sum10 = 0.0;
1941     sum11 = 0.0;
1942     sum12 = 0.0;
1943     sum13 = 0.0;
1944     sum14 = 0.0;
1945     sum15 = 0.0;
1946 
1947     for (j = 0; j < n; j++) {
1948       xb  = x + 15 * (idx[j]);
1949       x1  = xb[0];
1950       x2  = xb[1];
1951       x3  = xb[2];
1952       x4  = xb[3];
1953       x5  = xb[4];
1954       x6  = xb[5];
1955       x7  = xb[6];
1956       x8  = xb[7];
1957       x9  = xb[8];
1958       x10 = xb[9];
1959       x11 = xb[10];
1960       x12 = xb[11];
1961       x13 = xb[12];
1962       x14 = xb[13];
1963       x15 = xb[14];
1964 
1965       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1966       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1967       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1968       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1969       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1970       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1971       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1972       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1973       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1974       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1975       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1976       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1977       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1978       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1979       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1980       v += 225;
1981     }
1982     if (usecprow) z = zarray + 15 * ridx[i];
1983     z[0]  = sum1;
1984     z[1]  = sum2;
1985     z[2]  = sum3;
1986     z[3]  = sum4;
1987     z[4]  = sum5;
1988     z[5]  = sum6;
1989     z[6]  = sum7;
1990     z[7]  = sum8;
1991     z[8]  = sum9;
1992     z[9]  = sum10;
1993     z[10] = sum11;
1994     z[11] = sum12;
1995     z[12] = sum13;
1996     z[13] = sum14;
1997     z[14] = sum15;
1998 
1999     if (!usecprow) z += 15;
2000   }
2001 
2002   PetscCall(VecRestoreArrayRead(xx, &x));
2003   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2004   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
2005   PetscFunctionReturn(0);
2006 }
2007 
2008 /*
2009     This will not work with MatScalar == float because it calls the BLAS
2010 */
2011 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
2012 {
2013   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2014   PetscScalar       *z = NULL, *work, *workt, *zarray;
2015   const PetscScalar *x, *xb;
2016   const MatScalar   *v;
2017   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2018   const PetscInt    *idx, *ii, *ridx = NULL;
2019   PetscInt           ncols, k;
2020   PetscBool          usecprow = a->compressedrow.use;
2021 
2022   PetscFunctionBegin;
2023   PetscCall(VecGetArrayRead(xx, &x));
2024   PetscCall(VecGetArrayWrite(zz, &zarray));
2025 
2026   idx = a->j;
2027   v   = a->a;
2028   if (usecprow) {
2029     mbs  = a->compressedrow.nrows;
2030     ii   = a->compressedrow.i;
2031     ridx = a->compressedrow.rindex;
2032     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2033   } else {
2034     mbs = a->mbs;
2035     ii  = a->i;
2036     z   = zarray;
2037   }
2038 
2039   if (!a->mult_work) {
2040     k = PetscMax(A->rmap->n, A->cmap->n);
2041     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2042   }
2043   work = a->mult_work;
2044   for (i = 0; i < mbs; i++) {
2045     n = ii[1] - ii[0];
2046     ii++;
2047     ncols = n * bs;
2048     workt = work;
2049     for (j = 0; j < n; j++) {
2050       xb = x + bs * (*idx++);
2051       for (k = 0; k < bs; k++) workt[k] = xb[k];
2052       workt += bs;
2053     }
2054     if (usecprow) z = zarray + bs * ridx[i];
2055     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2056     v += n * bs2;
2057     if (!usecprow) z += bs;
2058   }
2059   PetscCall(VecRestoreArrayRead(xx, &x));
2060   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2061   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2066 {
2067   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2068   const PetscScalar *x;
2069   PetscScalar       *y, *z, sum;
2070   const MatScalar   *v;
2071   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2072   const PetscInt    *idx, *ii;
2073   PetscBool          usecprow = a->compressedrow.use;
2074 
2075   PetscFunctionBegin;
2076   PetscCall(VecGetArrayRead(xx, &x));
2077   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
2078 
2079   idx = a->j;
2080   v   = a->a;
2081   if (usecprow) {
2082     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2083     mbs  = a->compressedrow.nrows;
2084     ii   = a->compressedrow.i;
2085     ridx = a->compressedrow.rindex;
2086   } else {
2087     ii = a->i;
2088   }
2089 
2090   for (i = 0; i < mbs; i++) {
2091     n = ii[1] - ii[0];
2092     ii++;
2093     if (!usecprow) {
2094       sum = y[i];
2095     } else {
2096       sum = y[ridx[i]];
2097     }
2098     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2099     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2100     PetscSparseDensePlusDot(sum, x, v, idx, n);
2101     v += n;
2102     idx += n;
2103     if (usecprow) {
2104       z[ridx[i]] = sum;
2105     } else {
2106       z[i] = sum;
2107     }
2108   }
2109   PetscCall(VecRestoreArrayRead(xx, &x));
2110   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2111   PetscCall(PetscLogFlops(2.0 * a->nz));
2112   PetscFunctionReturn(0);
2113 }
2114 
2115 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2116 {
2117   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2118   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2119   const PetscScalar *x, *xb;
2120   PetscScalar        x1, x2, *yarray, *zarray;
2121   const MatScalar   *v;
2122   PetscInt           mbs = a->mbs, i, n, j;
2123   const PetscInt    *idx, *ii, *ridx = NULL;
2124   PetscBool          usecprow = a->compressedrow.use;
2125 
2126   PetscFunctionBegin;
2127   PetscCall(VecGetArrayRead(xx, &x));
2128   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2129 
2130   idx = a->j;
2131   v   = a->a;
2132   if (usecprow) {
2133     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2134     mbs  = a->compressedrow.nrows;
2135     ii   = a->compressedrow.i;
2136     ridx = a->compressedrow.rindex;
2137   } else {
2138     ii = a->i;
2139     y  = yarray;
2140     z  = zarray;
2141   }
2142 
2143   for (i = 0; i < mbs; i++) {
2144     n = ii[1] - ii[0];
2145     ii++;
2146     if (usecprow) {
2147       z = zarray + 2 * ridx[i];
2148       y = yarray + 2 * ridx[i];
2149     }
2150     sum1 = y[0];
2151     sum2 = y[1];
2152     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2153     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2154     for (j = 0; j < n; j++) {
2155       xb = x + 2 * (*idx++);
2156       x1 = xb[0];
2157       x2 = xb[1];
2158 
2159       sum1 += v[0] * x1 + v[2] * x2;
2160       sum2 += v[1] * x1 + v[3] * x2;
2161       v += 4;
2162     }
2163     z[0] = sum1;
2164     z[1] = sum2;
2165     if (!usecprow) {
2166       z += 2;
2167       y += 2;
2168     }
2169   }
2170   PetscCall(VecRestoreArrayRead(xx, &x));
2171   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2172   PetscCall(PetscLogFlops(4.0 * a->nz));
2173   PetscFunctionReturn(0);
2174 }
2175 
2176 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2177 {
2178   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2179   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2180   const PetscScalar *x, *xb;
2181   const MatScalar   *v;
2182   PetscInt           mbs = a->mbs, i, j, n;
2183   const PetscInt    *idx, *ii, *ridx = NULL;
2184   PetscBool          usecprow = a->compressedrow.use;
2185 
2186   PetscFunctionBegin;
2187   PetscCall(VecGetArrayRead(xx, &x));
2188   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2189 
2190   idx = a->j;
2191   v   = a->a;
2192   if (usecprow) {
2193     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2194     mbs  = a->compressedrow.nrows;
2195     ii   = a->compressedrow.i;
2196     ridx = a->compressedrow.rindex;
2197   } else {
2198     ii = a->i;
2199     y  = yarray;
2200     z  = zarray;
2201   }
2202 
2203   for (i = 0; i < mbs; i++) {
2204     n = ii[1] - ii[0];
2205     ii++;
2206     if (usecprow) {
2207       z = zarray + 3 * ridx[i];
2208       y = yarray + 3 * ridx[i];
2209     }
2210     sum1 = y[0];
2211     sum2 = y[1];
2212     sum3 = y[2];
2213     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2214     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2215     for (j = 0; j < n; j++) {
2216       xb = x + 3 * (*idx++);
2217       x1 = xb[0];
2218       x2 = xb[1];
2219       x3 = xb[2];
2220       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2221       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2222       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2223       v += 9;
2224     }
2225     z[0] = sum1;
2226     z[1] = sum2;
2227     z[2] = sum3;
2228     if (!usecprow) {
2229       z += 3;
2230       y += 3;
2231     }
2232   }
2233   PetscCall(VecRestoreArrayRead(xx, &x));
2234   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2235   PetscCall(PetscLogFlops(18.0 * a->nz));
2236   PetscFunctionReturn(0);
2237 }
2238 
2239 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2240 {
2241   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2242   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2243   const PetscScalar *x, *xb;
2244   const MatScalar   *v;
2245   PetscInt           mbs = a->mbs, i, j, n;
2246   const PetscInt    *idx, *ii, *ridx = NULL;
2247   PetscBool          usecprow = a->compressedrow.use;
2248 
2249   PetscFunctionBegin;
2250   PetscCall(VecGetArrayRead(xx, &x));
2251   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2252 
2253   idx = a->j;
2254   v   = a->a;
2255   if (usecprow) {
2256     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2257     mbs  = a->compressedrow.nrows;
2258     ii   = a->compressedrow.i;
2259     ridx = a->compressedrow.rindex;
2260   } else {
2261     ii = a->i;
2262     y  = yarray;
2263     z  = zarray;
2264   }
2265 
2266   for (i = 0; i < mbs; i++) {
2267     n = ii[1] - ii[0];
2268     ii++;
2269     if (usecprow) {
2270       z = zarray + 4 * ridx[i];
2271       y = yarray + 4 * ridx[i];
2272     }
2273     sum1 = y[0];
2274     sum2 = y[1];
2275     sum3 = y[2];
2276     sum4 = y[3];
2277     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2278     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2279     for (j = 0; j < n; j++) {
2280       xb = x + 4 * (*idx++);
2281       x1 = xb[0];
2282       x2 = xb[1];
2283       x3 = xb[2];
2284       x4 = xb[3];
2285       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2286       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2287       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2288       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2289       v += 16;
2290     }
2291     z[0] = sum1;
2292     z[1] = sum2;
2293     z[2] = sum3;
2294     z[3] = sum4;
2295     if (!usecprow) {
2296       z += 4;
2297       y += 4;
2298     }
2299   }
2300   PetscCall(VecRestoreArrayRead(xx, &x));
2301   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2302   PetscCall(PetscLogFlops(32.0 * a->nz));
2303   PetscFunctionReturn(0);
2304 }
2305 
2306 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2307 {
2308   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2309   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2310   const PetscScalar *x, *xb;
2311   PetscScalar       *yarray, *zarray;
2312   const MatScalar   *v;
2313   PetscInt           mbs = a->mbs, i, j, n;
2314   const PetscInt    *idx, *ii, *ridx = NULL;
2315   PetscBool          usecprow = a->compressedrow.use;
2316 
2317   PetscFunctionBegin;
2318   PetscCall(VecGetArrayRead(xx, &x));
2319   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2320 
2321   idx = a->j;
2322   v   = a->a;
2323   if (usecprow) {
2324     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2325     mbs  = a->compressedrow.nrows;
2326     ii   = a->compressedrow.i;
2327     ridx = a->compressedrow.rindex;
2328   } else {
2329     ii = a->i;
2330     y  = yarray;
2331     z  = zarray;
2332   }
2333 
2334   for (i = 0; i < mbs; i++) {
2335     n = ii[1] - ii[0];
2336     ii++;
2337     if (usecprow) {
2338       z = zarray + 5 * ridx[i];
2339       y = yarray + 5 * ridx[i];
2340     }
2341     sum1 = y[0];
2342     sum2 = y[1];
2343     sum3 = y[2];
2344     sum4 = y[3];
2345     sum5 = y[4];
2346     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2347     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2348     for (j = 0; j < n; j++) {
2349       xb = x + 5 * (*idx++);
2350       x1 = xb[0];
2351       x2 = xb[1];
2352       x3 = xb[2];
2353       x4 = xb[3];
2354       x5 = xb[4];
2355       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2356       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2357       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2358       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2359       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2360       v += 25;
2361     }
2362     z[0] = sum1;
2363     z[1] = sum2;
2364     z[2] = sum3;
2365     z[3] = sum4;
2366     z[4] = sum5;
2367     if (!usecprow) {
2368       z += 5;
2369       y += 5;
2370     }
2371   }
2372   PetscCall(VecRestoreArrayRead(xx, &x));
2373   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2374   PetscCall(PetscLogFlops(50.0 * a->nz));
2375   PetscFunctionReturn(0);
2376 }
2377 
2378 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2379 {
2380   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2381   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2382   const PetscScalar *x, *xb;
2383   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2384   const MatScalar   *v;
2385   PetscInt           mbs = a->mbs, i, j, n;
2386   const PetscInt    *idx, *ii, *ridx = NULL;
2387   PetscBool          usecprow = a->compressedrow.use;
2388 
2389   PetscFunctionBegin;
2390   PetscCall(VecGetArrayRead(xx, &x));
2391   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2392 
2393   idx = a->j;
2394   v   = a->a;
2395   if (usecprow) {
2396     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2397     mbs  = a->compressedrow.nrows;
2398     ii   = a->compressedrow.i;
2399     ridx = a->compressedrow.rindex;
2400   } else {
2401     ii = a->i;
2402     y  = yarray;
2403     z  = zarray;
2404   }
2405 
2406   for (i = 0; i < mbs; i++) {
2407     n = ii[1] - ii[0];
2408     ii++;
2409     if (usecprow) {
2410       z = zarray + 6 * ridx[i];
2411       y = yarray + 6 * ridx[i];
2412     }
2413     sum1 = y[0];
2414     sum2 = y[1];
2415     sum3 = y[2];
2416     sum4 = y[3];
2417     sum5 = y[4];
2418     sum6 = y[5];
2419     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2420     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2421     for (j = 0; j < n; j++) {
2422       xb = x + 6 * (*idx++);
2423       x1 = xb[0];
2424       x2 = xb[1];
2425       x3 = xb[2];
2426       x4 = xb[3];
2427       x5 = xb[4];
2428       x6 = xb[5];
2429       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2430       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2431       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2432       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2433       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2434       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2435       v += 36;
2436     }
2437     z[0] = sum1;
2438     z[1] = sum2;
2439     z[2] = sum3;
2440     z[3] = sum4;
2441     z[4] = sum5;
2442     z[5] = sum6;
2443     if (!usecprow) {
2444       z += 6;
2445       y += 6;
2446     }
2447   }
2448   PetscCall(VecRestoreArrayRead(xx, &x));
2449   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2450   PetscCall(PetscLogFlops(72.0 * a->nz));
2451   PetscFunctionReturn(0);
2452 }
2453 
2454 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2455 {
2456   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2457   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2458   const PetscScalar *x, *xb;
2459   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2460   const MatScalar   *v;
2461   PetscInt           mbs = a->mbs, i, j, n;
2462   const PetscInt    *idx, *ii, *ridx = NULL;
2463   PetscBool          usecprow = a->compressedrow.use;
2464 
2465   PetscFunctionBegin;
2466   PetscCall(VecGetArrayRead(xx, &x));
2467   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2468 
2469   idx = a->j;
2470   v   = a->a;
2471   if (usecprow) {
2472     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2473     mbs  = a->compressedrow.nrows;
2474     ii   = a->compressedrow.i;
2475     ridx = a->compressedrow.rindex;
2476   } else {
2477     ii = a->i;
2478     y  = yarray;
2479     z  = zarray;
2480   }
2481 
2482   for (i = 0; i < mbs; i++) {
2483     n = ii[1] - ii[0];
2484     ii++;
2485     if (usecprow) {
2486       z = zarray + 7 * ridx[i];
2487       y = yarray + 7 * ridx[i];
2488     }
2489     sum1 = y[0];
2490     sum2 = y[1];
2491     sum3 = y[2];
2492     sum4 = y[3];
2493     sum5 = y[4];
2494     sum6 = y[5];
2495     sum7 = y[6];
2496     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2497     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2498     for (j = 0; j < n; j++) {
2499       xb = x + 7 * (*idx++);
2500       x1 = xb[0];
2501       x2 = xb[1];
2502       x3 = xb[2];
2503       x4 = xb[3];
2504       x5 = xb[4];
2505       x6 = xb[5];
2506       x7 = xb[6];
2507       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2508       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2509       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2510       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2511       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2512       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2513       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2514       v += 49;
2515     }
2516     z[0] = sum1;
2517     z[1] = sum2;
2518     z[2] = sum3;
2519     z[3] = sum4;
2520     z[4] = sum5;
2521     z[5] = sum6;
2522     z[6] = sum7;
2523     if (!usecprow) {
2524       z += 7;
2525       y += 7;
2526     }
2527   }
2528   PetscCall(VecRestoreArrayRead(xx, &x));
2529   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2530   PetscCall(PetscLogFlops(98.0 * a->nz));
2531   PetscFunctionReturn(0);
2532 }
2533 
2534 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2535 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2536 {
2537   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2538   PetscScalar       *z = NULL, *work, *workt, *zarray;
2539   const PetscScalar *x, *xb;
2540   const MatScalar   *v;
2541   PetscInt           mbs, i, j, n;
2542   PetscInt           k;
2543   PetscBool          usecprow = a->compressedrow.use;
2544   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2545 
2546   __m256d a0, a1, a2, a3, a4, a5;
2547   __m256d w0, w1, w2, w3;
2548   __m256d z0, z1, z2;
2549   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2550 
2551   PetscFunctionBegin;
2552   PetscCall(VecCopy(yy, zz));
2553   PetscCall(VecGetArrayRead(xx, &x));
2554   PetscCall(VecGetArray(zz, &zarray));
2555 
2556   idx = a->j;
2557   v   = a->a;
2558   if (usecprow) {
2559     mbs  = a->compressedrow.nrows;
2560     ii   = a->compressedrow.i;
2561     ridx = a->compressedrow.rindex;
2562   } else {
2563     mbs = a->mbs;
2564     ii  = a->i;
2565     z   = zarray;
2566   }
2567 
2568   if (!a->mult_work) {
2569     k = PetscMax(A->rmap->n, A->cmap->n);
2570     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2571   }
2572 
2573   work = a->mult_work;
2574   for (i = 0; i < mbs; i++) {
2575     n = ii[1] - ii[0];
2576     ii++;
2577     workt = work;
2578     for (j = 0; j < n; j++) {
2579       xb = x + bs * (*idx++);
2580       for (k = 0; k < bs; k++) workt[k] = xb[k];
2581       workt += bs;
2582     }
2583     if (usecprow) z = zarray + bs * ridx[i];
2584 
2585     z0 = _mm256_loadu_pd(&z[0]);
2586     z1 = _mm256_loadu_pd(&z[4]);
2587     z2 = _mm256_set1_pd(z[8]);
2588 
2589     for (j = 0; j < n; j++) {
2590       /* first column of a */
2591       w0 = _mm256_set1_pd(work[j * 9]);
2592       a0 = _mm256_loadu_pd(&v[j * 81]);
2593       z0 = _mm256_fmadd_pd(a0, w0, z0);
2594       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2595       z1 = _mm256_fmadd_pd(a1, w0, z1);
2596       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2597       z2 = _mm256_fmadd_pd(a2, w0, z2);
2598 
2599       /* second column of a */
2600       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2601       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2602       z0 = _mm256_fmadd_pd(a0, w1, z0);
2603       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2604       z1 = _mm256_fmadd_pd(a1, w1, z1);
2605       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2606       z2 = _mm256_fmadd_pd(a2, w1, z2);
2607 
2608       /* third column of a */
2609       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2610       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2611       z0 = _mm256_fmadd_pd(a3, w2, z0);
2612       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2613       z1 = _mm256_fmadd_pd(a4, w2, z1);
2614       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2615       z2 = _mm256_fmadd_pd(a5, w2, z2);
2616 
2617       /* fourth column of a */
2618       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2619       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2620       z0 = _mm256_fmadd_pd(a0, w3, z0);
2621       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2622       z1 = _mm256_fmadd_pd(a1, w3, z1);
2623       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2624       z2 = _mm256_fmadd_pd(a2, w3, z2);
2625 
2626       /* fifth column of a */
2627       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2628       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2629       z0 = _mm256_fmadd_pd(a3, w0, z0);
2630       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2631       z1 = _mm256_fmadd_pd(a4, w0, z1);
2632       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2633       z2 = _mm256_fmadd_pd(a5, w0, z2);
2634 
2635       /* sixth column of a */
2636       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2637       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2638       z0 = _mm256_fmadd_pd(a0, w1, z0);
2639       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2640       z1 = _mm256_fmadd_pd(a1, w1, z1);
2641       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2642       z2 = _mm256_fmadd_pd(a2, w1, z2);
2643 
2644       /* seventh column of a */
2645       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2646       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2647       z0 = _mm256_fmadd_pd(a0, w2, z0);
2648       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2649       z1 = _mm256_fmadd_pd(a1, w2, z1);
2650       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2651       z2 = _mm256_fmadd_pd(a2, w2, z2);
2652 
2653       /* eighth column of a */
2654       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2655       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2656       z0 = _mm256_fmadd_pd(a3, w3, z0);
2657       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2658       z1 = _mm256_fmadd_pd(a4, w3, z1);
2659       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2660       z2 = _mm256_fmadd_pd(a5, w3, z2);
2661 
2662       /* ninth column of a */
2663       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2664       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2665       z0 = _mm256_fmadd_pd(a0, w0, z0);
2666       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2667       z1 = _mm256_fmadd_pd(a1, w0, z1);
2668       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2669       z2 = _mm256_fmadd_pd(a2, w0, z2);
2670     }
2671 
2672     _mm256_storeu_pd(&z[0], z0);
2673     _mm256_storeu_pd(&z[4], z1);
2674     _mm256_maskstore_pd(&z[8], mask1, z2);
2675 
2676     v += n * bs2;
2677     if (!usecprow) z += bs;
2678   }
2679   PetscCall(VecRestoreArrayRead(xx, &x));
2680   PetscCall(VecRestoreArray(zz, &zarray));
2681   PetscCall(PetscLogFlops(162.0 * a->nz));
2682   PetscFunctionReturn(0);
2683 }
2684 #endif
2685 
2686 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2687 {
2688   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2689   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2690   const PetscScalar *x, *xb;
2691   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2692   const MatScalar   *v;
2693   PetscInt           mbs = a->mbs, i, j, n;
2694   const PetscInt    *idx, *ii, *ridx = NULL;
2695   PetscBool          usecprow = a->compressedrow.use;
2696 
2697   PetscFunctionBegin;
2698   PetscCall(VecGetArrayRead(xx, &x));
2699   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2700 
2701   idx = a->j;
2702   v   = a->a;
2703   if (usecprow) {
2704     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2705     mbs  = a->compressedrow.nrows;
2706     ii   = a->compressedrow.i;
2707     ridx = a->compressedrow.rindex;
2708   } else {
2709     ii = a->i;
2710     y  = yarray;
2711     z  = zarray;
2712   }
2713 
2714   for (i = 0; i < mbs; i++) {
2715     n = ii[1] - ii[0];
2716     ii++;
2717     if (usecprow) {
2718       z = zarray + 11 * ridx[i];
2719       y = yarray + 11 * ridx[i];
2720     }
2721     sum1  = y[0];
2722     sum2  = y[1];
2723     sum3  = y[2];
2724     sum4  = y[3];
2725     sum5  = y[4];
2726     sum6  = y[5];
2727     sum7  = y[6];
2728     sum8  = y[7];
2729     sum9  = y[8];
2730     sum10 = y[9];
2731     sum11 = y[10];
2732     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2733     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2734     for (j = 0; j < n; j++) {
2735       xb  = x + 11 * (*idx++);
2736       x1  = xb[0];
2737       x2  = xb[1];
2738       x3  = xb[2];
2739       x4  = xb[3];
2740       x5  = xb[4];
2741       x6  = xb[5];
2742       x7  = xb[6];
2743       x8  = xb[7];
2744       x9  = xb[8];
2745       x10 = xb[9];
2746       x11 = xb[10];
2747       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2748       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2749       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2750       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2751       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2752       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2753       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2754       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2755       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2756       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2757       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2758       v += 121;
2759     }
2760     z[0]  = sum1;
2761     z[1]  = sum2;
2762     z[2]  = sum3;
2763     z[3]  = sum4;
2764     z[4]  = sum5;
2765     z[5]  = sum6;
2766     z[6]  = sum7;
2767     z[7]  = sum8;
2768     z[8]  = sum9;
2769     z[9]  = sum10;
2770     z[10] = sum11;
2771     if (!usecprow) {
2772       z += 11;
2773       y += 11;
2774     }
2775   }
2776   PetscCall(VecRestoreArrayRead(xx, &x));
2777   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2778   PetscCall(PetscLogFlops(242.0 * a->nz));
2779   PetscFunctionReturn(0);
2780 }
2781 
2782 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2783 {
2784   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2785   PetscScalar       *z = NULL, *work, *workt, *zarray;
2786   const PetscScalar *x, *xb;
2787   const MatScalar   *v;
2788   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2789   PetscInt           ncols, k;
2790   const PetscInt    *ridx     = NULL, *idx, *ii;
2791   PetscBool          usecprow = a->compressedrow.use;
2792 
2793   PetscFunctionBegin;
2794   PetscCall(VecCopy(yy, zz));
2795   PetscCall(VecGetArrayRead(xx, &x));
2796   PetscCall(VecGetArray(zz, &zarray));
2797 
2798   idx = a->j;
2799   v   = a->a;
2800   if (usecprow) {
2801     mbs  = a->compressedrow.nrows;
2802     ii   = a->compressedrow.i;
2803     ridx = a->compressedrow.rindex;
2804   } else {
2805     mbs = a->mbs;
2806     ii  = a->i;
2807     z   = zarray;
2808   }
2809 
2810   if (!a->mult_work) {
2811     k = PetscMax(A->rmap->n, A->cmap->n);
2812     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2813   }
2814   work = a->mult_work;
2815   for (i = 0; i < mbs; i++) {
2816     n = ii[1] - ii[0];
2817     ii++;
2818     ncols = n * bs;
2819     workt = work;
2820     for (j = 0; j < n; j++) {
2821       xb = x + bs * (*idx++);
2822       for (k = 0; k < bs; k++) workt[k] = xb[k];
2823       workt += bs;
2824     }
2825     if (usecprow) z = zarray + bs * ridx[i];
2826     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2827     v += n * bs2;
2828     if (!usecprow) z += bs;
2829   }
2830   PetscCall(VecRestoreArrayRead(xx, &x));
2831   PetscCall(VecRestoreArray(zz, &zarray));
2832   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2833   PetscFunctionReturn(0);
2834 }
2835 
2836 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2837 {
2838   PetscScalar zero = 0.0;
2839 
2840   PetscFunctionBegin;
2841   PetscCall(VecSet(zz, zero));
2842   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2843   PetscFunctionReturn(0);
2844 }
2845 
2846 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2847 {
2848   PetscScalar zero = 0.0;
2849 
2850   PetscFunctionBegin;
2851   PetscCall(VecSet(zz, zero));
2852   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2853   PetscFunctionReturn(0);
2854 }
2855 
2856 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2857 {
2858   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2859   PetscScalar       *z, x1, x2, x3, x4, x5;
2860   const PetscScalar *x, *xb = NULL;
2861   const MatScalar   *v;
2862   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2863   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2864   Mat_CompressedRow  cprow    = a->compressedrow;
2865   PetscBool          usecprow = cprow.use;
2866 
2867   PetscFunctionBegin;
2868   if (yy != zz) PetscCall(VecCopy(yy, zz));
2869   PetscCall(VecGetArrayRead(xx, &x));
2870   PetscCall(VecGetArray(zz, &z));
2871 
2872   idx = a->j;
2873   v   = a->a;
2874   if (usecprow) {
2875     mbs  = cprow.nrows;
2876     ii   = cprow.i;
2877     ridx = cprow.rindex;
2878   } else {
2879     mbs = a->mbs;
2880     ii  = a->i;
2881     xb  = x;
2882   }
2883 
2884   switch (bs) {
2885   case 1:
2886     for (i = 0; i < mbs; i++) {
2887       if (usecprow) xb = x + ridx[i];
2888       x1 = xb[0];
2889       ib = idx + ii[0];
2890       n  = ii[1] - ii[0];
2891       ii++;
2892       for (j = 0; j < n; j++) {
2893         rval = ib[j];
2894         z[rval] += PetscConj(*v) * x1;
2895         v++;
2896       }
2897       if (!usecprow) xb++;
2898     }
2899     break;
2900   case 2:
2901     for (i = 0; i < mbs; i++) {
2902       if (usecprow) xb = x + 2 * ridx[i];
2903       x1 = xb[0];
2904       x2 = xb[1];
2905       ib = idx + ii[0];
2906       n  = ii[1] - ii[0];
2907       ii++;
2908       for (j = 0; j < n; j++) {
2909         rval = ib[j] * 2;
2910         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2911         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2912         v += 4;
2913       }
2914       if (!usecprow) xb += 2;
2915     }
2916     break;
2917   case 3:
2918     for (i = 0; i < mbs; i++) {
2919       if (usecprow) xb = x + 3 * ridx[i];
2920       x1 = xb[0];
2921       x2 = xb[1];
2922       x3 = xb[2];
2923       ib = idx + ii[0];
2924       n  = ii[1] - ii[0];
2925       ii++;
2926       for (j = 0; j < n; j++) {
2927         rval = ib[j] * 3;
2928         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2929         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2930         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2931         v += 9;
2932       }
2933       if (!usecprow) xb += 3;
2934     }
2935     break;
2936   case 4:
2937     for (i = 0; i < mbs; i++) {
2938       if (usecprow) xb = x + 4 * ridx[i];
2939       x1 = xb[0];
2940       x2 = xb[1];
2941       x3 = xb[2];
2942       x4 = xb[3];
2943       ib = idx + ii[0];
2944       n  = ii[1] - ii[0];
2945       ii++;
2946       for (j = 0; j < n; j++) {
2947         rval = ib[j] * 4;
2948         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2949         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2950         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2951         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2952         v += 16;
2953       }
2954       if (!usecprow) xb += 4;
2955     }
2956     break;
2957   case 5:
2958     for (i = 0; i < mbs; i++) {
2959       if (usecprow) xb = x + 5 * ridx[i];
2960       x1 = xb[0];
2961       x2 = xb[1];
2962       x3 = xb[2];
2963       x4 = xb[3];
2964       x5 = xb[4];
2965       ib = idx + ii[0];
2966       n  = ii[1] - ii[0];
2967       ii++;
2968       for (j = 0; j < n; j++) {
2969         rval = ib[j] * 5;
2970         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2971         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2972         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2973         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2974         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2975         v += 25;
2976       }
2977       if (!usecprow) xb += 5;
2978     }
2979     break;
2980   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2981     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2982 #if 0
2983     {
2984       PetscInt          ncols,k,bs2=a->bs2;
2985       PetscScalar       *work,*workt,zb;
2986       const PetscScalar *xtmp;
2987       if (!a->mult_work) {
2988         k    = PetscMax(A->rmap->n,A->cmap->n);
2989         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2990       }
2991       work = a->mult_work;
2992       xtmp = x;
2993       for (i=0; i<mbs; i++) {
2994         n     = ii[1] - ii[0]; ii++;
2995         ncols = n*bs;
2996         PetscCall(PetscArrayzero(work,ncols));
2997         if (usecprow) xtmp = x + bs*ridx[i];
2998         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2999         v += n*bs2;
3000         if (!usecprow) xtmp += bs;
3001         workt = work;
3002         for (j=0; j<n; j++) {
3003           zb = z + bs*(*idx++);
3004           for (k=0; k<bs; k++) zb[k] += workt[k] ;
3005           workt += bs;
3006         }
3007       }
3008     }
3009 #endif
3010   }
3011   PetscCall(VecRestoreArrayRead(xx, &x));
3012   PetscCall(VecRestoreArray(zz, &z));
3013   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3014   PetscFunctionReturn(0);
3015 }
3016 
3017 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
3018 {
3019   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3020   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
3021   const PetscScalar *x, *xb = NULL;
3022   const MatScalar   *v;
3023   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3024   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
3025   Mat_CompressedRow  cprow    = a->compressedrow;
3026   PetscBool          usecprow = cprow.use;
3027 
3028   PetscFunctionBegin;
3029   if (yy != zz) PetscCall(VecCopy(yy, zz));
3030   PetscCall(VecGetArrayRead(xx, &x));
3031   PetscCall(VecGetArray(zz, &z));
3032 
3033   idx = a->j;
3034   v   = a->a;
3035   if (usecprow) {
3036     mbs  = cprow.nrows;
3037     ii   = cprow.i;
3038     ridx = cprow.rindex;
3039   } else {
3040     mbs = a->mbs;
3041     ii  = a->i;
3042     xb  = x;
3043   }
3044 
3045   switch (bs) {
3046   case 1:
3047     for (i = 0; i < mbs; i++) {
3048       if (usecprow) xb = x + ridx[i];
3049       x1 = xb[0];
3050       ib = idx + ii[0];
3051       n  = ii[1] - ii[0];
3052       ii++;
3053       for (j = 0; j < n; j++) {
3054         rval = ib[j];
3055         z[rval] += *v * x1;
3056         v++;
3057       }
3058       if (!usecprow) xb++;
3059     }
3060     break;
3061   case 2:
3062     for (i = 0; i < mbs; i++) {
3063       if (usecprow) xb = x + 2 * ridx[i];
3064       x1 = xb[0];
3065       x2 = xb[1];
3066       ib = idx + ii[0];
3067       n  = ii[1] - ii[0];
3068       ii++;
3069       for (j = 0; j < n; j++) {
3070         rval = ib[j] * 2;
3071         z[rval++] += v[0] * x1 + v[1] * x2;
3072         z[rval++] += v[2] * x1 + v[3] * x2;
3073         v += 4;
3074       }
3075       if (!usecprow) xb += 2;
3076     }
3077     break;
3078   case 3:
3079     for (i = 0; i < mbs; i++) {
3080       if (usecprow) xb = x + 3 * ridx[i];
3081       x1 = xb[0];
3082       x2 = xb[1];
3083       x3 = xb[2];
3084       ib = idx + ii[0];
3085       n  = ii[1] - ii[0];
3086       ii++;
3087       for (j = 0; j < n; j++) {
3088         rval = ib[j] * 3;
3089         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3090         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3091         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3092         v += 9;
3093       }
3094       if (!usecprow) xb += 3;
3095     }
3096     break;
3097   case 4:
3098     for (i = 0; i < mbs; i++) {
3099       if (usecprow) xb = x + 4 * ridx[i];
3100       x1 = xb[0];
3101       x2 = xb[1];
3102       x3 = xb[2];
3103       x4 = xb[3];
3104       ib = idx + ii[0];
3105       n  = ii[1] - ii[0];
3106       ii++;
3107       for (j = 0; j < n; j++) {
3108         rval = ib[j] * 4;
3109         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3110         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3111         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3112         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3113         v += 16;
3114       }
3115       if (!usecprow) xb += 4;
3116     }
3117     break;
3118   case 5:
3119     for (i = 0; i < mbs; i++) {
3120       if (usecprow) xb = x + 5 * ridx[i];
3121       x1 = xb[0];
3122       x2 = xb[1];
3123       x3 = xb[2];
3124       x4 = xb[3];
3125       x5 = xb[4];
3126       ib = idx + ii[0];
3127       n  = ii[1] - ii[0];
3128       ii++;
3129       for (j = 0; j < n; j++) {
3130         rval = ib[j] * 5;
3131         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3132         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3133         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3134         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3135         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3136         v += 25;
3137       }
3138       if (!usecprow) xb += 5;
3139     }
3140     break;
3141   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3142     PetscInt           ncols, k;
3143     PetscScalar       *work, *workt;
3144     const PetscScalar *xtmp;
3145     if (!a->mult_work) {
3146       k = PetscMax(A->rmap->n, A->cmap->n);
3147       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3148     }
3149     work = a->mult_work;
3150     xtmp = x;
3151     for (i = 0; i < mbs; i++) {
3152       n = ii[1] - ii[0];
3153       ii++;
3154       ncols = n * bs;
3155       PetscCall(PetscArrayzero(work, ncols));
3156       if (usecprow) xtmp = x + bs * ridx[i];
3157       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3158       v += n * bs2;
3159       if (!usecprow) xtmp += bs;
3160       workt = work;
3161       for (j = 0; j < n; j++) {
3162         zb = z + bs * (*idx++);
3163         for (k = 0; k < bs; k++) zb[k] += workt[k];
3164         workt += bs;
3165       }
3166     }
3167   }
3168   }
3169   PetscCall(VecRestoreArrayRead(xx, &x));
3170   PetscCall(VecRestoreArray(zz, &z));
3171   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3172   PetscFunctionReturn(0);
3173 }
3174 
3175 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3176 {
3177   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3178   PetscInt     totalnz = a->bs2 * a->nz;
3179   PetscScalar  oalpha  = alpha;
3180   PetscBLASInt one     = 1, tnz;
3181 
3182   PetscFunctionBegin;
3183   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3184   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3185   PetscCall(PetscLogFlops(totalnz));
3186   PetscFunctionReturn(0);
3187 }
3188 
3189 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3190 {
3191   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3192   MatScalar   *v   = a->a;
3193   PetscReal    sum = 0.0;
3194   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3195 
3196   PetscFunctionBegin;
3197   if (type == NORM_FROBENIUS) {
3198 #if defined(PETSC_USE_REAL___FP16)
3199     PetscBLASInt one = 1, cnt = bs2 * nz;
3200     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3201 #else
3202     for (i = 0; i < bs2 * nz; i++) {
3203       sum += PetscRealPart(PetscConj(*v) * (*v));
3204       v++;
3205     }
3206 #endif
3207     *norm = PetscSqrtReal(sum);
3208     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3209   } else if (type == NORM_1) { /* maximum column sum */
3210     PetscReal *tmp;
3211     PetscInt  *bcol = a->j;
3212     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3213     for (i = 0; i < nz; i++) {
3214       for (j = 0; j < bs; j++) {
3215         k1 = bs * (*bcol) + j; /* column index */
3216         for (k = 0; k < bs; k++) {
3217           tmp[k1] += PetscAbsScalar(*v);
3218           v++;
3219         }
3220       }
3221       bcol++;
3222     }
3223     *norm = 0.0;
3224     for (j = 0; j < A->cmap->n; j++) {
3225       if (tmp[j] > *norm) *norm = tmp[j];
3226     }
3227     PetscCall(PetscFree(tmp));
3228     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3229   } else if (type == NORM_INFINITY) { /* maximum row sum */
3230     *norm = 0.0;
3231     for (k = 0; k < bs; k++) {
3232       for (j = 0; j < a->mbs; j++) {
3233         v   = a->a + bs2 * a->i[j] + k;
3234         sum = 0.0;
3235         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3236           for (k1 = 0; k1 < bs; k1++) {
3237             sum += PetscAbsScalar(*v);
3238             v += bs;
3239           }
3240         }
3241         if (sum > *norm) *norm = sum;
3242       }
3243     }
3244     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3245   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3246   PetscFunctionReturn(0);
3247 }
3248 
3249 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3250 {
3251   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
3252 
3253   PetscFunctionBegin;
3254   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3255   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3256     *flg = PETSC_FALSE;
3257     PetscFunctionReturn(0);
3258   }
3259 
3260   /* if the a->i are the same */
3261   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3262   if (!*flg) PetscFunctionReturn(0);
3263 
3264   /* if a->j are the same */
3265   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3266   if (!*flg) PetscFunctionReturn(0);
3267 
3268   /* if a->a are the same */
3269   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3270   PetscFunctionReturn(0);
3271 }
3272 
3273 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3274 {
3275   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3276   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3277   PetscScalar *x, zero = 0.0;
3278   MatScalar   *aa, *aa_j;
3279 
3280   PetscFunctionBegin;
3281   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3282   bs   = A->rmap->bs;
3283   aa   = a->a;
3284   ai   = a->i;
3285   aj   = a->j;
3286   ambs = a->mbs;
3287   bs2  = a->bs2;
3288 
3289   PetscCall(VecSet(v, zero));
3290   PetscCall(VecGetArray(v, &x));
3291   PetscCall(VecGetLocalSize(v, &n));
3292   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3293   for (i = 0; i < ambs; i++) {
3294     for (j = ai[i]; j < ai[i + 1]; j++) {
3295       if (aj[j] == i) {
3296         row  = i * bs;
3297         aa_j = aa + j * bs2;
3298         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3299         break;
3300       }
3301     }
3302   }
3303   PetscCall(VecRestoreArray(v, &x));
3304   PetscFunctionReturn(0);
3305 }
3306 
3307 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3308 {
3309   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3310   const PetscScalar *l, *r, *li, *ri;
3311   PetscScalar        x;
3312   MatScalar         *aa, *v;
3313   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3314   const PetscInt    *ai, *aj;
3315 
3316   PetscFunctionBegin;
3317   ai  = a->i;
3318   aj  = a->j;
3319   aa  = a->a;
3320   m   = A->rmap->n;
3321   n   = A->cmap->n;
3322   bs  = A->rmap->bs;
3323   mbs = a->mbs;
3324   bs2 = a->bs2;
3325   if (ll) {
3326     PetscCall(VecGetArrayRead(ll, &l));
3327     PetscCall(VecGetLocalSize(ll, &lm));
3328     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3329     for (i = 0; i < mbs; i++) { /* for each block row */
3330       M  = ai[i + 1] - ai[i];
3331       li = l + i * bs;
3332       v  = aa + bs2 * ai[i];
3333       for (j = 0; j < M; j++) { /* for each block */
3334         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3335       }
3336     }
3337     PetscCall(VecRestoreArrayRead(ll, &l));
3338     PetscCall(PetscLogFlops(a->nz));
3339   }
3340 
3341   if (rr) {
3342     PetscCall(VecGetArrayRead(rr, &r));
3343     PetscCall(VecGetLocalSize(rr, &rn));
3344     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3345     for (i = 0; i < mbs; i++) { /* for each block row */
3346       iai = ai[i];
3347       M   = ai[i + 1] - iai;
3348       v   = aa + bs2 * iai;
3349       for (j = 0; j < M; j++) { /* for each block */
3350         ri = r + bs * aj[iai + j];
3351         for (k = 0; k < bs; k++) {
3352           x = ri[k];
3353           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3354           v += bs;
3355         }
3356       }
3357     }
3358     PetscCall(VecRestoreArrayRead(rr, &r));
3359     PetscCall(PetscLogFlops(a->nz));
3360   }
3361   PetscFunctionReturn(0);
3362 }
3363 
3364 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3365 {
3366   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3367 
3368   PetscFunctionBegin;
3369   info->block_size   = a->bs2;
3370   info->nz_allocated = a->bs2 * a->maxnz;
3371   info->nz_used      = a->bs2 * a->nz;
3372   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3373   info->assemblies   = A->num_ass;
3374   info->mallocs      = A->info.mallocs;
3375   info->memory       = 0; /* REVIEW ME */
3376   if (A->factortype) {
3377     info->fill_ratio_given  = A->info.fill_ratio_given;
3378     info->fill_ratio_needed = A->info.fill_ratio_needed;
3379     info->factor_mallocs    = A->info.factor_mallocs;
3380   } else {
3381     info->fill_ratio_given  = 0;
3382     info->fill_ratio_needed = 0;
3383     info->factor_mallocs    = 0;
3384   }
3385   PetscFunctionReturn(0);
3386 }
3387 
3388 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3389 {
3390   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3391 
3392   PetscFunctionBegin;
3393   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3394   PetscFunctionReturn(0);
3395 }
3396 
3397 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3398 {
3399   PetscFunctionBegin;
3400   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3401   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3402   PetscFunctionReturn(0);
3403 }
3404 
3405 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3406 {
3407   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3408   PetscScalar       *z = NULL, sum1;
3409   const PetscScalar *xb;
3410   PetscScalar        x1;
3411   const MatScalar   *v, *vv;
3412   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3413   PetscBool          usecprow = a->compressedrow.use;
3414 
3415   PetscFunctionBegin;
3416   idx = a->j;
3417   v   = a->a;
3418   if (usecprow) {
3419     mbs  = a->compressedrow.nrows;
3420     ii   = a->compressedrow.i;
3421     ridx = a->compressedrow.rindex;
3422   } else {
3423     mbs = a->mbs;
3424     ii  = a->i;
3425     z   = c;
3426   }
3427 
3428   for (i = 0; i < mbs; i++) {
3429     n = ii[1] - ii[0];
3430     ii++;
3431     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3432     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3433     if (usecprow) z = c + ridx[i];
3434     jj = idx;
3435     vv = v;
3436     for (k = 0; k < cn; k++) {
3437       idx  = jj;
3438       v    = vv;
3439       sum1 = 0.0;
3440       for (j = 0; j < n; j++) {
3441         xb = b + (*idx++);
3442         x1 = xb[0 + k * bm];
3443         sum1 += v[0] * x1;
3444         v += 1;
3445       }
3446       z[0 + k * cm] = sum1;
3447     }
3448     if (!usecprow) z += 1;
3449   }
3450   PetscFunctionReturn(0);
3451 }
3452 
3453 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3454 {
3455   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3456   PetscScalar       *z = NULL, sum1, sum2;
3457   const PetscScalar *xb;
3458   PetscScalar        x1, x2;
3459   const MatScalar   *v, *vv;
3460   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3461   PetscBool          usecprow = a->compressedrow.use;
3462 
3463   PetscFunctionBegin;
3464   idx = a->j;
3465   v   = a->a;
3466   if (usecprow) {
3467     mbs  = a->compressedrow.nrows;
3468     ii   = a->compressedrow.i;
3469     ridx = a->compressedrow.rindex;
3470   } else {
3471     mbs = a->mbs;
3472     ii  = a->i;
3473     z   = c;
3474   }
3475 
3476   for (i = 0; i < mbs; i++) {
3477     n = ii[1] - ii[0];
3478     ii++;
3479     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3480     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3481     if (usecprow) z = c + 2 * ridx[i];
3482     jj = idx;
3483     vv = v;
3484     for (k = 0; k < cn; k++) {
3485       idx  = jj;
3486       v    = vv;
3487       sum1 = 0.0;
3488       sum2 = 0.0;
3489       for (j = 0; j < n; j++) {
3490         xb = b + 2 * (*idx++);
3491         x1 = xb[0 + k * bm];
3492         x2 = xb[1 + k * bm];
3493         sum1 += v[0] * x1 + v[2] * x2;
3494         sum2 += v[1] * x1 + v[3] * x2;
3495         v += 4;
3496       }
3497       z[0 + k * cm] = sum1;
3498       z[1 + k * cm] = sum2;
3499     }
3500     if (!usecprow) z += 2;
3501   }
3502   PetscFunctionReturn(0);
3503 }
3504 
3505 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3506 {
3507   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3508   PetscScalar       *z = NULL, sum1, sum2, sum3;
3509   const PetscScalar *xb;
3510   PetscScalar        x1, x2, x3;
3511   const MatScalar   *v, *vv;
3512   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3513   PetscBool          usecprow = a->compressedrow.use;
3514 
3515   PetscFunctionBegin;
3516   idx = a->j;
3517   v   = a->a;
3518   if (usecprow) {
3519     mbs  = a->compressedrow.nrows;
3520     ii   = a->compressedrow.i;
3521     ridx = a->compressedrow.rindex;
3522   } else {
3523     mbs = a->mbs;
3524     ii  = a->i;
3525     z   = c;
3526   }
3527 
3528   for (i = 0; i < mbs; i++) {
3529     n = ii[1] - ii[0];
3530     ii++;
3531     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3532     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3533     if (usecprow) z = c + 3 * ridx[i];
3534     jj = idx;
3535     vv = v;
3536     for (k = 0; k < cn; k++) {
3537       idx  = jj;
3538       v    = vv;
3539       sum1 = 0.0;
3540       sum2 = 0.0;
3541       sum3 = 0.0;
3542       for (j = 0; j < n; j++) {
3543         xb = b + 3 * (*idx++);
3544         x1 = xb[0 + k * bm];
3545         x2 = xb[1 + k * bm];
3546         x3 = xb[2 + k * bm];
3547         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3548         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3549         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3550         v += 9;
3551       }
3552       z[0 + k * cm] = sum1;
3553       z[1 + k * cm] = sum2;
3554       z[2 + k * cm] = sum3;
3555     }
3556     if (!usecprow) z += 3;
3557   }
3558   PetscFunctionReturn(0);
3559 }
3560 
3561 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3562 {
3563   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3564   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3565   const PetscScalar *xb;
3566   PetscScalar        x1, x2, x3, x4;
3567   const MatScalar   *v, *vv;
3568   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3569   PetscBool          usecprow = a->compressedrow.use;
3570 
3571   PetscFunctionBegin;
3572   idx = a->j;
3573   v   = a->a;
3574   if (usecprow) {
3575     mbs  = a->compressedrow.nrows;
3576     ii   = a->compressedrow.i;
3577     ridx = a->compressedrow.rindex;
3578   } else {
3579     mbs = a->mbs;
3580     ii  = a->i;
3581     z   = c;
3582   }
3583 
3584   for (i = 0; i < mbs; i++) {
3585     n = ii[1] - ii[0];
3586     ii++;
3587     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3588     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3589     if (usecprow) z = c + 4 * ridx[i];
3590     jj = idx;
3591     vv = v;
3592     for (k = 0; k < cn; k++) {
3593       idx  = jj;
3594       v    = vv;
3595       sum1 = 0.0;
3596       sum2 = 0.0;
3597       sum3 = 0.0;
3598       sum4 = 0.0;
3599       for (j = 0; j < n; j++) {
3600         xb = b + 4 * (*idx++);
3601         x1 = xb[0 + k * bm];
3602         x2 = xb[1 + k * bm];
3603         x3 = xb[2 + k * bm];
3604         x4 = xb[3 + k * bm];
3605         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3606         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3607         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3608         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3609         v += 16;
3610       }
3611       z[0 + k * cm] = sum1;
3612       z[1 + k * cm] = sum2;
3613       z[2 + k * cm] = sum3;
3614       z[3 + k * cm] = sum4;
3615     }
3616     if (!usecprow) z += 4;
3617   }
3618   PetscFunctionReturn(0);
3619 }
3620 
3621 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3622 {
3623   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3624   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3625   const PetscScalar *xb;
3626   PetscScalar        x1, x2, x3, x4, x5;
3627   const MatScalar   *v, *vv;
3628   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3629   PetscBool          usecprow = a->compressedrow.use;
3630 
3631   PetscFunctionBegin;
3632   idx = a->j;
3633   v   = a->a;
3634   if (usecprow) {
3635     mbs  = a->compressedrow.nrows;
3636     ii   = a->compressedrow.i;
3637     ridx = a->compressedrow.rindex;
3638   } else {
3639     mbs = a->mbs;
3640     ii  = a->i;
3641     z   = c;
3642   }
3643 
3644   for (i = 0; i < mbs; i++) {
3645     n = ii[1] - ii[0];
3646     ii++;
3647     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3648     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3649     if (usecprow) z = c + 5 * ridx[i];
3650     jj = idx;
3651     vv = v;
3652     for (k = 0; k < cn; k++) {
3653       idx  = jj;
3654       v    = vv;
3655       sum1 = 0.0;
3656       sum2 = 0.0;
3657       sum3 = 0.0;
3658       sum4 = 0.0;
3659       sum5 = 0.0;
3660       for (j = 0; j < n; j++) {
3661         xb = b + 5 * (*idx++);
3662         x1 = xb[0 + k * bm];
3663         x2 = xb[1 + k * bm];
3664         x3 = xb[2 + k * bm];
3665         x4 = xb[3 + k * bm];
3666         x5 = xb[4 + k * bm];
3667         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3668         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3669         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3670         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3671         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3672         v += 25;
3673       }
3674       z[0 + k * cm] = sum1;
3675       z[1 + k * cm] = sum2;
3676       z[2 + k * cm] = sum3;
3677       z[3 + k * cm] = sum4;
3678       z[4 + k * cm] = sum5;
3679     }
3680     if (!usecprow) z += 5;
3681   }
3682   PetscFunctionReturn(0);
3683 }
3684 
3685 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3686 {
3687   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3688   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3689   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3690   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3691   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3692   PetscBLASInt     bbs, bcn, bbm, bcm;
3693   PetscScalar     *z = NULL;
3694   PetscScalar     *c, *b;
3695   const MatScalar *v;
3696   const PetscInt  *idx, *ii, *ridx = NULL;
3697   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3698   PetscBool        usecprow = a->compressedrow.use;
3699 
3700   PetscFunctionBegin;
3701   if (!cm || !cn) PetscFunctionReturn(0);
3702   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3703   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3704   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3705   b = bd->v;
3706   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3707   PetscCall(MatDenseGetArray(C, &c));
3708   switch (bs) {
3709   case 1:
3710     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3711     break;
3712   case 2:
3713     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3714     break;
3715   case 3:
3716     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3717     break;
3718   case 4:
3719     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3720     break;
3721   case 5:
3722     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3723     break;
3724   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3725     PetscCall(PetscBLASIntCast(bs, &bbs));
3726     PetscCall(PetscBLASIntCast(cn, &bcn));
3727     PetscCall(PetscBLASIntCast(bm, &bbm));
3728     PetscCall(PetscBLASIntCast(cm, &bcm));
3729     idx = a->j;
3730     v   = a->a;
3731     if (usecprow) {
3732       mbs  = a->compressedrow.nrows;
3733       ii   = a->compressedrow.i;
3734       ridx = a->compressedrow.rindex;
3735     } else {
3736       mbs = a->mbs;
3737       ii  = a->i;
3738       z   = c;
3739     }
3740     for (i = 0; i < mbs; i++) {
3741       n = ii[1] - ii[0];
3742       ii++;
3743       if (usecprow) z = c + bs * ridx[i];
3744       if (n) {
3745         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3746         v += bs2;
3747       }
3748       for (j = 1; j < n; j++) {
3749         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3750         v += bs2;
3751       }
3752       if (!usecprow) z += bs;
3753     }
3754   }
3755   PetscCall(MatDenseRestoreArray(C, &c));
3756   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3757   PetscFunctionReturn(0);
3758 }
3759