xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision e0f5bfbec699682fa3e8b8532b1176849ea4e12a)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8 #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) {
12   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
13   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
14   const PetscInt *idx;
15   PetscInt        start, end, *ai, *aj, bs, *nidx2;
16   PetscBT         table;
17 
18   PetscFunctionBegin;
19   m  = a->mbs;
20   ai = a->i;
21   aj = a->j;
22   bs = A->rmap->bs;
23 
24   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
25 
26   PetscCall(PetscBTCreate(m, &table));
27   PetscCall(PetscMalloc1(m + 1, &nidx));
28   PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2));
29 
30   for (i = 0; i < is_max; i++) {
31     /* Initialise the two local arrays */
32     isz = 0;
33     PetscCall(PetscBTMemzero(m, table));
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     PetscCall(ISGetIndices(is[i], &idx));
37     PetscCall(ISGetLocalSize(is[i], &n));
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j = 0; j < n; ++j) {
41       ival = idx[j] / bs; /* convert the indices into block indices */
42       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
43       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44     }
45     PetscCall(ISRestoreIndices(is[i], &idx));
46     PetscCall(ISDestroy(&is[i]));
47 
48     k = 0;
49     for (j = 0; j < ov; j++) { /* for each overlap*/
50       n = isz;
51       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row + 1];
55         for (l = start; l < end; l++) {
56           val = aj[l];
57           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58         }
59       }
60     }
61     /* expand the Index Set */
62     for (j = 0; j < isz; j++) {
63       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
64     }
65     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i));
66   }
67   PetscCall(PetscBTDestroy(&table));
68   PetscCall(PetscFree(nidx));
69   PetscCall(PetscFree(nidx2));
70   PetscFunctionReturn(0);
71 }
72 
73 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
74   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
75   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
76   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
77   const PetscInt *irow, *icol;
78   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
79   PetscInt       *aj = a->j, *ai = a->i;
80   MatScalar      *mat_a;
81   Mat             C;
82   PetscBool       flag;
83 
84   PetscFunctionBegin;
85   PetscCall(ISGetIndices(isrow, &irow));
86   PetscCall(ISGetIndices(iscol, &icol));
87   PetscCall(ISGetLocalSize(isrow, &nrows));
88   PetscCall(ISGetLocalSize(iscol, &ncols));
89 
90   PetscCall(PetscCalloc1(1 + oldcols, &smap));
91   ssmap = smap;
92   PetscCall(PetscMalloc1(1 + nrows, &lens));
93   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
94   /* determine lens of each row */
95   for (i = 0; i < nrows; i++) {
96     kstart  = ai[irow[i]];
97     kend    = kstart + a->ilen[irow[i]];
98     lens[i] = 0;
99     for (k = kstart; k < kend; k++) {
100       if (ssmap[aj[k]]) lens[i]++;
101     }
102   }
103   /* Create and fill new matrix */
104   if (scall == MAT_REUSE_MATRIX) {
105     c = (Mat_SeqBAIJ *)((*B)->data);
106 
107     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
108     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
109     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
110     PetscCall(PetscArrayzero(c->ilen, c->mbs));
111     C = *B;
112   } else {
113     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
114     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
115     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
116     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
117   }
118   c = (Mat_SeqBAIJ *)(C->data);
119   for (i = 0; i < nrows; i++) {
120     row      = irow[i];
121     kstart   = ai[row];
122     kend     = kstart + a->ilen[row];
123     mat_i    = c->i[i];
124     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
125     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
126     mat_ilen = c->ilen + i;
127     for (k = kstart; k < kend; k++) {
128       if ((tcol = ssmap[a->j[k]])) {
129         *mat_j++ = tcol - 1;
130         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
131         mat_a += bs2;
132         (*mat_ilen)++;
133       }
134     }
135   }
136   /* sort */
137   if (c->j && c->a) {
138     MatScalar *work;
139     PetscCall(PetscMalloc1(bs2, &work));
140     for (i = 0; i < nrows; i++) {
141       PetscInt ilen;
142       mat_i = c->i[i];
143       mat_j = c->j + mat_i;
144       mat_a = c->a + mat_i * bs2;
145       ilen  = c->ilen[i];
146       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
147     }
148     PetscCall(PetscFree(work));
149   }
150 
151   /* Free work space */
152   PetscCall(ISRestoreIndices(iscol, &icol));
153   PetscCall(PetscFree(smap));
154   PetscCall(PetscFree(lens));
155   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
156   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
157 
158   PetscCall(ISRestoreIndices(isrow, &irow));
159   *B = C;
160   PetscFunctionReturn(0);
161 }
162 
163 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
164   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
165   IS              is1, is2;
166   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
167   const PetscInt *irow, *icol;
168 
169   PetscFunctionBegin;
170   PetscCall(ISGetIndices(isrow, &irow));
171   PetscCall(ISGetIndices(iscol, &icol));
172   PetscCall(ISGetLocalSize(isrow, &nrows));
173   PetscCall(ISGetLocalSize(iscol, &ncols));
174 
175   /* Verify if the indices corespond to each element in a block
176    and form the IS with compressed IS */
177   maxmnbs = PetscMax(a->mbs, a->nbs);
178   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
179   PetscCall(PetscArrayzero(vary, a->mbs));
180   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
181   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
182   count = 0;
183   for (i = 0; i < nrows; i++) {
184     j = irow[i] / bs;
185     if ((vary[j]--) == bs) iary[count++] = j;
186   }
187   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
188 
189   PetscCall(PetscArrayzero(vary, a->nbs));
190   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
191   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
192   count = 0;
193   for (i = 0; i < ncols; i++) {
194     j = icol[i] / bs;
195     if ((vary[j]--) == bs) iary[count++] = j;
196   }
197   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
198   PetscCall(ISRestoreIndices(isrow, &irow));
199   PetscCall(ISRestoreIndices(iscol, &icol));
200   PetscCall(PetscFree2(vary, iary));
201 
202   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
203   PetscCall(ISDestroy(&is1));
204   PetscCall(ISDestroy(&is2));
205   PetscFunctionReturn(0);
206 }
207 
208 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) {
209   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
210   Mat_SubSppt *submatj = c->submatis1;
211 
212   PetscFunctionBegin;
213   PetscCall((*submatj->destroy)(C));
214   PetscCall(MatDestroySubMatrix_Private(submatj));
215   PetscFunctionReturn(0);
216 }
217 
218 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
219 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) {
220   PetscInt     i;
221   Mat          C;
222   Mat_SeqBAIJ *c;
223   Mat_SubSppt *submatj;
224 
225   PetscFunctionBegin;
226   for (i = 0; i < n; i++) {
227     C       = (*mat)[i];
228     c       = (Mat_SeqBAIJ *)C->data;
229     submatj = c->submatis1;
230     if (submatj) {
231       if (--((PetscObject)C)->refct <= 0) {
232         PetscCall(PetscFree(C->factorprefix));
233         PetscCall((*submatj->destroy)(C));
234         PetscCall(MatDestroySubMatrix_Private(submatj));
235         PetscCall(PetscFree(C->defaultvectype));
236         PetscCall(PetscFree(C->defaultrandtype));
237         PetscCall(PetscLayoutDestroy(&C->rmap));
238         PetscCall(PetscLayoutDestroy(&C->cmap));
239         PetscCall(PetscHeaderDestroy(&C));
240       }
241     } else {
242       PetscCall(MatDestroy(&C));
243     }
244   }
245 
246   /* Destroy Dummy submatrices created for reuse */
247   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
248 
249   PetscCall(PetscFree(*mat));
250   PetscFunctionReturn(0);
251 }
252 
253 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) {
254   PetscInt i;
255 
256   PetscFunctionBegin;
257   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
258 
259   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
260   PetscFunctionReturn(0);
261 }
262 
263 /* -------------------------------------------------------*/
264 /* Should check that shapes of vectors and matrices match */
265 /* -------------------------------------------------------*/
266 
267 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) {
268   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
269   PetscScalar       *z, sum;
270   const PetscScalar *x;
271   const MatScalar   *v;
272   PetscInt           mbs, i, n;
273   const PetscInt    *idx, *ii, *ridx = NULL;
274   PetscBool          usecprow = a->compressedrow.use;
275 
276   PetscFunctionBegin;
277   PetscCall(VecGetArrayRead(xx, &x));
278   PetscCall(VecGetArrayWrite(zz, &z));
279 
280   if (usecprow) {
281     mbs  = a->compressedrow.nrows;
282     ii   = a->compressedrow.i;
283     ridx = a->compressedrow.rindex;
284     PetscCall(PetscArrayzero(z, a->mbs));
285   } else {
286     mbs = a->mbs;
287     ii  = a->i;
288   }
289 
290   for (i = 0; i < mbs; i++) {
291     n   = ii[1] - ii[0];
292     v   = a->a + ii[0];
293     idx = a->j + ii[0];
294     ii++;
295     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
296     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
297     sum = 0.0;
298     PetscSparseDensePlusDot(sum, x, v, idx, n);
299     if (usecprow) {
300       z[ridx[i]] = sum;
301     } else {
302       z[i] = sum;
303     }
304   }
305   PetscCall(VecRestoreArrayRead(xx, &x));
306   PetscCall(VecRestoreArrayWrite(zz, &z));
307   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
308   PetscFunctionReturn(0);
309 }
310 
311 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) {
312   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
313   PetscScalar       *z = NULL, sum1, sum2, *zarray;
314   const PetscScalar *x, *xb;
315   PetscScalar        x1, x2;
316   const MatScalar   *v;
317   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
318   PetscBool          usecprow = a->compressedrow.use;
319 
320   PetscFunctionBegin;
321   PetscCall(VecGetArrayRead(xx, &x));
322   PetscCall(VecGetArrayWrite(zz, &zarray));
323 
324   idx = a->j;
325   v   = a->a;
326   if (usecprow) {
327     mbs  = a->compressedrow.nrows;
328     ii   = a->compressedrow.i;
329     ridx = a->compressedrow.rindex;
330     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
331   } else {
332     mbs = a->mbs;
333     ii  = a->i;
334     z   = zarray;
335   }
336 
337   for (i = 0; i < mbs; i++) {
338     n = ii[1] - ii[0];
339     ii++;
340     sum1 = 0.0;
341     sum2 = 0.0;
342     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
343     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
344     for (j = 0; j < n; j++) {
345       xb = x + 2 * (*idx++);
346       x1 = xb[0];
347       x2 = xb[1];
348       sum1 += v[0] * x1 + v[2] * x2;
349       sum2 += v[1] * x1 + v[3] * x2;
350       v += 4;
351     }
352     if (usecprow) z = zarray + 2 * ridx[i];
353     z[0] = sum1;
354     z[1] = sum2;
355     if (!usecprow) z += 2;
356   }
357   PetscCall(VecRestoreArrayRead(xx, &x));
358   PetscCall(VecRestoreArrayWrite(zz, &zarray));
359   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
360   PetscFunctionReturn(0);
361 }
362 
363 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) {
364   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
365   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
366   const PetscScalar *x, *xb;
367   const MatScalar   *v;
368   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
369   PetscBool          usecprow = a->compressedrow.use;
370 
371 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
372 #pragma disjoint(*v, *z, *xb)
373 #endif
374 
375   PetscFunctionBegin;
376   PetscCall(VecGetArrayRead(xx, &x));
377   PetscCall(VecGetArrayWrite(zz, &zarray));
378 
379   idx = a->j;
380   v   = a->a;
381   if (usecprow) {
382     mbs  = a->compressedrow.nrows;
383     ii   = a->compressedrow.i;
384     ridx = a->compressedrow.rindex;
385     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
386   } else {
387     mbs = a->mbs;
388     ii  = a->i;
389     z   = zarray;
390   }
391 
392   for (i = 0; i < mbs; i++) {
393     n = ii[1] - ii[0];
394     ii++;
395     sum1 = 0.0;
396     sum2 = 0.0;
397     sum3 = 0.0;
398     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
399     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
400     for (j = 0; j < n; j++) {
401       xb = x + 3 * (*idx++);
402       x1 = xb[0];
403       x2 = xb[1];
404       x3 = xb[2];
405 
406       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
407       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
408       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
409       v += 9;
410     }
411     if (usecprow) z = zarray + 3 * ridx[i];
412     z[0] = sum1;
413     z[1] = sum2;
414     z[2] = sum3;
415     if (!usecprow) z += 3;
416   }
417   PetscCall(VecRestoreArrayRead(xx, &x));
418   PetscCall(VecRestoreArrayWrite(zz, &zarray));
419   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
420   PetscFunctionReturn(0);
421 }
422 
423 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) {
424   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
425   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
426   const PetscScalar *x, *xb;
427   const MatScalar   *v;
428   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
429   PetscBool          usecprow = a->compressedrow.use;
430 
431   PetscFunctionBegin;
432   PetscCall(VecGetArrayRead(xx, &x));
433   PetscCall(VecGetArrayWrite(zz, &zarray));
434 
435   idx = a->j;
436   v   = a->a;
437   if (usecprow) {
438     mbs  = a->compressedrow.nrows;
439     ii   = a->compressedrow.i;
440     ridx = a->compressedrow.rindex;
441     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
442   } else {
443     mbs = a->mbs;
444     ii  = a->i;
445     z   = zarray;
446   }
447 
448   for (i = 0; i < mbs; i++) {
449     n = ii[1] - ii[0];
450     ii++;
451     sum1 = 0.0;
452     sum2 = 0.0;
453     sum3 = 0.0;
454     sum4 = 0.0;
455 
456     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
457     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
458     for (j = 0; j < n; j++) {
459       xb = x + 4 * (*idx++);
460       x1 = xb[0];
461       x2 = xb[1];
462       x3 = xb[2];
463       x4 = xb[3];
464       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
465       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
466       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
467       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
468       v += 16;
469     }
470     if (usecprow) z = zarray + 4 * ridx[i];
471     z[0] = sum1;
472     z[1] = sum2;
473     z[2] = sum3;
474     z[3] = sum4;
475     if (!usecprow) z += 4;
476   }
477   PetscCall(VecRestoreArrayRead(xx, &x));
478   PetscCall(VecRestoreArrayWrite(zz, &zarray));
479   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
480   PetscFunctionReturn(0);
481 }
482 
483 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) {
484   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
485   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
486   const PetscScalar *xb, *x;
487   const MatScalar   *v;
488   const PetscInt    *idx, *ii, *ridx = NULL;
489   PetscInt           mbs, i, j, n;
490   PetscBool          usecprow = a->compressedrow.use;
491 
492   PetscFunctionBegin;
493   PetscCall(VecGetArrayRead(xx, &x));
494   PetscCall(VecGetArrayWrite(zz, &zarray));
495 
496   idx = a->j;
497   v   = a->a;
498   if (usecprow) {
499     mbs  = a->compressedrow.nrows;
500     ii   = a->compressedrow.i;
501     ridx = a->compressedrow.rindex;
502     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
503   } else {
504     mbs = a->mbs;
505     ii  = a->i;
506     z   = zarray;
507   }
508 
509   for (i = 0; i < mbs; i++) {
510     n = ii[1] - ii[0];
511     ii++;
512     sum1 = 0.0;
513     sum2 = 0.0;
514     sum3 = 0.0;
515     sum4 = 0.0;
516     sum5 = 0.0;
517     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
518     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
519     for (j = 0; j < n; j++) {
520       xb = x + 5 * (*idx++);
521       x1 = xb[0];
522       x2 = xb[1];
523       x3 = xb[2];
524       x4 = xb[3];
525       x5 = xb[4];
526       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
527       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
528       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
529       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
530       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
531       v += 25;
532     }
533     if (usecprow) z = zarray + 5 * ridx[i];
534     z[0] = sum1;
535     z[1] = sum2;
536     z[2] = sum3;
537     z[3] = sum4;
538     z[4] = sum5;
539     if (!usecprow) z += 5;
540   }
541   PetscCall(VecRestoreArrayRead(xx, &x));
542   PetscCall(VecRestoreArrayWrite(zz, &zarray));
543   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
544   PetscFunctionReturn(0);
545 }
546 
547 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) {
548   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
549   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
550   const PetscScalar *x, *xb;
551   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
552   const MatScalar   *v;
553   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
554   PetscBool          usecprow = a->compressedrow.use;
555 
556   PetscFunctionBegin;
557   PetscCall(VecGetArrayRead(xx, &x));
558   PetscCall(VecGetArrayWrite(zz, &zarray));
559 
560   idx = a->j;
561   v   = a->a;
562   if (usecprow) {
563     mbs  = a->compressedrow.nrows;
564     ii   = a->compressedrow.i;
565     ridx = a->compressedrow.rindex;
566     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
567   } else {
568     mbs = a->mbs;
569     ii  = a->i;
570     z   = zarray;
571   }
572 
573   for (i = 0; i < mbs; i++) {
574     n = ii[1] - ii[0];
575     ii++;
576     sum1 = 0.0;
577     sum2 = 0.0;
578     sum3 = 0.0;
579     sum4 = 0.0;
580     sum5 = 0.0;
581     sum6 = 0.0;
582 
583     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
584     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
585     for (j = 0; j < n; j++) {
586       xb = x + 6 * (*idx++);
587       x1 = xb[0];
588       x2 = xb[1];
589       x3 = xb[2];
590       x4 = xb[3];
591       x5 = xb[4];
592       x6 = xb[5];
593       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
594       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
595       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
596       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
597       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
598       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
599       v += 36;
600     }
601     if (usecprow) z = zarray + 6 * ridx[i];
602     z[0] = sum1;
603     z[1] = sum2;
604     z[2] = sum3;
605     z[3] = sum4;
606     z[4] = sum5;
607     z[5] = sum6;
608     if (!usecprow) z += 6;
609   }
610 
611   PetscCall(VecRestoreArrayRead(xx, &x));
612   PetscCall(VecRestoreArrayWrite(zz, &zarray));
613   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
614   PetscFunctionReturn(0);
615 }
616 
617 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) {
618   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
619   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
620   const PetscScalar *x, *xb;
621   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
622   const MatScalar   *v;
623   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
624   PetscBool          usecprow = a->compressedrow.use;
625 
626   PetscFunctionBegin;
627   PetscCall(VecGetArrayRead(xx, &x));
628   PetscCall(VecGetArrayWrite(zz, &zarray));
629 
630   idx = a->j;
631   v   = a->a;
632   if (usecprow) {
633     mbs  = a->compressedrow.nrows;
634     ii   = a->compressedrow.i;
635     ridx = a->compressedrow.rindex;
636     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
637   } else {
638     mbs = a->mbs;
639     ii  = a->i;
640     z   = zarray;
641   }
642 
643   for (i = 0; i < mbs; i++) {
644     n = ii[1] - ii[0];
645     ii++;
646     sum1 = 0.0;
647     sum2 = 0.0;
648     sum3 = 0.0;
649     sum4 = 0.0;
650     sum5 = 0.0;
651     sum6 = 0.0;
652     sum7 = 0.0;
653 
654     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
655     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
656     for (j = 0; j < n; j++) {
657       xb = x + 7 * (*idx++);
658       x1 = xb[0];
659       x2 = xb[1];
660       x3 = xb[2];
661       x4 = xb[3];
662       x5 = xb[4];
663       x6 = xb[5];
664       x7 = xb[6];
665       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
666       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
667       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
668       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
669       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
670       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
671       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
672       v += 49;
673     }
674     if (usecprow) z = zarray + 7 * ridx[i];
675     z[0] = sum1;
676     z[1] = sum2;
677     z[2] = sum3;
678     z[3] = sum4;
679     z[4] = sum5;
680     z[5] = sum6;
681     z[6] = sum7;
682     if (!usecprow) z += 7;
683   }
684 
685   PetscCall(VecRestoreArrayRead(xx, &x));
686   PetscCall(VecRestoreArrayWrite(zz, &zarray));
687   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
688   PetscFunctionReturn(0);
689 }
690 
691 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
692 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) {
693   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
694   PetscScalar       *z = NULL, *work, *workt, *zarray;
695   const PetscScalar *x, *xb;
696   const MatScalar   *v;
697   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
698   const PetscInt    *idx, *ii, *ridx = NULL;
699   PetscInt           k;
700   PetscBool          usecprow = a->compressedrow.use;
701 
702   __m256d a0, a1, a2, a3, a4, a5;
703   __m256d w0, w1, w2, w3;
704   __m256d z0, z1, z2;
705   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
706 
707   PetscFunctionBegin;
708   PetscCall(VecGetArrayRead(xx, &x));
709   PetscCall(VecGetArrayWrite(zz, &zarray));
710 
711   idx = a->j;
712   v   = a->a;
713   if (usecprow) {
714     mbs  = a->compressedrow.nrows;
715     ii   = a->compressedrow.i;
716     ridx = a->compressedrow.rindex;
717     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
718   } else {
719     mbs = a->mbs;
720     ii  = a->i;
721     z   = zarray;
722   }
723 
724   if (!a->mult_work) {
725     k = PetscMax(A->rmap->n, A->cmap->n);
726     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
727   }
728 
729   work = a->mult_work;
730   for (i = 0; i < mbs; i++) {
731     n = ii[1] - ii[0];
732     ii++;
733     workt = work;
734     for (j = 0; j < n; j++) {
735       xb = x + bs * (*idx++);
736       for (k = 0; k < bs; k++) workt[k] = xb[k];
737       workt += bs;
738     }
739     if (usecprow) z = zarray + bs * ridx[i];
740 
741     z0 = _mm256_setzero_pd();
742     z1 = _mm256_setzero_pd();
743     z2 = _mm256_setzero_pd();
744 
745     for (j = 0; j < n; j++) {
746       /* first column of a */
747       w0 = _mm256_set1_pd(work[j * 9]);
748       a0 = _mm256_loadu_pd(&v[j * 81]);
749       z0 = _mm256_fmadd_pd(a0, w0, z0);
750       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
751       z1 = _mm256_fmadd_pd(a1, w0, z1);
752       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
753       z2 = _mm256_fmadd_pd(a2, w0, z2);
754 
755       /* second column of a */
756       w1 = _mm256_set1_pd(work[j * 9 + 1]);
757       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
758       z0 = _mm256_fmadd_pd(a0, w1, z0);
759       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
760       z1 = _mm256_fmadd_pd(a1, w1, z1);
761       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
762       z2 = _mm256_fmadd_pd(a2, w1, z2);
763 
764       /* third column of a */
765       w2 = _mm256_set1_pd(work[j * 9 + 2]);
766       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
767       z0 = _mm256_fmadd_pd(a3, w2, z0);
768       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
769       z1 = _mm256_fmadd_pd(a4, w2, z1);
770       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
771       z2 = _mm256_fmadd_pd(a5, w2, z2);
772 
773       /* fourth column of a */
774       w3 = _mm256_set1_pd(work[j * 9 + 3]);
775       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
776       z0 = _mm256_fmadd_pd(a0, w3, z0);
777       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
778       z1 = _mm256_fmadd_pd(a1, w3, z1);
779       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
780       z2 = _mm256_fmadd_pd(a2, w3, z2);
781 
782       /* fifth column of a */
783       w0 = _mm256_set1_pd(work[j * 9 + 4]);
784       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
785       z0 = _mm256_fmadd_pd(a3, w0, z0);
786       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
787       z1 = _mm256_fmadd_pd(a4, w0, z1);
788       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
789       z2 = _mm256_fmadd_pd(a5, w0, z2);
790 
791       /* sixth column of a */
792       w1 = _mm256_set1_pd(work[j * 9 + 5]);
793       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
794       z0 = _mm256_fmadd_pd(a0, w1, z0);
795       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
796       z1 = _mm256_fmadd_pd(a1, w1, z1);
797       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
798       z2 = _mm256_fmadd_pd(a2, w1, z2);
799 
800       /* seventh column of a */
801       w2 = _mm256_set1_pd(work[j * 9 + 6]);
802       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
803       z0 = _mm256_fmadd_pd(a0, w2, z0);
804       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
805       z1 = _mm256_fmadd_pd(a1, w2, z1);
806       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
807       z2 = _mm256_fmadd_pd(a2, w2, z2);
808 
809       /* eighth column of a */
810       w3 = _mm256_set1_pd(work[j * 9 + 7]);
811       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
812       z0 = _mm256_fmadd_pd(a3, w3, z0);
813       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
814       z1 = _mm256_fmadd_pd(a4, w3, z1);
815       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
816       z2 = _mm256_fmadd_pd(a5, w3, z2);
817 
818       /* ninth column of a */
819       w0 = _mm256_set1_pd(work[j * 9 + 8]);
820       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
821       z0 = _mm256_fmadd_pd(a0, w0, z0);
822       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
823       z1 = _mm256_fmadd_pd(a1, w0, z1);
824       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
825       z2 = _mm256_fmadd_pd(a2, w0, z2);
826     }
827 
828     _mm256_storeu_pd(&z[0], z0);
829     _mm256_storeu_pd(&z[4], z1);
830     _mm256_maskstore_pd(&z[8], mask1, z2);
831 
832     v += n * bs2;
833     if (!usecprow) z += bs;
834   }
835   PetscCall(VecRestoreArrayRead(xx, &x));
836   PetscCall(VecRestoreArrayWrite(zz, &zarray));
837   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
838   PetscFunctionReturn(0);
839 }
840 #endif
841 
842 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) {
843   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
844   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
845   const PetscScalar *x, *xb;
846   PetscScalar       *zarray, xv;
847   const MatScalar   *v;
848   const PetscInt    *ii, *ij = a->j, *idx;
849   PetscInt           mbs, i, j, k, n, *ridx = NULL;
850   PetscBool          usecprow = a->compressedrow.use;
851 
852   PetscFunctionBegin;
853   PetscCall(VecGetArrayRead(xx, &x));
854   PetscCall(VecGetArrayWrite(zz, &zarray));
855 
856   v = a->a;
857   if (usecprow) {
858     mbs  = a->compressedrow.nrows;
859     ii   = a->compressedrow.i;
860     ridx = a->compressedrow.rindex;
861     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
862   } else {
863     mbs = a->mbs;
864     ii  = a->i;
865     z   = zarray;
866   }
867 
868   for (i = 0; i < mbs; i++) {
869     n     = ii[i + 1] - ii[i];
870     idx   = ij + ii[i];
871     sum1  = 0.0;
872     sum2  = 0.0;
873     sum3  = 0.0;
874     sum4  = 0.0;
875     sum5  = 0.0;
876     sum6  = 0.0;
877     sum7  = 0.0;
878     sum8  = 0.0;
879     sum9  = 0.0;
880     sum10 = 0.0;
881     sum11 = 0.0;
882 
883     for (j = 0; j < n; j++) {
884       xb = x + 11 * (idx[j]);
885 
886       for (k = 0; k < 11; k++) {
887         xv = xb[k];
888         sum1 += v[0] * xv;
889         sum2 += v[1] * xv;
890         sum3 += v[2] * xv;
891         sum4 += v[3] * xv;
892         sum5 += v[4] * xv;
893         sum6 += v[5] * xv;
894         sum7 += v[6] * xv;
895         sum8 += v[7] * xv;
896         sum9 += v[8] * xv;
897         sum10 += v[9] * xv;
898         sum11 += v[10] * xv;
899         v += 11;
900       }
901     }
902     if (usecprow) z = zarray + 11 * ridx[i];
903     z[0]  = sum1;
904     z[1]  = sum2;
905     z[2]  = sum3;
906     z[3]  = sum4;
907     z[4]  = sum5;
908     z[5]  = sum6;
909     z[6]  = sum7;
910     z[7]  = sum8;
911     z[8]  = sum9;
912     z[9]  = sum10;
913     z[10] = sum11;
914 
915     if (!usecprow) z += 11;
916   }
917 
918   PetscCall(VecRestoreArrayRead(xx, &x));
919   PetscCall(VecRestoreArrayWrite(zz, &zarray));
920   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
921   PetscFunctionReturn(0);
922 }
923 
924 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
925 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) {
926   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
927   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
928   const PetscScalar *x, *xb;
929   PetscScalar       *zarray, xv;
930   const MatScalar   *v;
931   const PetscInt    *ii, *ij = a->j, *idx;
932   PetscInt           mbs, i, j, k, n, *ridx = NULL;
933   PetscBool          usecprow = a->compressedrow.use;
934 
935   PetscFunctionBegin;
936   PetscCall(VecGetArrayRead(xx, &x));
937   PetscCall(VecGetArrayWrite(zz, &zarray));
938 
939   v = a->a;
940   if (usecprow) {
941     mbs  = a->compressedrow.nrows;
942     ii   = a->compressedrow.i;
943     ridx = a->compressedrow.rindex;
944     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
945   } else {
946     mbs = a->mbs;
947     ii  = a->i;
948     z   = zarray;
949   }
950 
951   for (i = 0; i < mbs; i++) {
952     n     = ii[i + 1] - ii[i];
953     idx   = ij + ii[i];
954     sum1  = 0.0;
955     sum2  = 0.0;
956     sum3  = 0.0;
957     sum4  = 0.0;
958     sum5  = 0.0;
959     sum6  = 0.0;
960     sum7  = 0.0;
961     sum8  = 0.0;
962     sum9  = 0.0;
963     sum10 = 0.0;
964     sum11 = 0.0;
965     sum12 = 0.0;
966 
967     for (j = 0; j < n; j++) {
968       xb = x + 12 * (idx[j]);
969 
970       for (k = 0; k < 12; k++) {
971         xv = xb[k];
972         sum1 += v[0] * xv;
973         sum2 += v[1] * xv;
974         sum3 += v[2] * xv;
975         sum4 += v[3] * xv;
976         sum5 += v[4] * xv;
977         sum6 += v[5] * xv;
978         sum7 += v[6] * xv;
979         sum8 += v[7] * xv;
980         sum9 += v[8] * xv;
981         sum10 += v[9] * xv;
982         sum11 += v[10] * xv;
983         sum12 += v[11] * xv;
984         v += 12;
985       }
986     }
987     if (usecprow) z = zarray + 12 * ridx[i];
988     z[0]  = sum1;
989     z[1]  = sum2;
990     z[2]  = sum3;
991     z[3]  = sum4;
992     z[4]  = sum5;
993     z[5]  = sum6;
994     z[6]  = sum7;
995     z[7]  = sum8;
996     z[8]  = sum9;
997     z[9]  = sum10;
998     z[10] = sum11;
999     z[11] = sum12;
1000     if (!usecprow) z += 12;
1001   }
1002   PetscCall(VecRestoreArrayRead(xx, &x));
1003   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1004   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1005   PetscFunctionReturn(0);
1006 }
1007 
1008 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) {
1009   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1010   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1011   const PetscScalar *x, *xb;
1012   PetscScalar       *zarray, *yarray, xv;
1013   const MatScalar   *v;
1014   const PetscInt    *ii, *ij = a->j, *idx;
1015   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
1016   PetscBool          usecprow = a->compressedrow.use;
1017 
1018   PetscFunctionBegin;
1019   PetscCall(VecGetArrayRead(xx, &x));
1020   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1021 
1022   v = a->a;
1023   if (usecprow) {
1024     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1025     mbs  = a->compressedrow.nrows;
1026     ii   = a->compressedrow.i;
1027     ridx = a->compressedrow.rindex;
1028   } else {
1029     ii = a->i;
1030     y  = yarray;
1031     z  = zarray;
1032   }
1033 
1034   for (i = 0; i < mbs; i++) {
1035     n   = ii[i + 1] - ii[i];
1036     idx = ij + ii[i];
1037 
1038     if (usecprow) {
1039       y = yarray + 12 * ridx[i];
1040       z = zarray + 12 * ridx[i];
1041     }
1042     sum1  = y[0];
1043     sum2  = y[1];
1044     sum3  = y[2];
1045     sum4  = y[3];
1046     sum5  = y[4];
1047     sum6  = y[5];
1048     sum7  = y[6];
1049     sum8  = y[7];
1050     sum9  = y[8];
1051     sum10 = y[9];
1052     sum11 = y[10];
1053     sum12 = y[11];
1054 
1055     for (j = 0; j < n; j++) {
1056       xb = x + 12 * (idx[j]);
1057 
1058       for (k = 0; k < 12; k++) {
1059         xv = xb[k];
1060         sum1 += v[0] * xv;
1061         sum2 += v[1] * xv;
1062         sum3 += v[2] * xv;
1063         sum4 += v[3] * xv;
1064         sum5 += v[4] * xv;
1065         sum6 += v[5] * xv;
1066         sum7 += v[6] * xv;
1067         sum8 += v[7] * xv;
1068         sum9 += v[8] * xv;
1069         sum10 += v[9] * xv;
1070         sum11 += v[10] * xv;
1071         sum12 += v[11] * xv;
1072         v += 12;
1073       }
1074     }
1075 
1076     z[0]  = sum1;
1077     z[1]  = sum2;
1078     z[2]  = sum3;
1079     z[3]  = sum4;
1080     z[4]  = sum5;
1081     z[5]  = sum6;
1082     z[6]  = sum7;
1083     z[7]  = sum8;
1084     z[8]  = sum9;
1085     z[9]  = sum10;
1086     z[10] = sum11;
1087     z[11] = sum12;
1088     if (!usecprow) {
1089       y += 12;
1090       z += 12;
1091     }
1092   }
1093   PetscCall(VecRestoreArrayRead(xx, &x));
1094   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1095   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1096   PetscFunctionReturn(0);
1097 }
1098 
1099 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1100 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) {
1101   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1102   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1103   const PetscScalar *x, *xb;
1104   PetscScalar        x1, x2, x3, x4, *zarray;
1105   const MatScalar   *v;
1106   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1107   PetscInt           mbs, i, j, n;
1108   PetscBool          usecprow = a->compressedrow.use;
1109 
1110   PetscFunctionBegin;
1111   PetscCall(VecGetArrayRead(xx, &x));
1112   PetscCall(VecGetArrayWrite(zz, &zarray));
1113 
1114   v = a->a;
1115   if (usecprow) {
1116     mbs  = a->compressedrow.nrows;
1117     ii   = a->compressedrow.i;
1118     ridx = a->compressedrow.rindex;
1119     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1120   } else {
1121     mbs = a->mbs;
1122     ii  = a->i;
1123     z   = zarray;
1124   }
1125 
1126   for (i = 0; i < mbs; i++) {
1127     n   = ii[i + 1] - ii[i];
1128     idx = ij + ii[i];
1129 
1130     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1131     for (j = 0; j < n; j++) {
1132       xb = x + 12 * (idx[j]);
1133       x1 = xb[0];
1134       x2 = xb[1];
1135       x3 = xb[2];
1136       x4 = xb[3];
1137 
1138       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1139       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1140       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1141       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1142       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1143       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1144       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1145       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1146       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1147       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1148       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1149       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1150       v += 48;
1151 
1152       x1 = xb[4];
1153       x2 = xb[5];
1154       x3 = xb[6];
1155       x4 = xb[7];
1156 
1157       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1158       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1159       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1160       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1161       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1162       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1163       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1164       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1165       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1166       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1167       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1168       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1169       v += 48;
1170 
1171       x1 = xb[8];
1172       x2 = xb[9];
1173       x3 = xb[10];
1174       x4 = xb[11];
1175       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1176       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1177       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1178       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1179       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1180       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1181       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1182       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1183       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1184       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1185       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1186       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1187       v += 48;
1188     }
1189     if (usecprow) z = zarray + 12 * ridx[i];
1190     z[0]  = sum1;
1191     z[1]  = sum2;
1192     z[2]  = sum3;
1193     z[3]  = sum4;
1194     z[4]  = sum5;
1195     z[5]  = sum6;
1196     z[6]  = sum7;
1197     z[7]  = sum8;
1198     z[8]  = sum9;
1199     z[9]  = sum10;
1200     z[10] = sum11;
1201     z[11] = sum12;
1202     if (!usecprow) z += 12;
1203   }
1204   PetscCall(VecRestoreArrayRead(xx, &x));
1205   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1206   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1207   PetscFunctionReturn(0);
1208 }
1209 
1210 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1211 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) {
1212   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1213   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1214   const PetscScalar *x, *xb;
1215   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1216   const MatScalar   *v;
1217   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1218   PetscInt           mbs      = a->mbs, i, j, n;
1219   PetscBool          usecprow = a->compressedrow.use;
1220 
1221   PetscFunctionBegin;
1222   PetscCall(VecGetArrayRead(xx, &x));
1223   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1224 
1225   v = a->a;
1226   if (usecprow) {
1227     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1228     mbs  = a->compressedrow.nrows;
1229     ii   = a->compressedrow.i;
1230     ridx = a->compressedrow.rindex;
1231   } else {
1232     ii = a->i;
1233     y  = yarray;
1234     z  = zarray;
1235   }
1236 
1237   for (i = 0; i < mbs; i++) {
1238     n   = ii[i + 1] - ii[i];
1239     idx = ij + ii[i];
1240 
1241     if (usecprow) {
1242       y = yarray + 12 * ridx[i];
1243       z = zarray + 12 * ridx[i];
1244     }
1245     sum1  = y[0];
1246     sum2  = y[1];
1247     sum3  = y[2];
1248     sum4  = y[3];
1249     sum5  = y[4];
1250     sum6  = y[5];
1251     sum7  = y[6];
1252     sum8  = y[7];
1253     sum9  = y[8];
1254     sum10 = y[9];
1255     sum11 = y[10];
1256     sum12 = y[11];
1257 
1258     for (j = 0; j < n; j++) {
1259       xb = x + 12 * (idx[j]);
1260       x1 = xb[0];
1261       x2 = xb[1];
1262       x3 = xb[2];
1263       x4 = xb[3];
1264 
1265       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1266       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1267       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1268       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1269       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1270       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1271       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1272       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1273       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1274       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1275       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1276       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1277       v += 48;
1278 
1279       x1 = xb[4];
1280       x2 = xb[5];
1281       x3 = xb[6];
1282       x4 = xb[7];
1283 
1284       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1285       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1286       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1287       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1288       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1289       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1290       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1291       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1292       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1293       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1294       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1295       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1296       v += 48;
1297 
1298       x1 = xb[8];
1299       x2 = xb[9];
1300       x3 = xb[10];
1301       x4 = xb[11];
1302       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1303       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1304       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1305       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1306       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1307       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1308       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1309       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1310       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1311       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1312       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1313       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1314       v += 48;
1315     }
1316     z[0]  = sum1;
1317     z[1]  = sum2;
1318     z[2]  = sum3;
1319     z[3]  = sum4;
1320     z[4]  = sum5;
1321     z[5]  = sum6;
1322     z[6]  = sum7;
1323     z[7]  = sum8;
1324     z[8]  = sum9;
1325     z[9]  = sum10;
1326     z[10] = sum11;
1327     z[11] = sum12;
1328     if (!usecprow) {
1329       y += 12;
1330       z += 12;
1331     }
1332   }
1333   PetscCall(VecRestoreArrayRead(xx, &x));
1334   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1335   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1336   PetscFunctionReturn(0);
1337 }
1338 
1339 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1340 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) {
1341   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1342   PetscScalar       *z = NULL, *zarray;
1343   const PetscScalar *x, *work;
1344   const MatScalar   *v = a->a;
1345   PetscInt           mbs, i, j, n;
1346   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1347   PetscBool          usecprow = a->compressedrow.use;
1348   const PetscInt     bs = 12, bs2 = 144;
1349 
1350   __m256d a0, a1, a2, a3, a4, a5;
1351   __m256d w0, w1, w2, w3;
1352   __m256d z0, z1, z2;
1353 
1354   PetscFunctionBegin;
1355   PetscCall(VecGetArrayRead(xx, &x));
1356   PetscCall(VecGetArrayWrite(zz, &zarray));
1357 
1358   if (usecprow) {
1359     mbs  = a->compressedrow.nrows;
1360     ii   = a->compressedrow.i;
1361     ridx = a->compressedrow.rindex;
1362     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1363   } else {
1364     mbs = a->mbs;
1365     ii  = a->i;
1366     z   = zarray;
1367   }
1368 
1369   for (i = 0; i < mbs; i++) {
1370     z0 = _mm256_setzero_pd();
1371     z1 = _mm256_setzero_pd();
1372     z2 = _mm256_setzero_pd();
1373 
1374     n = ii[1] - ii[0];
1375     ii++;
1376     for (j = 0; j < n; j++) {
1377       work = x + bs * (*idx++);
1378 
1379       /* first column of a */
1380       w0 = _mm256_set1_pd(work[0]);
1381       a0 = _mm256_loadu_pd(v + 0);
1382       z0 = _mm256_fmadd_pd(a0, w0, z0);
1383       a1 = _mm256_loadu_pd(v + 4);
1384       z1 = _mm256_fmadd_pd(a1, w0, z1);
1385       a2 = _mm256_loadu_pd(v + 8);
1386       z2 = _mm256_fmadd_pd(a2, w0, z2);
1387 
1388       /* second column of a */
1389       w1 = _mm256_set1_pd(work[1]);
1390       a3 = _mm256_loadu_pd(v + 12);
1391       z0 = _mm256_fmadd_pd(a3, w1, z0);
1392       a4 = _mm256_loadu_pd(v + 16);
1393       z1 = _mm256_fmadd_pd(a4, w1, z1);
1394       a5 = _mm256_loadu_pd(v + 20);
1395       z2 = _mm256_fmadd_pd(a5, w1, z2);
1396 
1397       /* third column of a */
1398       w2 = _mm256_set1_pd(work[2]);
1399       a0 = _mm256_loadu_pd(v + 24);
1400       z0 = _mm256_fmadd_pd(a0, w2, z0);
1401       a1 = _mm256_loadu_pd(v + 28);
1402       z1 = _mm256_fmadd_pd(a1, w2, z1);
1403       a2 = _mm256_loadu_pd(v + 32);
1404       z2 = _mm256_fmadd_pd(a2, w2, z2);
1405 
1406       /* fourth column of a */
1407       w3 = _mm256_set1_pd(work[3]);
1408       a3 = _mm256_loadu_pd(v + 36);
1409       z0 = _mm256_fmadd_pd(a3, w3, z0);
1410       a4 = _mm256_loadu_pd(v + 40);
1411       z1 = _mm256_fmadd_pd(a4, w3, z1);
1412       a5 = _mm256_loadu_pd(v + 44);
1413       z2 = _mm256_fmadd_pd(a5, w3, z2);
1414 
1415       /* fifth column of a */
1416       w0 = _mm256_set1_pd(work[4]);
1417       a0 = _mm256_loadu_pd(v + 48);
1418       z0 = _mm256_fmadd_pd(a0, w0, z0);
1419       a1 = _mm256_loadu_pd(v + 52);
1420       z1 = _mm256_fmadd_pd(a1, w0, z1);
1421       a2 = _mm256_loadu_pd(v + 56);
1422       z2 = _mm256_fmadd_pd(a2, w0, z2);
1423 
1424       /* sixth column of a */
1425       w1 = _mm256_set1_pd(work[5]);
1426       a3 = _mm256_loadu_pd(v + 60);
1427       z0 = _mm256_fmadd_pd(a3, w1, z0);
1428       a4 = _mm256_loadu_pd(v + 64);
1429       z1 = _mm256_fmadd_pd(a4, w1, z1);
1430       a5 = _mm256_loadu_pd(v + 68);
1431       z2 = _mm256_fmadd_pd(a5, w1, z2);
1432 
1433       /* seventh column of a */
1434       w2 = _mm256_set1_pd(work[6]);
1435       a0 = _mm256_loadu_pd(v + 72);
1436       z0 = _mm256_fmadd_pd(a0, w2, z0);
1437       a1 = _mm256_loadu_pd(v + 76);
1438       z1 = _mm256_fmadd_pd(a1, w2, z1);
1439       a2 = _mm256_loadu_pd(v + 80);
1440       z2 = _mm256_fmadd_pd(a2, w2, z2);
1441 
1442       /* eighth column of a */
1443       w3 = _mm256_set1_pd(work[7]);
1444       a3 = _mm256_loadu_pd(v + 84);
1445       z0 = _mm256_fmadd_pd(a3, w3, z0);
1446       a4 = _mm256_loadu_pd(v + 88);
1447       z1 = _mm256_fmadd_pd(a4, w3, z1);
1448       a5 = _mm256_loadu_pd(v + 92);
1449       z2 = _mm256_fmadd_pd(a5, w3, z2);
1450 
1451       /* ninth column of a */
1452       w0 = _mm256_set1_pd(work[8]);
1453       a0 = _mm256_loadu_pd(v + 96);
1454       z0 = _mm256_fmadd_pd(a0, w0, z0);
1455       a1 = _mm256_loadu_pd(v + 100);
1456       z1 = _mm256_fmadd_pd(a1, w0, z1);
1457       a2 = _mm256_loadu_pd(v + 104);
1458       z2 = _mm256_fmadd_pd(a2, w0, z2);
1459 
1460       /* tenth column of a */
1461       w1 = _mm256_set1_pd(work[9]);
1462       a3 = _mm256_loadu_pd(v + 108);
1463       z0 = _mm256_fmadd_pd(a3, w1, z0);
1464       a4 = _mm256_loadu_pd(v + 112);
1465       z1 = _mm256_fmadd_pd(a4, w1, z1);
1466       a5 = _mm256_loadu_pd(v + 116);
1467       z2 = _mm256_fmadd_pd(a5, w1, z2);
1468 
1469       /* eleventh column of a */
1470       w2 = _mm256_set1_pd(work[10]);
1471       a0 = _mm256_loadu_pd(v + 120);
1472       z0 = _mm256_fmadd_pd(a0, w2, z0);
1473       a1 = _mm256_loadu_pd(v + 124);
1474       z1 = _mm256_fmadd_pd(a1, w2, z1);
1475       a2 = _mm256_loadu_pd(v + 128);
1476       z2 = _mm256_fmadd_pd(a2, w2, z2);
1477 
1478       /* twelveth column of a */
1479       w3 = _mm256_set1_pd(work[11]);
1480       a3 = _mm256_loadu_pd(v + 132);
1481       z0 = _mm256_fmadd_pd(a3, w3, z0);
1482       a4 = _mm256_loadu_pd(v + 136);
1483       z1 = _mm256_fmadd_pd(a4, w3, z1);
1484       a5 = _mm256_loadu_pd(v + 140);
1485       z2 = _mm256_fmadd_pd(a5, w3, z2);
1486 
1487       v += bs2;
1488     }
1489     if (usecprow) z = zarray + bs * ridx[i];
1490     _mm256_storeu_pd(&z[0], z0);
1491     _mm256_storeu_pd(&z[4], z1);
1492     _mm256_storeu_pd(&z[8], z2);
1493     if (!usecprow) z += bs;
1494   }
1495   PetscCall(VecRestoreArrayRead(xx, &x));
1496   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1497   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1498   PetscFunctionReturn(0);
1499 }
1500 #endif
1501 
1502 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1503 /* Default MatMult for block size 15 */
1504 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) {
1505   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1506   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1507   const PetscScalar *x, *xb;
1508   PetscScalar       *zarray, xv;
1509   const MatScalar   *v;
1510   const PetscInt    *ii, *ij = a->j, *idx;
1511   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1512   PetscBool          usecprow = a->compressedrow.use;
1513 
1514   PetscFunctionBegin;
1515   PetscCall(VecGetArrayRead(xx, &x));
1516   PetscCall(VecGetArrayWrite(zz, &zarray));
1517 
1518   v = a->a;
1519   if (usecprow) {
1520     mbs  = a->compressedrow.nrows;
1521     ii   = a->compressedrow.i;
1522     ridx = a->compressedrow.rindex;
1523     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1524   } else {
1525     mbs = a->mbs;
1526     ii  = a->i;
1527     z   = zarray;
1528   }
1529 
1530   for (i = 0; i < mbs; i++) {
1531     n     = ii[i + 1] - ii[i];
1532     idx   = ij + ii[i];
1533     sum1  = 0.0;
1534     sum2  = 0.0;
1535     sum3  = 0.0;
1536     sum4  = 0.0;
1537     sum5  = 0.0;
1538     sum6  = 0.0;
1539     sum7  = 0.0;
1540     sum8  = 0.0;
1541     sum9  = 0.0;
1542     sum10 = 0.0;
1543     sum11 = 0.0;
1544     sum12 = 0.0;
1545     sum13 = 0.0;
1546     sum14 = 0.0;
1547     sum15 = 0.0;
1548 
1549     for (j = 0; j < n; j++) {
1550       xb = x + 15 * (idx[j]);
1551 
1552       for (k = 0; k < 15; k++) {
1553         xv = xb[k];
1554         sum1 += v[0] * xv;
1555         sum2 += v[1] * xv;
1556         sum3 += v[2] * xv;
1557         sum4 += v[3] * xv;
1558         sum5 += v[4] * xv;
1559         sum6 += v[5] * xv;
1560         sum7 += v[6] * xv;
1561         sum8 += v[7] * xv;
1562         sum9 += v[8] * xv;
1563         sum10 += v[9] * xv;
1564         sum11 += v[10] * xv;
1565         sum12 += v[11] * xv;
1566         sum13 += v[12] * xv;
1567         sum14 += v[13] * xv;
1568         sum15 += v[14] * xv;
1569         v += 15;
1570       }
1571     }
1572     if (usecprow) z = zarray + 15 * ridx[i];
1573     z[0]  = sum1;
1574     z[1]  = sum2;
1575     z[2]  = sum3;
1576     z[3]  = sum4;
1577     z[4]  = sum5;
1578     z[5]  = sum6;
1579     z[6]  = sum7;
1580     z[7]  = sum8;
1581     z[8]  = sum9;
1582     z[9]  = sum10;
1583     z[10] = sum11;
1584     z[11] = sum12;
1585     z[12] = sum13;
1586     z[13] = sum14;
1587     z[14] = sum15;
1588 
1589     if (!usecprow) z += 15;
1590   }
1591 
1592   PetscCall(VecRestoreArrayRead(xx, &x));
1593   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1594   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1595   PetscFunctionReturn(0);
1596 }
1597 
1598 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1599 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) {
1600   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1601   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1602   const PetscScalar *x, *xb;
1603   PetscScalar        x1, x2, x3, x4, *zarray;
1604   const MatScalar   *v;
1605   const PetscInt    *ii, *ij = a->j, *idx;
1606   PetscInt           mbs, i, j, n, *ridx = NULL;
1607   PetscBool          usecprow = a->compressedrow.use;
1608 
1609   PetscFunctionBegin;
1610   PetscCall(VecGetArrayRead(xx, &x));
1611   PetscCall(VecGetArrayWrite(zz, &zarray));
1612 
1613   v = a->a;
1614   if (usecprow) {
1615     mbs  = a->compressedrow.nrows;
1616     ii   = a->compressedrow.i;
1617     ridx = a->compressedrow.rindex;
1618     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1619   } else {
1620     mbs = a->mbs;
1621     ii  = a->i;
1622     z   = zarray;
1623   }
1624 
1625   for (i = 0; i < mbs; i++) {
1626     n     = ii[i + 1] - ii[i];
1627     idx   = ij + ii[i];
1628     sum1  = 0.0;
1629     sum2  = 0.0;
1630     sum3  = 0.0;
1631     sum4  = 0.0;
1632     sum5  = 0.0;
1633     sum6  = 0.0;
1634     sum7  = 0.0;
1635     sum8  = 0.0;
1636     sum9  = 0.0;
1637     sum10 = 0.0;
1638     sum11 = 0.0;
1639     sum12 = 0.0;
1640     sum13 = 0.0;
1641     sum14 = 0.0;
1642     sum15 = 0.0;
1643 
1644     for (j = 0; j < n; j++) {
1645       xb = x + 15 * (idx[j]);
1646       x1 = xb[0];
1647       x2 = xb[1];
1648       x3 = xb[2];
1649       x4 = xb[3];
1650 
1651       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1652       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1653       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1654       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1655       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1656       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1657       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1658       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1659       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1660       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1661       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1662       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1663       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1664       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1665       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1666 
1667       v += 60;
1668 
1669       x1 = xb[4];
1670       x2 = xb[5];
1671       x3 = xb[6];
1672       x4 = xb[7];
1673 
1674       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1675       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1676       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1677       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1678       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1679       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1680       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1681       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1682       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1683       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1684       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1685       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1686       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1687       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1688       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1689       v += 60;
1690 
1691       x1 = xb[8];
1692       x2 = xb[9];
1693       x3 = xb[10];
1694       x4 = xb[11];
1695       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1696       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1697       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1698       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1699       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1700       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1701       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1702       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1703       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1704       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1705       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1706       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1707       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1708       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1709       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1710       v += 60;
1711 
1712       x1 = xb[12];
1713       x2 = xb[13];
1714       x3 = xb[14];
1715       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1716       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1717       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1718       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1719       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1720       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1721       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1722       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1723       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1724       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1725       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1726       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1727       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1728       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1729       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1730       v += 45;
1731     }
1732     if (usecprow) z = zarray + 15 * ridx[i];
1733     z[0]  = sum1;
1734     z[1]  = sum2;
1735     z[2]  = sum3;
1736     z[3]  = sum4;
1737     z[4]  = sum5;
1738     z[5]  = sum6;
1739     z[6]  = sum7;
1740     z[7]  = sum8;
1741     z[8]  = sum9;
1742     z[9]  = sum10;
1743     z[10] = sum11;
1744     z[11] = sum12;
1745     z[12] = sum13;
1746     z[13] = sum14;
1747     z[14] = sum15;
1748 
1749     if (!usecprow) z += 15;
1750   }
1751 
1752   PetscCall(VecRestoreArrayRead(xx, &x));
1753   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1754   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1755   PetscFunctionReturn(0);
1756 }
1757 
1758 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1759 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) {
1760   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1761   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1762   const PetscScalar *x, *xb;
1763   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1764   const MatScalar   *v;
1765   const PetscInt    *ii, *ij = a->j, *idx;
1766   PetscInt           mbs, i, j, n, *ridx = NULL;
1767   PetscBool          usecprow = a->compressedrow.use;
1768 
1769   PetscFunctionBegin;
1770   PetscCall(VecGetArrayRead(xx, &x));
1771   PetscCall(VecGetArrayWrite(zz, &zarray));
1772 
1773   v = a->a;
1774   if (usecprow) {
1775     mbs  = a->compressedrow.nrows;
1776     ii   = a->compressedrow.i;
1777     ridx = a->compressedrow.rindex;
1778     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1779   } else {
1780     mbs = a->mbs;
1781     ii  = a->i;
1782     z   = zarray;
1783   }
1784 
1785   for (i = 0; i < mbs; i++) {
1786     n     = ii[i + 1] - ii[i];
1787     idx   = ij + ii[i];
1788     sum1  = 0.0;
1789     sum2  = 0.0;
1790     sum3  = 0.0;
1791     sum4  = 0.0;
1792     sum5  = 0.0;
1793     sum6  = 0.0;
1794     sum7  = 0.0;
1795     sum8  = 0.0;
1796     sum9  = 0.0;
1797     sum10 = 0.0;
1798     sum11 = 0.0;
1799     sum12 = 0.0;
1800     sum13 = 0.0;
1801     sum14 = 0.0;
1802     sum15 = 0.0;
1803 
1804     for (j = 0; j < n; j++) {
1805       xb = x + 15 * (idx[j]);
1806       x1 = xb[0];
1807       x2 = xb[1];
1808       x3 = xb[2];
1809       x4 = xb[3];
1810       x5 = xb[4];
1811       x6 = xb[5];
1812       x7 = xb[6];
1813       x8 = xb[7];
1814 
1815       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1816       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1817       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1818       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1819       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1820       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1821       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1822       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1823       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1824       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1825       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1826       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1827       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1828       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1829       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1830       v += 120;
1831 
1832       x1 = xb[8];
1833       x2 = xb[9];
1834       x3 = xb[10];
1835       x4 = xb[11];
1836       x5 = xb[12];
1837       x6 = xb[13];
1838       x7 = xb[14];
1839 
1840       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1841       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1842       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1843       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1844       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1845       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1846       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1847       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1848       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1849       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1850       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1851       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1852       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1853       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1854       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1855       v += 105;
1856     }
1857     if (usecprow) z = zarray + 15 * ridx[i];
1858     z[0]  = sum1;
1859     z[1]  = sum2;
1860     z[2]  = sum3;
1861     z[3]  = sum4;
1862     z[4]  = sum5;
1863     z[5]  = sum6;
1864     z[6]  = sum7;
1865     z[7]  = sum8;
1866     z[8]  = sum9;
1867     z[9]  = sum10;
1868     z[10] = sum11;
1869     z[11] = sum12;
1870     z[12] = sum13;
1871     z[13] = sum14;
1872     z[14] = sum15;
1873 
1874     if (!usecprow) z += 15;
1875   }
1876 
1877   PetscCall(VecRestoreArrayRead(xx, &x));
1878   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1879   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1880   PetscFunctionReturn(0);
1881 }
1882 
1883 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1884 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) {
1885   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1886   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1887   const PetscScalar *x, *xb;
1888   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1889   const MatScalar   *v;
1890   const PetscInt    *ii, *ij = a->j, *idx;
1891   PetscInt           mbs, i, j, n, *ridx = NULL;
1892   PetscBool          usecprow = a->compressedrow.use;
1893 
1894   PetscFunctionBegin;
1895   PetscCall(VecGetArrayRead(xx, &x));
1896   PetscCall(VecGetArrayWrite(zz, &zarray));
1897 
1898   v = a->a;
1899   if (usecprow) {
1900     mbs  = a->compressedrow.nrows;
1901     ii   = a->compressedrow.i;
1902     ridx = a->compressedrow.rindex;
1903     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1904   } else {
1905     mbs = a->mbs;
1906     ii  = a->i;
1907     z   = zarray;
1908   }
1909 
1910   for (i = 0; i < mbs; i++) {
1911     n     = ii[i + 1] - ii[i];
1912     idx   = ij + ii[i];
1913     sum1  = 0.0;
1914     sum2  = 0.0;
1915     sum3  = 0.0;
1916     sum4  = 0.0;
1917     sum5  = 0.0;
1918     sum6  = 0.0;
1919     sum7  = 0.0;
1920     sum8  = 0.0;
1921     sum9  = 0.0;
1922     sum10 = 0.0;
1923     sum11 = 0.0;
1924     sum12 = 0.0;
1925     sum13 = 0.0;
1926     sum14 = 0.0;
1927     sum15 = 0.0;
1928 
1929     for (j = 0; j < n; j++) {
1930       xb  = x + 15 * (idx[j]);
1931       x1  = xb[0];
1932       x2  = xb[1];
1933       x3  = xb[2];
1934       x4  = xb[3];
1935       x5  = xb[4];
1936       x6  = xb[5];
1937       x7  = xb[6];
1938       x8  = xb[7];
1939       x9  = xb[8];
1940       x10 = xb[9];
1941       x11 = xb[10];
1942       x12 = xb[11];
1943       x13 = xb[12];
1944       x14 = xb[13];
1945       x15 = xb[14];
1946 
1947       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1948       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1949       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1950       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1951       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1952       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1953       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1954       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1955       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1956       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1957       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1958       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1959       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1960       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1961       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1962       v += 225;
1963     }
1964     if (usecprow) z = zarray + 15 * ridx[i];
1965     z[0]  = sum1;
1966     z[1]  = sum2;
1967     z[2]  = sum3;
1968     z[3]  = sum4;
1969     z[4]  = sum5;
1970     z[5]  = sum6;
1971     z[6]  = sum7;
1972     z[7]  = sum8;
1973     z[8]  = sum9;
1974     z[9]  = sum10;
1975     z[10] = sum11;
1976     z[11] = sum12;
1977     z[12] = sum13;
1978     z[13] = sum14;
1979     z[14] = sum15;
1980 
1981     if (!usecprow) z += 15;
1982   }
1983 
1984   PetscCall(VecRestoreArrayRead(xx, &x));
1985   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1986   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1987   PetscFunctionReturn(0);
1988 }
1989 
1990 /*
1991     This will not work with MatScalar == float because it calls the BLAS
1992 */
1993 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) {
1994   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1995   PetscScalar       *z = NULL, *work, *workt, *zarray;
1996   const PetscScalar *x, *xb;
1997   const MatScalar   *v;
1998   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1999   const PetscInt    *idx, *ii, *ridx = NULL;
2000   PetscInt           ncols, k;
2001   PetscBool          usecprow = a->compressedrow.use;
2002 
2003   PetscFunctionBegin;
2004   PetscCall(VecGetArrayRead(xx, &x));
2005   PetscCall(VecGetArrayWrite(zz, &zarray));
2006 
2007   idx = a->j;
2008   v   = a->a;
2009   if (usecprow) {
2010     mbs  = a->compressedrow.nrows;
2011     ii   = a->compressedrow.i;
2012     ridx = a->compressedrow.rindex;
2013     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2014   } else {
2015     mbs = a->mbs;
2016     ii  = a->i;
2017     z   = zarray;
2018   }
2019 
2020   if (!a->mult_work) {
2021     k = PetscMax(A->rmap->n, A->cmap->n);
2022     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2023   }
2024   work = a->mult_work;
2025   for (i = 0; i < mbs; i++) {
2026     n = ii[1] - ii[0];
2027     ii++;
2028     ncols = n * bs;
2029     workt = work;
2030     for (j = 0; j < n; j++) {
2031       xb = x + bs * (*idx++);
2032       for (k = 0; k < bs; k++) workt[k] = xb[k];
2033       workt += bs;
2034     }
2035     if (usecprow) z = zarray + bs * ridx[i];
2036     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2037     v += n * bs2;
2038     if (!usecprow) z += bs;
2039   }
2040   PetscCall(VecRestoreArrayRead(xx, &x));
2041   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2042   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2043   PetscFunctionReturn(0);
2044 }
2045 
2046 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) {
2047   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2048   const PetscScalar *x;
2049   PetscScalar       *y, *z, sum;
2050   const MatScalar   *v;
2051   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2052   const PetscInt    *idx, *ii;
2053   PetscBool          usecprow = a->compressedrow.use;
2054 
2055   PetscFunctionBegin;
2056   PetscCall(VecGetArrayRead(xx, &x));
2057   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
2058 
2059   idx = a->j;
2060   v   = a->a;
2061   if (usecprow) {
2062     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2063     mbs  = a->compressedrow.nrows;
2064     ii   = a->compressedrow.i;
2065     ridx = a->compressedrow.rindex;
2066   } else {
2067     ii = a->i;
2068   }
2069 
2070   for (i = 0; i < mbs; i++) {
2071     n = ii[1] - ii[0];
2072     ii++;
2073     if (!usecprow) {
2074       sum = y[i];
2075     } else {
2076       sum = y[ridx[i]];
2077     }
2078     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2079     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2080     PetscSparseDensePlusDot(sum, x, v, idx, n);
2081     v += n;
2082     idx += n;
2083     if (usecprow) {
2084       z[ridx[i]] = sum;
2085     } else {
2086       z[i] = sum;
2087     }
2088   }
2089   PetscCall(VecRestoreArrayRead(xx, &x));
2090   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2091   PetscCall(PetscLogFlops(2.0 * a->nz));
2092   PetscFunctionReturn(0);
2093 }
2094 
2095 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) {
2096   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2097   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2098   const PetscScalar *x, *xb;
2099   PetscScalar        x1, x2, *yarray, *zarray;
2100   const MatScalar   *v;
2101   PetscInt           mbs = a->mbs, i, n, j;
2102   const PetscInt    *idx, *ii, *ridx = NULL;
2103   PetscBool          usecprow = a->compressedrow.use;
2104 
2105   PetscFunctionBegin;
2106   PetscCall(VecGetArrayRead(xx, &x));
2107   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2108 
2109   idx = a->j;
2110   v   = a->a;
2111   if (usecprow) {
2112     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2113     mbs  = a->compressedrow.nrows;
2114     ii   = a->compressedrow.i;
2115     ridx = a->compressedrow.rindex;
2116   } else {
2117     ii = a->i;
2118     y  = yarray;
2119     z  = zarray;
2120   }
2121 
2122   for (i = 0; i < mbs; i++) {
2123     n = ii[1] - ii[0];
2124     ii++;
2125     if (usecprow) {
2126       z = zarray + 2 * ridx[i];
2127       y = yarray + 2 * ridx[i];
2128     }
2129     sum1 = y[0];
2130     sum2 = y[1];
2131     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2132     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2133     for (j = 0; j < n; j++) {
2134       xb = x + 2 * (*idx++);
2135       x1 = xb[0];
2136       x2 = xb[1];
2137 
2138       sum1 += v[0] * x1 + v[2] * x2;
2139       sum2 += v[1] * x1 + v[3] * x2;
2140       v += 4;
2141     }
2142     z[0] = sum1;
2143     z[1] = sum2;
2144     if (!usecprow) {
2145       z += 2;
2146       y += 2;
2147     }
2148   }
2149   PetscCall(VecRestoreArrayRead(xx, &x));
2150   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2151   PetscCall(PetscLogFlops(4.0 * a->nz));
2152   PetscFunctionReturn(0);
2153 }
2154 
2155 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) {
2156   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2157   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2158   const PetscScalar *x, *xb;
2159   const MatScalar   *v;
2160   PetscInt           mbs = a->mbs, i, j, n;
2161   const PetscInt    *idx, *ii, *ridx = NULL;
2162   PetscBool          usecprow = a->compressedrow.use;
2163 
2164   PetscFunctionBegin;
2165   PetscCall(VecGetArrayRead(xx, &x));
2166   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2167 
2168   idx = a->j;
2169   v   = a->a;
2170   if (usecprow) {
2171     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2172     mbs  = a->compressedrow.nrows;
2173     ii   = a->compressedrow.i;
2174     ridx = a->compressedrow.rindex;
2175   } else {
2176     ii = a->i;
2177     y  = yarray;
2178     z  = zarray;
2179   }
2180 
2181   for (i = 0; i < mbs; i++) {
2182     n = ii[1] - ii[0];
2183     ii++;
2184     if (usecprow) {
2185       z = zarray + 3 * ridx[i];
2186       y = yarray + 3 * ridx[i];
2187     }
2188     sum1 = y[0];
2189     sum2 = y[1];
2190     sum3 = y[2];
2191     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2192     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2193     for (j = 0; j < n; j++) {
2194       xb = x + 3 * (*idx++);
2195       x1 = xb[0];
2196       x2 = xb[1];
2197       x3 = xb[2];
2198       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2199       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2200       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2201       v += 9;
2202     }
2203     z[0] = sum1;
2204     z[1] = sum2;
2205     z[2] = sum3;
2206     if (!usecprow) {
2207       z += 3;
2208       y += 3;
2209     }
2210   }
2211   PetscCall(VecRestoreArrayRead(xx, &x));
2212   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2213   PetscCall(PetscLogFlops(18.0 * a->nz));
2214   PetscFunctionReturn(0);
2215 }
2216 
2217 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) {
2218   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2219   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2220   const PetscScalar *x, *xb;
2221   const MatScalar   *v;
2222   PetscInt           mbs = a->mbs, i, j, n;
2223   const PetscInt    *idx, *ii, *ridx = NULL;
2224   PetscBool          usecprow = a->compressedrow.use;
2225 
2226   PetscFunctionBegin;
2227   PetscCall(VecGetArrayRead(xx, &x));
2228   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2229 
2230   idx = a->j;
2231   v   = a->a;
2232   if (usecprow) {
2233     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2234     mbs  = a->compressedrow.nrows;
2235     ii   = a->compressedrow.i;
2236     ridx = a->compressedrow.rindex;
2237   } else {
2238     ii = a->i;
2239     y  = yarray;
2240     z  = zarray;
2241   }
2242 
2243   for (i = 0; i < mbs; i++) {
2244     n = ii[1] - ii[0];
2245     ii++;
2246     if (usecprow) {
2247       z = zarray + 4 * ridx[i];
2248       y = yarray + 4 * ridx[i];
2249     }
2250     sum1 = y[0];
2251     sum2 = y[1];
2252     sum3 = y[2];
2253     sum4 = y[3];
2254     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2255     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2256     for (j = 0; j < n; j++) {
2257       xb = x + 4 * (*idx++);
2258       x1 = xb[0];
2259       x2 = xb[1];
2260       x3 = xb[2];
2261       x4 = xb[3];
2262       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2263       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2264       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2265       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2266       v += 16;
2267     }
2268     z[0] = sum1;
2269     z[1] = sum2;
2270     z[2] = sum3;
2271     z[3] = sum4;
2272     if (!usecprow) {
2273       z += 4;
2274       y += 4;
2275     }
2276   }
2277   PetscCall(VecRestoreArrayRead(xx, &x));
2278   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2279   PetscCall(PetscLogFlops(32.0 * a->nz));
2280   PetscFunctionReturn(0);
2281 }
2282 
2283 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) {
2284   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2285   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2286   const PetscScalar *x, *xb;
2287   PetscScalar       *yarray, *zarray;
2288   const MatScalar   *v;
2289   PetscInt           mbs = a->mbs, i, j, n;
2290   const PetscInt    *idx, *ii, *ridx = NULL;
2291   PetscBool          usecprow = a->compressedrow.use;
2292 
2293   PetscFunctionBegin;
2294   PetscCall(VecGetArrayRead(xx, &x));
2295   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2296 
2297   idx = a->j;
2298   v   = a->a;
2299   if (usecprow) {
2300     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2301     mbs  = a->compressedrow.nrows;
2302     ii   = a->compressedrow.i;
2303     ridx = a->compressedrow.rindex;
2304   } else {
2305     ii = a->i;
2306     y  = yarray;
2307     z  = zarray;
2308   }
2309 
2310   for (i = 0; i < mbs; i++) {
2311     n = ii[1] - ii[0];
2312     ii++;
2313     if (usecprow) {
2314       z = zarray + 5 * ridx[i];
2315       y = yarray + 5 * ridx[i];
2316     }
2317     sum1 = y[0];
2318     sum2 = y[1];
2319     sum3 = y[2];
2320     sum4 = y[3];
2321     sum5 = y[4];
2322     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2323     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2324     for (j = 0; j < n; j++) {
2325       xb = x + 5 * (*idx++);
2326       x1 = xb[0];
2327       x2 = xb[1];
2328       x3 = xb[2];
2329       x4 = xb[3];
2330       x5 = xb[4];
2331       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2332       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2333       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2334       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2335       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2336       v += 25;
2337     }
2338     z[0] = sum1;
2339     z[1] = sum2;
2340     z[2] = sum3;
2341     z[3] = sum4;
2342     z[4] = sum5;
2343     if (!usecprow) {
2344       z += 5;
2345       y += 5;
2346     }
2347   }
2348   PetscCall(VecRestoreArrayRead(xx, &x));
2349   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2350   PetscCall(PetscLogFlops(50.0 * a->nz));
2351   PetscFunctionReturn(0);
2352 }
2353 
2354 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) {
2355   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2356   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2357   const PetscScalar *x, *xb;
2358   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2359   const MatScalar   *v;
2360   PetscInt           mbs = a->mbs, i, j, n;
2361   const PetscInt    *idx, *ii, *ridx = NULL;
2362   PetscBool          usecprow = a->compressedrow.use;
2363 
2364   PetscFunctionBegin;
2365   PetscCall(VecGetArrayRead(xx, &x));
2366   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2367 
2368   idx = a->j;
2369   v   = a->a;
2370   if (usecprow) {
2371     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2372     mbs  = a->compressedrow.nrows;
2373     ii   = a->compressedrow.i;
2374     ridx = a->compressedrow.rindex;
2375   } else {
2376     ii = a->i;
2377     y  = yarray;
2378     z  = zarray;
2379   }
2380 
2381   for (i = 0; i < mbs; i++) {
2382     n = ii[1] - ii[0];
2383     ii++;
2384     if (usecprow) {
2385       z = zarray + 6 * ridx[i];
2386       y = yarray + 6 * ridx[i];
2387     }
2388     sum1 = y[0];
2389     sum2 = y[1];
2390     sum3 = y[2];
2391     sum4 = y[3];
2392     sum5 = y[4];
2393     sum6 = y[5];
2394     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2395     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2396     for (j = 0; j < n; j++) {
2397       xb = x + 6 * (*idx++);
2398       x1 = xb[0];
2399       x2 = xb[1];
2400       x3 = xb[2];
2401       x4 = xb[3];
2402       x5 = xb[4];
2403       x6 = xb[5];
2404       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2405       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2406       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2407       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2408       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2409       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2410       v += 36;
2411     }
2412     z[0] = sum1;
2413     z[1] = sum2;
2414     z[2] = sum3;
2415     z[3] = sum4;
2416     z[4] = sum5;
2417     z[5] = sum6;
2418     if (!usecprow) {
2419       z += 6;
2420       y += 6;
2421     }
2422   }
2423   PetscCall(VecRestoreArrayRead(xx, &x));
2424   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2425   PetscCall(PetscLogFlops(72.0 * a->nz));
2426   PetscFunctionReturn(0);
2427 }
2428 
2429 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) {
2430   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2431   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2432   const PetscScalar *x, *xb;
2433   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2434   const MatScalar   *v;
2435   PetscInt           mbs = a->mbs, i, j, n;
2436   const PetscInt    *idx, *ii, *ridx = NULL;
2437   PetscBool          usecprow = a->compressedrow.use;
2438 
2439   PetscFunctionBegin;
2440   PetscCall(VecGetArrayRead(xx, &x));
2441   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2442 
2443   idx = a->j;
2444   v   = a->a;
2445   if (usecprow) {
2446     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2447     mbs  = a->compressedrow.nrows;
2448     ii   = a->compressedrow.i;
2449     ridx = a->compressedrow.rindex;
2450   } else {
2451     ii = a->i;
2452     y  = yarray;
2453     z  = zarray;
2454   }
2455 
2456   for (i = 0; i < mbs; i++) {
2457     n = ii[1] - ii[0];
2458     ii++;
2459     if (usecprow) {
2460       z = zarray + 7 * ridx[i];
2461       y = yarray + 7 * ridx[i];
2462     }
2463     sum1 = y[0];
2464     sum2 = y[1];
2465     sum3 = y[2];
2466     sum4 = y[3];
2467     sum5 = y[4];
2468     sum6 = y[5];
2469     sum7 = y[6];
2470     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2471     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2472     for (j = 0; j < n; j++) {
2473       xb = x + 7 * (*idx++);
2474       x1 = xb[0];
2475       x2 = xb[1];
2476       x3 = xb[2];
2477       x4 = xb[3];
2478       x5 = xb[4];
2479       x6 = xb[5];
2480       x7 = xb[6];
2481       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2482       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2483       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2484       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2485       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2486       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2487       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2488       v += 49;
2489     }
2490     z[0] = sum1;
2491     z[1] = sum2;
2492     z[2] = sum3;
2493     z[3] = sum4;
2494     z[4] = sum5;
2495     z[5] = sum6;
2496     z[6] = sum7;
2497     if (!usecprow) {
2498       z += 7;
2499       y += 7;
2500     }
2501   }
2502   PetscCall(VecRestoreArrayRead(xx, &x));
2503   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2504   PetscCall(PetscLogFlops(98.0 * a->nz));
2505   PetscFunctionReturn(0);
2506 }
2507 
2508 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2509 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) {
2510   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2511   PetscScalar       *z = NULL, *work, *workt, *zarray;
2512   const PetscScalar *x, *xb;
2513   const MatScalar   *v;
2514   PetscInt           mbs, i, j, n;
2515   PetscInt           k;
2516   PetscBool          usecprow = a->compressedrow.use;
2517   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2518 
2519   __m256d a0, a1, a2, a3, a4, a5;
2520   __m256d w0, w1, w2, w3;
2521   __m256d z0, z1, z2;
2522   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2523 
2524   PetscFunctionBegin;
2525   PetscCall(VecCopy(yy, zz));
2526   PetscCall(VecGetArrayRead(xx, &x));
2527   PetscCall(VecGetArray(zz, &zarray));
2528 
2529   idx = a->j;
2530   v   = a->a;
2531   if (usecprow) {
2532     mbs  = a->compressedrow.nrows;
2533     ii   = a->compressedrow.i;
2534     ridx = a->compressedrow.rindex;
2535   } else {
2536     mbs = a->mbs;
2537     ii  = a->i;
2538     z   = zarray;
2539   }
2540 
2541   if (!a->mult_work) {
2542     k = PetscMax(A->rmap->n, A->cmap->n);
2543     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2544   }
2545 
2546   work = a->mult_work;
2547   for (i = 0; i < mbs; i++) {
2548     n = ii[1] - ii[0];
2549     ii++;
2550     workt = work;
2551     for (j = 0; j < n; j++) {
2552       xb = x + bs * (*idx++);
2553       for (k = 0; k < bs; k++) workt[k] = xb[k];
2554       workt += bs;
2555     }
2556     if (usecprow) z = zarray + bs * ridx[i];
2557 
2558     z0 = _mm256_loadu_pd(&z[0]);
2559     z1 = _mm256_loadu_pd(&z[4]);
2560     z2 = _mm256_set1_pd(z[8]);
2561 
2562     for (j = 0; j < n; j++) {
2563       /* first column of a */
2564       w0 = _mm256_set1_pd(work[j * 9]);
2565       a0 = _mm256_loadu_pd(&v[j * 81]);
2566       z0 = _mm256_fmadd_pd(a0, w0, z0);
2567       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2568       z1 = _mm256_fmadd_pd(a1, w0, z1);
2569       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2570       z2 = _mm256_fmadd_pd(a2, w0, z2);
2571 
2572       /* second column of a */
2573       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2574       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2575       z0 = _mm256_fmadd_pd(a0, w1, z0);
2576       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2577       z1 = _mm256_fmadd_pd(a1, w1, z1);
2578       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2579       z2 = _mm256_fmadd_pd(a2, w1, z2);
2580 
2581       /* third column of a */
2582       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2583       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2584       z0 = _mm256_fmadd_pd(a3, w2, z0);
2585       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2586       z1 = _mm256_fmadd_pd(a4, w2, z1);
2587       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2588       z2 = _mm256_fmadd_pd(a5, w2, z2);
2589 
2590       /* fourth column of a */
2591       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2592       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2593       z0 = _mm256_fmadd_pd(a0, w3, z0);
2594       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2595       z1 = _mm256_fmadd_pd(a1, w3, z1);
2596       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2597       z2 = _mm256_fmadd_pd(a2, w3, z2);
2598 
2599       /* fifth column of a */
2600       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2601       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2602       z0 = _mm256_fmadd_pd(a3, w0, z0);
2603       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2604       z1 = _mm256_fmadd_pd(a4, w0, z1);
2605       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2606       z2 = _mm256_fmadd_pd(a5, w0, z2);
2607 
2608       /* sixth column of a */
2609       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2610       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2611       z0 = _mm256_fmadd_pd(a0, w1, z0);
2612       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2613       z1 = _mm256_fmadd_pd(a1, w1, z1);
2614       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2615       z2 = _mm256_fmadd_pd(a2, w1, z2);
2616 
2617       /* seventh column of a */
2618       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2619       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2620       z0 = _mm256_fmadd_pd(a0, w2, z0);
2621       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2622       z1 = _mm256_fmadd_pd(a1, w2, z1);
2623       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2624       z2 = _mm256_fmadd_pd(a2, w2, z2);
2625 
2626       /* eighth column of a */
2627       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2628       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2629       z0 = _mm256_fmadd_pd(a3, w3, z0);
2630       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2631       z1 = _mm256_fmadd_pd(a4, w3, z1);
2632       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2633       z2 = _mm256_fmadd_pd(a5, w3, z2);
2634 
2635       /* ninth column of a */
2636       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2637       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2638       z0 = _mm256_fmadd_pd(a0, w0, z0);
2639       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2640       z1 = _mm256_fmadd_pd(a1, w0, z1);
2641       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2642       z2 = _mm256_fmadd_pd(a2, w0, z2);
2643     }
2644 
2645     _mm256_storeu_pd(&z[0], z0);
2646     _mm256_storeu_pd(&z[4], z1);
2647     _mm256_maskstore_pd(&z[8], mask1, z2);
2648 
2649     v += n * bs2;
2650     if (!usecprow) z += bs;
2651   }
2652   PetscCall(VecRestoreArrayRead(xx, &x));
2653   PetscCall(VecRestoreArray(zz, &zarray));
2654   PetscCall(PetscLogFlops(162.0 * a->nz));
2655   PetscFunctionReturn(0);
2656 }
2657 #endif
2658 
2659 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) {
2660   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2661   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2662   const PetscScalar *x, *xb;
2663   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2664   const MatScalar   *v;
2665   PetscInt           mbs = a->mbs, i, j, n;
2666   const PetscInt    *idx, *ii, *ridx = NULL;
2667   PetscBool          usecprow = a->compressedrow.use;
2668 
2669   PetscFunctionBegin;
2670   PetscCall(VecGetArrayRead(xx, &x));
2671   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2672 
2673   idx = a->j;
2674   v   = a->a;
2675   if (usecprow) {
2676     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2677     mbs  = a->compressedrow.nrows;
2678     ii   = a->compressedrow.i;
2679     ridx = a->compressedrow.rindex;
2680   } else {
2681     ii = a->i;
2682     y  = yarray;
2683     z  = zarray;
2684   }
2685 
2686   for (i = 0; i < mbs; i++) {
2687     n = ii[1] - ii[0];
2688     ii++;
2689     if (usecprow) {
2690       z = zarray + 11 * ridx[i];
2691       y = yarray + 11 * ridx[i];
2692     }
2693     sum1  = y[0];
2694     sum2  = y[1];
2695     sum3  = y[2];
2696     sum4  = y[3];
2697     sum5  = y[4];
2698     sum6  = y[5];
2699     sum7  = y[6];
2700     sum8  = y[7];
2701     sum9  = y[8];
2702     sum10 = y[9];
2703     sum11 = y[10];
2704     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2705     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2706     for (j = 0; j < n; j++) {
2707       xb  = x + 11 * (*idx++);
2708       x1  = xb[0];
2709       x2  = xb[1];
2710       x3  = xb[2];
2711       x4  = xb[3];
2712       x5  = xb[4];
2713       x6  = xb[5];
2714       x7  = xb[6];
2715       x8  = xb[7];
2716       x9  = xb[8];
2717       x10 = xb[9];
2718       x11 = xb[10];
2719       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2720       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2721       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2722       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2723       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2724       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2725       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2726       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2727       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2728       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2729       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2730       v += 121;
2731     }
2732     z[0]  = sum1;
2733     z[1]  = sum2;
2734     z[2]  = sum3;
2735     z[3]  = sum4;
2736     z[4]  = sum5;
2737     z[5]  = sum6;
2738     z[6]  = sum7;
2739     z[7]  = sum8;
2740     z[8]  = sum9;
2741     z[9]  = sum10;
2742     z[10] = sum11;
2743     if (!usecprow) {
2744       z += 11;
2745       y += 11;
2746     }
2747   }
2748   PetscCall(VecRestoreArrayRead(xx, &x));
2749   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2750   PetscCall(PetscLogFlops(242.0 * a->nz));
2751   PetscFunctionReturn(0);
2752 }
2753 
2754 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) {
2755   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2756   PetscScalar       *z = NULL, *work, *workt, *zarray;
2757   const PetscScalar *x, *xb;
2758   const MatScalar   *v;
2759   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2760   PetscInt           ncols, k;
2761   const PetscInt    *ridx     = NULL, *idx, *ii;
2762   PetscBool          usecprow = a->compressedrow.use;
2763 
2764   PetscFunctionBegin;
2765   PetscCall(VecCopy(yy, zz));
2766   PetscCall(VecGetArrayRead(xx, &x));
2767   PetscCall(VecGetArray(zz, &zarray));
2768 
2769   idx = a->j;
2770   v   = a->a;
2771   if (usecprow) {
2772     mbs  = a->compressedrow.nrows;
2773     ii   = a->compressedrow.i;
2774     ridx = a->compressedrow.rindex;
2775   } else {
2776     mbs = a->mbs;
2777     ii  = a->i;
2778     z   = zarray;
2779   }
2780 
2781   if (!a->mult_work) {
2782     k = PetscMax(A->rmap->n, A->cmap->n);
2783     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2784   }
2785   work = a->mult_work;
2786   for (i = 0; i < mbs; i++) {
2787     n = ii[1] - ii[0];
2788     ii++;
2789     ncols = n * bs;
2790     workt = work;
2791     for (j = 0; j < n; j++) {
2792       xb = x + bs * (*idx++);
2793       for (k = 0; k < bs; k++) workt[k] = xb[k];
2794       workt += bs;
2795     }
2796     if (usecprow) z = zarray + bs * ridx[i];
2797     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2798     v += n * bs2;
2799     if (!usecprow) z += bs;
2800   }
2801   PetscCall(VecRestoreArrayRead(xx, &x));
2802   PetscCall(VecRestoreArray(zz, &zarray));
2803   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2804   PetscFunctionReturn(0);
2805 }
2806 
2807 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) {
2808   PetscScalar zero = 0.0;
2809 
2810   PetscFunctionBegin;
2811   PetscCall(VecSet(zz, zero));
2812   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2813   PetscFunctionReturn(0);
2814 }
2815 
2816 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) {
2817   PetscScalar zero = 0.0;
2818 
2819   PetscFunctionBegin;
2820   PetscCall(VecSet(zz, zero));
2821   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2822   PetscFunctionReturn(0);
2823 }
2824 
2825 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) {
2826   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2827   PetscScalar       *z, x1, x2, x3, x4, x5;
2828   const PetscScalar *x, *xb = NULL;
2829   const MatScalar   *v;
2830   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2831   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2832   Mat_CompressedRow  cprow    = a->compressedrow;
2833   PetscBool          usecprow = cprow.use;
2834 
2835   PetscFunctionBegin;
2836   if (yy != zz) PetscCall(VecCopy(yy, zz));
2837   PetscCall(VecGetArrayRead(xx, &x));
2838   PetscCall(VecGetArray(zz, &z));
2839 
2840   idx = a->j;
2841   v   = a->a;
2842   if (usecprow) {
2843     mbs  = cprow.nrows;
2844     ii   = cprow.i;
2845     ridx = cprow.rindex;
2846   } else {
2847     mbs = a->mbs;
2848     ii  = a->i;
2849     xb  = x;
2850   }
2851 
2852   switch (bs) {
2853   case 1:
2854     for (i = 0; i < mbs; i++) {
2855       if (usecprow) xb = x + ridx[i];
2856       x1 = xb[0];
2857       ib = idx + ii[0];
2858       n  = ii[1] - ii[0];
2859       ii++;
2860       for (j = 0; j < n; j++) {
2861         rval = ib[j];
2862         z[rval] += PetscConj(*v) * x1;
2863         v++;
2864       }
2865       if (!usecprow) xb++;
2866     }
2867     break;
2868   case 2:
2869     for (i = 0; i < mbs; i++) {
2870       if (usecprow) xb = x + 2 * ridx[i];
2871       x1 = xb[0];
2872       x2 = xb[1];
2873       ib = idx + ii[0];
2874       n  = ii[1] - ii[0];
2875       ii++;
2876       for (j = 0; j < n; j++) {
2877         rval = ib[j] * 2;
2878         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2879         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2880         v += 4;
2881       }
2882       if (!usecprow) xb += 2;
2883     }
2884     break;
2885   case 3:
2886     for (i = 0; i < mbs; i++) {
2887       if (usecprow) xb = x + 3 * ridx[i];
2888       x1 = xb[0];
2889       x2 = xb[1];
2890       x3 = xb[2];
2891       ib = idx + ii[0];
2892       n  = ii[1] - ii[0];
2893       ii++;
2894       for (j = 0; j < n; j++) {
2895         rval = ib[j] * 3;
2896         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2897         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2898         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2899         v += 9;
2900       }
2901       if (!usecprow) xb += 3;
2902     }
2903     break;
2904   case 4:
2905     for (i = 0; i < mbs; i++) {
2906       if (usecprow) xb = x + 4 * ridx[i];
2907       x1 = xb[0];
2908       x2 = xb[1];
2909       x3 = xb[2];
2910       x4 = xb[3];
2911       ib = idx + ii[0];
2912       n  = ii[1] - ii[0];
2913       ii++;
2914       for (j = 0; j < n; j++) {
2915         rval = ib[j] * 4;
2916         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2917         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2918         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2919         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2920         v += 16;
2921       }
2922       if (!usecprow) xb += 4;
2923     }
2924     break;
2925   case 5:
2926     for (i = 0; i < mbs; i++) {
2927       if (usecprow) xb = x + 5 * ridx[i];
2928       x1 = xb[0];
2929       x2 = xb[1];
2930       x3 = xb[2];
2931       x4 = xb[3];
2932       x5 = xb[4];
2933       ib = idx + ii[0];
2934       n  = ii[1] - ii[0];
2935       ii++;
2936       for (j = 0; j < n; j++) {
2937         rval = ib[j] * 5;
2938         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2939         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2940         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2941         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2942         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2943         v += 25;
2944       }
2945       if (!usecprow) xb += 5;
2946     }
2947     break;
2948   default: /* block sizes larger than 5 by 5 are handled by BLAS */ SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2949 #if 0
2950     {
2951       PetscInt          ncols,k,bs2=a->bs2;
2952       PetscScalar       *work,*workt,zb;
2953       const PetscScalar *xtmp;
2954       if (!a->mult_work) {
2955         k    = PetscMax(A->rmap->n,A->cmap->n);
2956         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2957       }
2958       work = a->mult_work;
2959       xtmp = x;
2960       for (i=0; i<mbs; i++) {
2961         n     = ii[1] - ii[0]; ii++;
2962         ncols = n*bs;
2963         PetscCall(PetscArrayzero(work,ncols));
2964         if (usecprow) xtmp = x + bs*ridx[i];
2965         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2966         v += n*bs2;
2967         if (!usecprow) xtmp += bs;
2968         workt = work;
2969         for (j=0; j<n; j++) {
2970           zb = z + bs*(*idx++);
2971           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2972           workt += bs;
2973         }
2974       }
2975     }
2976 #endif
2977   }
2978   PetscCall(VecRestoreArrayRead(xx, &x));
2979   PetscCall(VecRestoreArray(zz, &z));
2980   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2981   PetscFunctionReturn(0);
2982 }
2983 
2984 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) {
2985   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2986   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
2987   const PetscScalar *x, *xb = NULL;
2988   const MatScalar   *v;
2989   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2990   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2991   Mat_CompressedRow  cprow    = a->compressedrow;
2992   PetscBool          usecprow = cprow.use;
2993 
2994   PetscFunctionBegin;
2995   if (yy != zz) PetscCall(VecCopy(yy, zz));
2996   PetscCall(VecGetArrayRead(xx, &x));
2997   PetscCall(VecGetArray(zz, &z));
2998 
2999   idx = a->j;
3000   v   = a->a;
3001   if (usecprow) {
3002     mbs  = cprow.nrows;
3003     ii   = cprow.i;
3004     ridx = cprow.rindex;
3005   } else {
3006     mbs = a->mbs;
3007     ii  = a->i;
3008     xb  = x;
3009   }
3010 
3011   switch (bs) {
3012   case 1:
3013     for (i = 0; i < mbs; i++) {
3014       if (usecprow) xb = x + ridx[i];
3015       x1 = xb[0];
3016       ib = idx + ii[0];
3017       n  = ii[1] - ii[0];
3018       ii++;
3019       for (j = 0; j < n; j++) {
3020         rval = ib[j];
3021         z[rval] += *v * x1;
3022         v++;
3023       }
3024       if (!usecprow) xb++;
3025     }
3026     break;
3027   case 2:
3028     for (i = 0; i < mbs; i++) {
3029       if (usecprow) xb = x + 2 * ridx[i];
3030       x1 = xb[0];
3031       x2 = xb[1];
3032       ib = idx + ii[0];
3033       n  = ii[1] - ii[0];
3034       ii++;
3035       for (j = 0; j < n; j++) {
3036         rval = ib[j] * 2;
3037         z[rval++] += v[0] * x1 + v[1] * x2;
3038         z[rval++] += v[2] * x1 + v[3] * x2;
3039         v += 4;
3040       }
3041       if (!usecprow) xb += 2;
3042     }
3043     break;
3044   case 3:
3045     for (i = 0; i < mbs; i++) {
3046       if (usecprow) xb = x + 3 * ridx[i];
3047       x1 = xb[0];
3048       x2 = xb[1];
3049       x3 = xb[2];
3050       ib = idx + ii[0];
3051       n  = ii[1] - ii[0];
3052       ii++;
3053       for (j = 0; j < n; j++) {
3054         rval = ib[j] * 3;
3055         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3056         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3057         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3058         v += 9;
3059       }
3060       if (!usecprow) xb += 3;
3061     }
3062     break;
3063   case 4:
3064     for (i = 0; i < mbs; i++) {
3065       if (usecprow) xb = x + 4 * ridx[i];
3066       x1 = xb[0];
3067       x2 = xb[1];
3068       x3 = xb[2];
3069       x4 = xb[3];
3070       ib = idx + ii[0];
3071       n  = ii[1] - ii[0];
3072       ii++;
3073       for (j = 0; j < n; j++) {
3074         rval = ib[j] * 4;
3075         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3076         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3077         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3078         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3079         v += 16;
3080       }
3081       if (!usecprow) xb += 4;
3082     }
3083     break;
3084   case 5:
3085     for (i = 0; i < mbs; i++) {
3086       if (usecprow) xb = x + 5 * ridx[i];
3087       x1 = xb[0];
3088       x2 = xb[1];
3089       x3 = xb[2];
3090       x4 = xb[3];
3091       x5 = xb[4];
3092       ib = idx + ii[0];
3093       n  = ii[1] - ii[0];
3094       ii++;
3095       for (j = 0; j < n; j++) {
3096         rval = ib[j] * 5;
3097         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3098         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3099         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3100         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3101         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3102         v += 25;
3103       }
3104       if (!usecprow) xb += 5;
3105     }
3106     break;
3107   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3108     PetscInt           ncols, k;
3109     PetscScalar       *work, *workt;
3110     const PetscScalar *xtmp;
3111     if (!a->mult_work) {
3112       k = PetscMax(A->rmap->n, A->cmap->n);
3113       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3114     }
3115     work = a->mult_work;
3116     xtmp = x;
3117     for (i = 0; i < mbs; i++) {
3118       n = ii[1] - ii[0];
3119       ii++;
3120       ncols = n * bs;
3121       PetscCall(PetscArrayzero(work, ncols));
3122       if (usecprow) xtmp = x + bs * ridx[i];
3123       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3124       v += n * bs2;
3125       if (!usecprow) xtmp += bs;
3126       workt = work;
3127       for (j = 0; j < n; j++) {
3128         zb = z + bs * (*idx++);
3129         for (k = 0; k < bs; k++) zb[k] += workt[k];
3130         workt += bs;
3131       }
3132     }
3133   }
3134   }
3135   PetscCall(VecRestoreArrayRead(xx, &x));
3136   PetscCall(VecRestoreArray(zz, &z));
3137   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3138   PetscFunctionReturn(0);
3139 }
3140 
3141 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) {
3142   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3143   PetscInt     totalnz = a->bs2 * a->nz;
3144   PetscScalar  oalpha  = alpha;
3145   PetscBLASInt one     = 1, tnz;
3146 
3147   PetscFunctionBegin;
3148   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3149   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3150   PetscCall(PetscLogFlops(totalnz));
3151   PetscFunctionReturn(0);
3152 }
3153 
3154 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) {
3155   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3156   MatScalar   *v   = a->a;
3157   PetscReal    sum = 0.0;
3158   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3159 
3160   PetscFunctionBegin;
3161   if (type == NORM_FROBENIUS) {
3162 #if defined(PETSC_USE_REAL___FP16)
3163     PetscBLASInt one = 1, cnt = bs2 * nz;
3164     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3165 #else
3166     for (i = 0; i < bs2 * nz; i++) {
3167       sum += PetscRealPart(PetscConj(*v) * (*v));
3168       v++;
3169     }
3170 #endif
3171     *norm = PetscSqrtReal(sum);
3172     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3173   } else if (type == NORM_1) { /* maximum column sum */
3174     PetscReal *tmp;
3175     PetscInt  *bcol = a->j;
3176     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3177     for (i = 0; i < nz; i++) {
3178       for (j = 0; j < bs; j++) {
3179         k1 = bs * (*bcol) + j; /* column index */
3180         for (k = 0; k < bs; k++) {
3181           tmp[k1] += PetscAbsScalar(*v);
3182           v++;
3183         }
3184       }
3185       bcol++;
3186     }
3187     *norm = 0.0;
3188     for (j = 0; j < A->cmap->n; j++) {
3189       if (tmp[j] > *norm) *norm = tmp[j];
3190     }
3191     PetscCall(PetscFree(tmp));
3192     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3193   } else if (type == NORM_INFINITY) { /* maximum row sum */
3194     *norm = 0.0;
3195     for (k = 0; k < bs; k++) {
3196       for (j = 0; j < a->mbs; j++) {
3197         v   = a->a + bs2 * a->i[j] + k;
3198         sum = 0.0;
3199         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3200           for (k1 = 0; k1 < bs; k1++) {
3201             sum += PetscAbsScalar(*v);
3202             v += bs;
3203           }
3204         }
3205         if (sum > *norm) *norm = sum;
3206       }
3207     }
3208     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3209   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3210   PetscFunctionReturn(0);
3211 }
3212 
3213 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) {
3214   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
3215 
3216   PetscFunctionBegin;
3217   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3218   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3219     *flg = PETSC_FALSE;
3220     PetscFunctionReturn(0);
3221   }
3222 
3223   /* if the a->i are the same */
3224   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3225   if (!*flg) PetscFunctionReturn(0);
3226 
3227   /* if a->j are the same */
3228   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3229   if (!*flg) PetscFunctionReturn(0);
3230 
3231   /* if a->a are the same */
3232   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3233   PetscFunctionReturn(0);
3234 }
3235 
3236 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) {
3237   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3238   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3239   PetscScalar *x, zero = 0.0;
3240   MatScalar   *aa, *aa_j;
3241 
3242   PetscFunctionBegin;
3243   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3244   bs   = A->rmap->bs;
3245   aa   = a->a;
3246   ai   = a->i;
3247   aj   = a->j;
3248   ambs = a->mbs;
3249   bs2  = a->bs2;
3250 
3251   PetscCall(VecSet(v, zero));
3252   PetscCall(VecGetArray(v, &x));
3253   PetscCall(VecGetLocalSize(v, &n));
3254   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3255   for (i = 0; i < ambs; i++) {
3256     for (j = ai[i]; j < ai[i + 1]; j++) {
3257       if (aj[j] == i) {
3258         row  = i * bs;
3259         aa_j = aa + j * bs2;
3260         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3261         break;
3262       }
3263     }
3264   }
3265   PetscCall(VecRestoreArray(v, &x));
3266   PetscFunctionReturn(0);
3267 }
3268 
3269 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) {
3270   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3271   const PetscScalar *l, *r, *li, *ri;
3272   PetscScalar        x;
3273   MatScalar         *aa, *v;
3274   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3275   const PetscInt    *ai, *aj;
3276 
3277   PetscFunctionBegin;
3278   ai  = a->i;
3279   aj  = a->j;
3280   aa  = a->a;
3281   m   = A->rmap->n;
3282   n   = A->cmap->n;
3283   bs  = A->rmap->bs;
3284   mbs = a->mbs;
3285   bs2 = a->bs2;
3286   if (ll) {
3287     PetscCall(VecGetArrayRead(ll, &l));
3288     PetscCall(VecGetLocalSize(ll, &lm));
3289     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3290     for (i = 0; i < mbs; i++) { /* for each block row */
3291       M  = ai[i + 1] - ai[i];
3292       li = l + i * bs;
3293       v  = aa + bs2 * ai[i];
3294       for (j = 0; j < M; j++) { /* for each block */
3295         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3296       }
3297     }
3298     PetscCall(VecRestoreArrayRead(ll, &l));
3299     PetscCall(PetscLogFlops(a->nz));
3300   }
3301 
3302   if (rr) {
3303     PetscCall(VecGetArrayRead(rr, &r));
3304     PetscCall(VecGetLocalSize(rr, &rn));
3305     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3306     for (i = 0; i < mbs; i++) { /* for each block row */
3307       iai = ai[i];
3308       M   = ai[i + 1] - iai;
3309       v   = aa + bs2 * iai;
3310       for (j = 0; j < M; j++) { /* for each block */
3311         ri = r + bs * aj[iai + j];
3312         for (k = 0; k < bs; k++) {
3313           x = ri[k];
3314           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3315           v += bs;
3316         }
3317       }
3318     }
3319     PetscCall(VecRestoreArrayRead(rr, &r));
3320     PetscCall(PetscLogFlops(a->nz));
3321   }
3322   PetscFunctionReturn(0);
3323 }
3324 
3325 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) {
3326   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3327 
3328   PetscFunctionBegin;
3329   info->block_size   = a->bs2;
3330   info->nz_allocated = a->bs2 * a->maxnz;
3331   info->nz_used      = a->bs2 * a->nz;
3332   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3333   info->assemblies   = A->num_ass;
3334   info->mallocs      = A->info.mallocs;
3335   info->memory       = 0; /* REVIEW ME */
3336   if (A->factortype) {
3337     info->fill_ratio_given  = A->info.fill_ratio_given;
3338     info->fill_ratio_needed = A->info.fill_ratio_needed;
3339     info->factor_mallocs    = A->info.factor_mallocs;
3340   } else {
3341     info->fill_ratio_given  = 0;
3342     info->fill_ratio_needed = 0;
3343     info->factor_mallocs    = 0;
3344   }
3345   PetscFunctionReturn(0);
3346 }
3347 
3348 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) {
3349   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3350 
3351   PetscFunctionBegin;
3352   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3353   PetscFunctionReturn(0);
3354 }
3355 
3356 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) {
3357   PetscFunctionBegin;
3358   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3359   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3360   PetscFunctionReturn(0);
3361 }
3362 
3363 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
3364   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3365   PetscScalar       *z = NULL, sum1;
3366   const PetscScalar *xb;
3367   PetscScalar        x1;
3368   const MatScalar   *v, *vv;
3369   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3370   PetscBool          usecprow = a->compressedrow.use;
3371 
3372   PetscFunctionBegin;
3373   idx = a->j;
3374   v   = a->a;
3375   if (usecprow) {
3376     mbs  = a->compressedrow.nrows;
3377     ii   = a->compressedrow.i;
3378     ridx = a->compressedrow.rindex;
3379   } else {
3380     mbs = a->mbs;
3381     ii  = a->i;
3382     z   = c;
3383   }
3384 
3385   for (i = 0; i < mbs; i++) {
3386     n = ii[1] - ii[0];
3387     ii++;
3388     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3389     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3390     if (usecprow) z = c + ridx[i];
3391     jj = idx;
3392     vv = v;
3393     for (k = 0; k < cn; k++) {
3394       idx  = jj;
3395       v    = vv;
3396       sum1 = 0.0;
3397       for (j = 0; j < n; j++) {
3398         xb = b + (*idx++);
3399         x1 = xb[0 + k * bm];
3400         sum1 += v[0] * x1;
3401         v += 1;
3402       }
3403       z[0 + k * cm] = sum1;
3404     }
3405     if (!usecprow) z += 1;
3406   }
3407   PetscFunctionReturn(0);
3408 }
3409 
3410 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
3411   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3412   PetscScalar       *z = NULL, sum1, sum2;
3413   const PetscScalar *xb;
3414   PetscScalar        x1, x2;
3415   const MatScalar   *v, *vv;
3416   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3417   PetscBool          usecprow = a->compressedrow.use;
3418 
3419   PetscFunctionBegin;
3420   idx = a->j;
3421   v   = a->a;
3422   if (usecprow) {
3423     mbs  = a->compressedrow.nrows;
3424     ii   = a->compressedrow.i;
3425     ridx = a->compressedrow.rindex;
3426   } else {
3427     mbs = a->mbs;
3428     ii  = a->i;
3429     z   = c;
3430   }
3431 
3432   for (i = 0; i < mbs; i++) {
3433     n = ii[1] - ii[0];
3434     ii++;
3435     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3436     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3437     if (usecprow) z = c + 2 * ridx[i];
3438     jj = idx;
3439     vv = v;
3440     for (k = 0; k < cn; k++) {
3441       idx  = jj;
3442       v    = vv;
3443       sum1 = 0.0;
3444       sum2 = 0.0;
3445       for (j = 0; j < n; j++) {
3446         xb = b + 2 * (*idx++);
3447         x1 = xb[0 + k * bm];
3448         x2 = xb[1 + k * bm];
3449         sum1 += v[0] * x1 + v[2] * x2;
3450         sum2 += v[1] * x1 + v[3] * x2;
3451         v += 4;
3452       }
3453       z[0 + k * cm] = sum1;
3454       z[1 + k * cm] = sum2;
3455     }
3456     if (!usecprow) z += 2;
3457   }
3458   PetscFunctionReturn(0);
3459 }
3460 
3461 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
3462   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3463   PetscScalar       *z = NULL, sum1, sum2, sum3;
3464   const PetscScalar *xb;
3465   PetscScalar        x1, x2, x3;
3466   const MatScalar   *v, *vv;
3467   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3468   PetscBool          usecprow = a->compressedrow.use;
3469 
3470   PetscFunctionBegin;
3471   idx = a->j;
3472   v   = a->a;
3473   if (usecprow) {
3474     mbs  = a->compressedrow.nrows;
3475     ii   = a->compressedrow.i;
3476     ridx = a->compressedrow.rindex;
3477   } else {
3478     mbs = a->mbs;
3479     ii  = a->i;
3480     z   = c;
3481   }
3482 
3483   for (i = 0; i < mbs; i++) {
3484     n = ii[1] - ii[0];
3485     ii++;
3486     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3487     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3488     if (usecprow) z = c + 3 * ridx[i];
3489     jj = idx;
3490     vv = v;
3491     for (k = 0; k < cn; k++) {
3492       idx  = jj;
3493       v    = vv;
3494       sum1 = 0.0;
3495       sum2 = 0.0;
3496       sum3 = 0.0;
3497       for (j = 0; j < n; j++) {
3498         xb = b + 3 * (*idx++);
3499         x1 = xb[0 + k * bm];
3500         x2 = xb[1 + k * bm];
3501         x3 = xb[2 + k * bm];
3502         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3503         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3504         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3505         v += 9;
3506       }
3507       z[0 + k * cm] = sum1;
3508       z[1 + k * cm] = sum2;
3509       z[2 + k * cm] = sum3;
3510     }
3511     if (!usecprow) z += 3;
3512   }
3513   PetscFunctionReturn(0);
3514 }
3515 
3516 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
3517   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3518   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3519   const PetscScalar *xb;
3520   PetscScalar        x1, x2, x3, x4;
3521   const MatScalar   *v, *vv;
3522   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3523   PetscBool          usecprow = a->compressedrow.use;
3524 
3525   PetscFunctionBegin;
3526   idx = a->j;
3527   v   = a->a;
3528   if (usecprow) {
3529     mbs  = a->compressedrow.nrows;
3530     ii   = a->compressedrow.i;
3531     ridx = a->compressedrow.rindex;
3532   } else {
3533     mbs = a->mbs;
3534     ii  = a->i;
3535     z   = c;
3536   }
3537 
3538   for (i = 0; i < mbs; i++) {
3539     n = ii[1] - ii[0];
3540     ii++;
3541     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3542     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3543     if (usecprow) z = c + 4 * ridx[i];
3544     jj = idx;
3545     vv = v;
3546     for (k = 0; k < cn; k++) {
3547       idx  = jj;
3548       v    = vv;
3549       sum1 = 0.0;
3550       sum2 = 0.0;
3551       sum3 = 0.0;
3552       sum4 = 0.0;
3553       for (j = 0; j < n; j++) {
3554         xb = b + 4 * (*idx++);
3555         x1 = xb[0 + k * bm];
3556         x2 = xb[1 + k * bm];
3557         x3 = xb[2 + k * bm];
3558         x4 = xb[3 + k * bm];
3559         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3560         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3561         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3562         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3563         v += 16;
3564       }
3565       z[0 + k * cm] = sum1;
3566       z[1 + k * cm] = sum2;
3567       z[2 + k * cm] = sum3;
3568       z[3 + k * cm] = sum4;
3569     }
3570     if (!usecprow) z += 4;
3571   }
3572   PetscFunctionReturn(0);
3573 }
3574 
3575 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
3576   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3577   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3578   const PetscScalar *xb;
3579   PetscScalar        x1, x2, x3, x4, x5;
3580   const MatScalar   *v, *vv;
3581   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3582   PetscBool          usecprow = a->compressedrow.use;
3583 
3584   PetscFunctionBegin;
3585   idx = a->j;
3586   v   = a->a;
3587   if (usecprow) {
3588     mbs  = a->compressedrow.nrows;
3589     ii   = a->compressedrow.i;
3590     ridx = a->compressedrow.rindex;
3591   } else {
3592     mbs = a->mbs;
3593     ii  = a->i;
3594     z   = c;
3595   }
3596 
3597   for (i = 0; i < mbs; i++) {
3598     n = ii[1] - ii[0];
3599     ii++;
3600     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3601     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3602     if (usecprow) z = c + 5 * ridx[i];
3603     jj = idx;
3604     vv = v;
3605     for (k = 0; k < cn; k++) {
3606       idx  = jj;
3607       v    = vv;
3608       sum1 = 0.0;
3609       sum2 = 0.0;
3610       sum3 = 0.0;
3611       sum4 = 0.0;
3612       sum5 = 0.0;
3613       for (j = 0; j < n; j++) {
3614         xb = b + 5 * (*idx++);
3615         x1 = xb[0 + k * bm];
3616         x2 = xb[1 + k * bm];
3617         x3 = xb[2 + k * bm];
3618         x4 = xb[3 + k * bm];
3619         x5 = xb[4 + k * bm];
3620         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3621         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3622         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3623         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3624         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3625         v += 25;
3626       }
3627       z[0 + k * cm] = sum1;
3628       z[1 + k * cm] = sum2;
3629       z[2 + k * cm] = sum3;
3630       z[3 + k * cm] = sum4;
3631       z[4 + k * cm] = sum5;
3632     }
3633     if (!usecprow) z += 5;
3634   }
3635   PetscFunctionReturn(0);
3636 }
3637 
3638 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) {
3639   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3640   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3641   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3642   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3643   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3644   PetscBLASInt     bbs, bcn, bbm, bcm;
3645   PetscScalar     *z = NULL;
3646   PetscScalar     *c, *b;
3647   const MatScalar *v;
3648   const PetscInt  *idx, *ii, *ridx = NULL;
3649   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3650   PetscBool        usecprow = a->compressedrow.use;
3651 
3652   PetscFunctionBegin;
3653   if (!cm || !cn) PetscFunctionReturn(0);
3654   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3655   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3656   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3657   b = bd->v;
3658   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3659   PetscCall(MatDenseGetArray(C, &c));
3660   switch (bs) {
3661   case 1: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); break;
3662   case 2: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); break;
3663   case 3: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); break;
3664   case 4: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); break;
3665   case 5: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); break;
3666   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3667     PetscCall(PetscBLASIntCast(bs, &bbs));
3668     PetscCall(PetscBLASIntCast(cn, &bcn));
3669     PetscCall(PetscBLASIntCast(bm, &bbm));
3670     PetscCall(PetscBLASIntCast(cm, &bcm));
3671     idx = a->j;
3672     v   = a->a;
3673     if (usecprow) {
3674       mbs  = a->compressedrow.nrows;
3675       ii   = a->compressedrow.i;
3676       ridx = a->compressedrow.rindex;
3677     } else {
3678       mbs = a->mbs;
3679       ii  = a->i;
3680       z   = c;
3681     }
3682     for (i = 0; i < mbs; i++) {
3683       n = ii[1] - ii[0];
3684       ii++;
3685       if (usecprow) z = c + bs * ridx[i];
3686       if (n) {
3687         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3688         v += bs2;
3689       }
3690       for (j = 1; j < n; j++) {
3691         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3692         v += bs2;
3693       }
3694       if (!usecprow) z += bs;
3695     }
3696   }
3697   PetscCall(MatDenseRestoreArray(C, &c));
3698   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3699   PetscFunctionReturn(0);
3700 }
3701