xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision d2522c19e8fa9bca20aaca277941d9a63e71db6a)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8 #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) {
12   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
13   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
14   const PetscInt *idx;
15   PetscInt        start, end, *ai, *aj, bs, *nidx2;
16   PetscBT         table;
17 
18   PetscFunctionBegin;
19   m  = a->mbs;
20   ai = a->i;
21   aj = a->j;
22   bs = A->rmap->bs;
23 
24   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
25 
26   PetscCall(PetscBTCreate(m, &table));
27   PetscCall(PetscMalloc1(m + 1, &nidx));
28   PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2));
29 
30   for (i = 0; i < is_max; i++) {
31     /* Initialise the two local arrays */
32     isz = 0;
33     PetscCall(PetscBTMemzero(m, table));
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     PetscCall(ISGetIndices(is[i], &idx));
37     PetscCall(ISGetLocalSize(is[i], &n));
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j = 0; j < n; ++j) {
41       ival = idx[j] / bs; /* convert the indices into block indices */
42       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
43       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44     }
45     PetscCall(ISRestoreIndices(is[i], &idx));
46     PetscCall(ISDestroy(&is[i]));
47 
48     k = 0;
49     for (j = 0; j < ov; j++) { /* for each overlap*/
50       n = isz;
51       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row + 1];
55         for (l = start; l < end; l++) {
56           val = aj[l];
57           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58         }
59       }
60     }
61     /* expand the Index Set */
62     for (j = 0; j < isz; j++) {
63       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
64     }
65     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i));
66   }
67   PetscCall(PetscBTDestroy(&table));
68   PetscCall(PetscFree(nidx));
69   PetscCall(PetscFree(nidx2));
70   PetscFunctionReturn(0);
71 }
72 
73 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
74   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
75   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
76   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
77   const PetscInt *irow, *icol;
78   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
79   PetscInt       *aj = a->j, *ai = a->i;
80   MatScalar      *mat_a;
81   Mat             C;
82   PetscBool       flag;
83 
84   PetscFunctionBegin;
85   PetscCall(ISGetIndices(isrow, &irow));
86   PetscCall(ISGetIndices(iscol, &icol));
87   PetscCall(ISGetLocalSize(isrow, &nrows));
88   PetscCall(ISGetLocalSize(iscol, &ncols));
89 
90   PetscCall(PetscCalloc1(1 + oldcols, &smap));
91   ssmap = smap;
92   PetscCall(PetscMalloc1(1 + nrows, &lens));
93   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
94   /* determine lens of each row */
95   for (i = 0; i < nrows; i++) {
96     kstart  = ai[irow[i]];
97     kend    = kstart + a->ilen[irow[i]];
98     lens[i] = 0;
99     for (k = kstart; k < kend; k++) {
100       if (ssmap[aj[k]]) lens[i]++;
101     }
102   }
103   /* Create and fill new matrix */
104   if (scall == MAT_REUSE_MATRIX) {
105     c = (Mat_SeqBAIJ *)((*B)->data);
106 
107     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
108     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
109     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
110     PetscCall(PetscArrayzero(c->ilen, c->mbs));
111     C = *B;
112   } else {
113     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
114     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
115     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
116     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
117   }
118   c = (Mat_SeqBAIJ *)(C->data);
119   for (i = 0; i < nrows; i++) {
120     row      = irow[i];
121     kstart   = ai[row];
122     kend     = kstart + a->ilen[row];
123     mat_i    = c->i[i];
124     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
125     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
126     mat_ilen = c->ilen + i;
127     for (k = kstart; k < kend; k++) {
128       if ((tcol = ssmap[a->j[k]])) {
129         *mat_j++ = tcol - 1;
130         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
131         mat_a += bs2;
132         (*mat_ilen)++;
133       }
134     }
135   }
136   /* sort */
137   if (c->j && c->a) {
138     MatScalar *work;
139     PetscCall(PetscMalloc1(bs2, &work));
140     for (i = 0; i < nrows; i++) {
141       PetscInt ilen;
142       mat_i = c->i[i];
143       mat_j = c->j + mat_i;
144       mat_a = c->a + mat_i * bs2;
145       ilen  = c->ilen[i];
146       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
147     }
148     PetscCall(PetscFree(work));
149   }
150 
151   /* Free work space */
152   PetscCall(ISRestoreIndices(iscol, &icol));
153   PetscCall(PetscFree(smap));
154   PetscCall(PetscFree(lens));
155   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
156   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
157 
158   PetscCall(ISRestoreIndices(isrow, &irow));
159   *B = C;
160   PetscFunctionReturn(0);
161 }
162 
163 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) {
164   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
165   IS              is1, is2;
166   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
167   const PetscInt *irow, *icol;
168 
169   PetscFunctionBegin;
170   PetscCall(ISGetIndices(isrow, &irow));
171   PetscCall(ISGetIndices(iscol, &icol));
172   PetscCall(ISGetLocalSize(isrow, &nrows));
173   PetscCall(ISGetLocalSize(iscol, &ncols));
174 
175   /* Verify if the indices corespond to each element in a block
176    and form the IS with compressed IS */
177   maxmnbs = PetscMax(a->mbs, a->nbs);
178   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
179   PetscCall(PetscArrayzero(vary, a->mbs));
180   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
181   for (i = 0; i < a->mbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks"); }
182   count = 0;
183   for (i = 0; i < nrows; i++) {
184     j = irow[i] / bs;
185     if ((vary[j]--) == bs) iary[count++] = j;
186   }
187   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
188 
189   PetscCall(PetscArrayzero(vary, a->nbs));
190   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
191   for (i = 0; i < a->nbs; i++) { PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc"); }
192   count = 0;
193   for (i = 0; i < ncols; i++) {
194     j = icol[i] / bs;
195     if ((vary[j]--) == bs) iary[count++] = j;
196   }
197   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
198   PetscCall(ISRestoreIndices(isrow, &irow));
199   PetscCall(ISRestoreIndices(iscol, &icol));
200   PetscCall(PetscFree2(vary, iary));
201 
202   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
203   PetscCall(ISDestroy(&is1));
204   PetscCall(ISDestroy(&is2));
205   PetscFunctionReturn(0);
206 }
207 
208 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) {
209   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
210   Mat_SubSppt *submatj = c->submatis1;
211 
212   PetscFunctionBegin;
213   PetscCall((*submatj->destroy)(C));
214   PetscCall(MatDestroySubMatrix_Private(submatj));
215   PetscFunctionReturn(0);
216 }
217 
218 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
219 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) {
220   PetscInt     i;
221   Mat          C;
222   Mat_SeqBAIJ *c;
223   Mat_SubSppt *submatj;
224 
225   PetscFunctionBegin;
226   for (i = 0; i < n; i++) {
227     C       = (*mat)[i];
228     c       = (Mat_SeqBAIJ *)C->data;
229     submatj = c->submatis1;
230     if (submatj) {
231       if (--((PetscObject)C)->refct <= 0) {
232         PetscCall(PetscFree(C->factorprefix));
233         PetscCall((*submatj->destroy)(C));
234         PetscCall(MatDestroySubMatrix_Private(submatj));
235         PetscCall(PetscFree(C->defaultvectype));
236         PetscCall(PetscLayoutDestroy(&C->rmap));
237         PetscCall(PetscLayoutDestroy(&C->cmap));
238         PetscCall(PetscHeaderDestroy(&C));
239       }
240     } else {
241       PetscCall(MatDestroy(&C));
242     }
243   }
244 
245   /* Destroy Dummy submatrices created for reuse */
246   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
247 
248   PetscCall(PetscFree(*mat));
249   PetscFunctionReturn(0);
250 }
251 
252 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) {
253   PetscInt i;
254 
255   PetscFunctionBegin;
256   if (scall == MAT_INITIAL_MATRIX) { PetscCall(PetscCalloc1(n + 1, B)); }
257 
258   for (i = 0; i < n; i++) { PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); }
259   PetscFunctionReturn(0);
260 }
261 
262 /* -------------------------------------------------------*/
263 /* Should check that shapes of vectors and matrices match */
264 /* -------------------------------------------------------*/
265 
266 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) {
267   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
268   PetscScalar       *z, sum;
269   const PetscScalar *x;
270   const MatScalar   *v;
271   PetscInt           mbs, i, n;
272   const PetscInt    *idx, *ii, *ridx = NULL;
273   PetscBool          usecprow = a->compressedrow.use;
274 
275   PetscFunctionBegin;
276   PetscCall(VecGetArrayRead(xx, &x));
277   PetscCall(VecGetArrayWrite(zz, &z));
278 
279   if (usecprow) {
280     mbs  = a->compressedrow.nrows;
281     ii   = a->compressedrow.i;
282     ridx = a->compressedrow.rindex;
283     PetscCall(PetscArrayzero(z, a->mbs));
284   } else {
285     mbs = a->mbs;
286     ii  = a->i;
287   }
288 
289   for (i = 0; i < mbs; i++) {
290     n   = ii[1] - ii[0];
291     v   = a->a + ii[0];
292     idx = a->j + ii[0];
293     ii++;
294     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
295     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
296     sum = 0.0;
297     PetscSparseDensePlusDot(sum, x, v, idx, n);
298     if (usecprow) {
299       z[ridx[i]] = sum;
300     } else {
301       z[i] = sum;
302     }
303   }
304   PetscCall(VecRestoreArrayRead(xx, &x));
305   PetscCall(VecRestoreArrayWrite(zz, &z));
306   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
307   PetscFunctionReturn(0);
308 }
309 
310 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) {
311   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
312   PetscScalar       *z = NULL, sum1, sum2, *zarray;
313   const PetscScalar *x, *xb;
314   PetscScalar        x1, x2;
315   const MatScalar   *v;
316   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
317   PetscBool          usecprow = a->compressedrow.use;
318 
319   PetscFunctionBegin;
320   PetscCall(VecGetArrayRead(xx, &x));
321   PetscCall(VecGetArrayWrite(zz, &zarray));
322 
323   idx = a->j;
324   v   = a->a;
325   if (usecprow) {
326     mbs  = a->compressedrow.nrows;
327     ii   = a->compressedrow.i;
328     ridx = a->compressedrow.rindex;
329     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
330   } else {
331     mbs = a->mbs;
332     ii  = a->i;
333     z   = zarray;
334   }
335 
336   for (i = 0; i < mbs; i++) {
337     n = ii[1] - ii[0];
338     ii++;
339     sum1 = 0.0;
340     sum2 = 0.0;
341     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
342     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
343     for (j = 0; j < n; j++) {
344       xb = x + 2 * (*idx++);
345       x1 = xb[0];
346       x2 = xb[1];
347       sum1 += v[0] * x1 + v[2] * x2;
348       sum2 += v[1] * x1 + v[3] * x2;
349       v += 4;
350     }
351     if (usecprow) z = zarray + 2 * ridx[i];
352     z[0] = sum1;
353     z[1] = sum2;
354     if (!usecprow) z += 2;
355   }
356   PetscCall(VecRestoreArrayRead(xx, &x));
357   PetscCall(VecRestoreArrayWrite(zz, &zarray));
358   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
359   PetscFunctionReturn(0);
360 }
361 
362 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) {
363   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
364   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
365   const PetscScalar *x, *xb;
366   const MatScalar   *v;
367   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
368   PetscBool          usecprow = a->compressedrow.use;
369 
370 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
371 #pragma disjoint(*v, *z, *xb)
372 #endif
373 
374   PetscFunctionBegin;
375   PetscCall(VecGetArrayRead(xx, &x));
376   PetscCall(VecGetArrayWrite(zz, &zarray));
377 
378   idx = a->j;
379   v   = a->a;
380   if (usecprow) {
381     mbs  = a->compressedrow.nrows;
382     ii   = a->compressedrow.i;
383     ridx = a->compressedrow.rindex;
384     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
385   } else {
386     mbs = a->mbs;
387     ii  = a->i;
388     z   = zarray;
389   }
390 
391   for (i = 0; i < mbs; i++) {
392     n = ii[1] - ii[0];
393     ii++;
394     sum1 = 0.0;
395     sum2 = 0.0;
396     sum3 = 0.0;
397     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
398     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
399     for (j = 0; j < n; j++) {
400       xb = x + 3 * (*idx++);
401       x1 = xb[0];
402       x2 = xb[1];
403       x3 = xb[2];
404 
405       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
406       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
407       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
408       v += 9;
409     }
410     if (usecprow) z = zarray + 3 * ridx[i];
411     z[0] = sum1;
412     z[1] = sum2;
413     z[2] = sum3;
414     if (!usecprow) z += 3;
415   }
416   PetscCall(VecRestoreArrayRead(xx, &x));
417   PetscCall(VecRestoreArrayWrite(zz, &zarray));
418   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
419   PetscFunctionReturn(0);
420 }
421 
422 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) {
423   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
424   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
425   const PetscScalar *x, *xb;
426   const MatScalar   *v;
427   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
428   PetscBool          usecprow = a->compressedrow.use;
429 
430   PetscFunctionBegin;
431   PetscCall(VecGetArrayRead(xx, &x));
432   PetscCall(VecGetArrayWrite(zz, &zarray));
433 
434   idx = a->j;
435   v   = a->a;
436   if (usecprow) {
437     mbs  = a->compressedrow.nrows;
438     ii   = a->compressedrow.i;
439     ridx = a->compressedrow.rindex;
440     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
441   } else {
442     mbs = a->mbs;
443     ii  = a->i;
444     z   = zarray;
445   }
446 
447   for (i = 0; i < mbs; i++) {
448     n = ii[1] - ii[0];
449     ii++;
450     sum1 = 0.0;
451     sum2 = 0.0;
452     sum3 = 0.0;
453     sum4 = 0.0;
454 
455     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
456     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
457     for (j = 0; j < n; j++) {
458       xb = x + 4 * (*idx++);
459       x1 = xb[0];
460       x2 = xb[1];
461       x3 = xb[2];
462       x4 = xb[3];
463       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
464       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
465       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
466       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
467       v += 16;
468     }
469     if (usecprow) z = zarray + 4 * ridx[i];
470     z[0] = sum1;
471     z[1] = sum2;
472     z[2] = sum3;
473     z[3] = sum4;
474     if (!usecprow) z += 4;
475   }
476   PetscCall(VecRestoreArrayRead(xx, &x));
477   PetscCall(VecRestoreArrayWrite(zz, &zarray));
478   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
479   PetscFunctionReturn(0);
480 }
481 
482 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) {
483   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
484   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
485   const PetscScalar *xb, *x;
486   const MatScalar   *v;
487   const PetscInt    *idx, *ii, *ridx = NULL;
488   PetscInt           mbs, i, j, n;
489   PetscBool          usecprow = a->compressedrow.use;
490 
491   PetscFunctionBegin;
492   PetscCall(VecGetArrayRead(xx, &x));
493   PetscCall(VecGetArrayWrite(zz, &zarray));
494 
495   idx = a->j;
496   v   = a->a;
497   if (usecprow) {
498     mbs  = a->compressedrow.nrows;
499     ii   = a->compressedrow.i;
500     ridx = a->compressedrow.rindex;
501     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
502   } else {
503     mbs = a->mbs;
504     ii  = a->i;
505     z   = zarray;
506   }
507 
508   for (i = 0; i < mbs; i++) {
509     n = ii[1] - ii[0];
510     ii++;
511     sum1 = 0.0;
512     sum2 = 0.0;
513     sum3 = 0.0;
514     sum4 = 0.0;
515     sum5 = 0.0;
516     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
517     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
518     for (j = 0; j < n; j++) {
519       xb = x + 5 * (*idx++);
520       x1 = xb[0];
521       x2 = xb[1];
522       x3 = xb[2];
523       x4 = xb[3];
524       x5 = xb[4];
525       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
526       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
527       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
528       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
529       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
530       v += 25;
531     }
532     if (usecprow) z = zarray + 5 * ridx[i];
533     z[0] = sum1;
534     z[1] = sum2;
535     z[2] = sum3;
536     z[3] = sum4;
537     z[4] = sum5;
538     if (!usecprow) z += 5;
539   }
540   PetscCall(VecRestoreArrayRead(xx, &x));
541   PetscCall(VecRestoreArrayWrite(zz, &zarray));
542   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
543   PetscFunctionReturn(0);
544 }
545 
546 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) {
547   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
548   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
549   const PetscScalar *x, *xb;
550   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
551   const MatScalar   *v;
552   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
553   PetscBool          usecprow = a->compressedrow.use;
554 
555   PetscFunctionBegin;
556   PetscCall(VecGetArrayRead(xx, &x));
557   PetscCall(VecGetArrayWrite(zz, &zarray));
558 
559   idx = a->j;
560   v   = a->a;
561   if (usecprow) {
562     mbs  = a->compressedrow.nrows;
563     ii   = a->compressedrow.i;
564     ridx = a->compressedrow.rindex;
565     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
566   } else {
567     mbs = a->mbs;
568     ii  = a->i;
569     z   = zarray;
570   }
571 
572   for (i = 0; i < mbs; i++) {
573     n = ii[1] - ii[0];
574     ii++;
575     sum1 = 0.0;
576     sum2 = 0.0;
577     sum3 = 0.0;
578     sum4 = 0.0;
579     sum5 = 0.0;
580     sum6 = 0.0;
581 
582     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
583     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
584     for (j = 0; j < n; j++) {
585       xb = x + 6 * (*idx++);
586       x1 = xb[0];
587       x2 = xb[1];
588       x3 = xb[2];
589       x4 = xb[3];
590       x5 = xb[4];
591       x6 = xb[5];
592       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
593       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
594       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
595       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
596       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
597       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
598       v += 36;
599     }
600     if (usecprow) z = zarray + 6 * ridx[i];
601     z[0] = sum1;
602     z[1] = sum2;
603     z[2] = sum3;
604     z[3] = sum4;
605     z[4] = sum5;
606     z[5] = sum6;
607     if (!usecprow) z += 6;
608   }
609 
610   PetscCall(VecRestoreArrayRead(xx, &x));
611   PetscCall(VecRestoreArrayWrite(zz, &zarray));
612   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
613   PetscFunctionReturn(0);
614 }
615 
616 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) {
617   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
618   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
619   const PetscScalar *x, *xb;
620   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
621   const MatScalar   *v;
622   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
623   PetscBool          usecprow = a->compressedrow.use;
624 
625   PetscFunctionBegin;
626   PetscCall(VecGetArrayRead(xx, &x));
627   PetscCall(VecGetArrayWrite(zz, &zarray));
628 
629   idx = a->j;
630   v   = a->a;
631   if (usecprow) {
632     mbs  = a->compressedrow.nrows;
633     ii   = a->compressedrow.i;
634     ridx = a->compressedrow.rindex;
635     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
636   } else {
637     mbs = a->mbs;
638     ii  = a->i;
639     z   = zarray;
640   }
641 
642   for (i = 0; i < mbs; i++) {
643     n = ii[1] - ii[0];
644     ii++;
645     sum1 = 0.0;
646     sum2 = 0.0;
647     sum3 = 0.0;
648     sum4 = 0.0;
649     sum5 = 0.0;
650     sum6 = 0.0;
651     sum7 = 0.0;
652 
653     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
654     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
655     for (j = 0; j < n; j++) {
656       xb = x + 7 * (*idx++);
657       x1 = xb[0];
658       x2 = xb[1];
659       x3 = xb[2];
660       x4 = xb[3];
661       x5 = xb[4];
662       x6 = xb[5];
663       x7 = xb[6];
664       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
665       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
666       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
667       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
668       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
669       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
670       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
671       v += 49;
672     }
673     if (usecprow) z = zarray + 7 * ridx[i];
674     z[0] = sum1;
675     z[1] = sum2;
676     z[2] = sum3;
677     z[3] = sum4;
678     z[4] = sum5;
679     z[5] = sum6;
680     z[6] = sum7;
681     if (!usecprow) z += 7;
682   }
683 
684   PetscCall(VecRestoreArrayRead(xx, &x));
685   PetscCall(VecRestoreArrayWrite(zz, &zarray));
686   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
687   PetscFunctionReturn(0);
688 }
689 
690 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
691 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) {
692   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
693   PetscScalar       *z = NULL, *work, *workt, *zarray;
694   const PetscScalar *x, *xb;
695   const MatScalar   *v;
696   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
697   const PetscInt    *idx, *ii, *ridx = NULL;
698   PetscInt           k;
699   PetscBool          usecprow = a->compressedrow.use;
700 
701   __m256d a0, a1, a2, a3, a4, a5;
702   __m256d w0, w1, w2, w3;
703   __m256d z0, z1, z2;
704   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
705 
706   PetscFunctionBegin;
707   PetscCall(VecGetArrayRead(xx, &x));
708   PetscCall(VecGetArrayWrite(zz, &zarray));
709 
710   idx = a->j;
711   v   = a->a;
712   if (usecprow) {
713     mbs  = a->compressedrow.nrows;
714     ii   = a->compressedrow.i;
715     ridx = a->compressedrow.rindex;
716     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
717   } else {
718     mbs = a->mbs;
719     ii  = a->i;
720     z   = zarray;
721   }
722 
723   if (!a->mult_work) {
724     k = PetscMax(A->rmap->n, A->cmap->n);
725     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
726   }
727 
728   work = a->mult_work;
729   for (i = 0; i < mbs; i++) {
730     n = ii[1] - ii[0];
731     ii++;
732     workt = work;
733     for (j = 0; j < n; j++) {
734       xb = x + bs * (*idx++);
735       for (k = 0; k < bs; k++) workt[k] = xb[k];
736       workt += bs;
737     }
738     if (usecprow) z = zarray + bs * ridx[i];
739 
740     z0 = _mm256_setzero_pd();
741     z1 = _mm256_setzero_pd();
742     z2 = _mm256_setzero_pd();
743 
744     for (j = 0; j < n; j++) {
745       /* first column of a */
746       w0 = _mm256_set1_pd(work[j * 9]);
747       a0 = _mm256_loadu_pd(&v[j * 81]);
748       z0 = _mm256_fmadd_pd(a0, w0, z0);
749       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
750       z1 = _mm256_fmadd_pd(a1, w0, z1);
751       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
752       z2 = _mm256_fmadd_pd(a2, w0, z2);
753 
754       /* second column of a */
755       w1 = _mm256_set1_pd(work[j * 9 + 1]);
756       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
757       z0 = _mm256_fmadd_pd(a0, w1, z0);
758       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
759       z1 = _mm256_fmadd_pd(a1, w1, z1);
760       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
761       z2 = _mm256_fmadd_pd(a2, w1, z2);
762 
763       /* third column of a */
764       w2 = _mm256_set1_pd(work[j * 9 + 2]);
765       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
766       z0 = _mm256_fmadd_pd(a3, w2, z0);
767       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
768       z1 = _mm256_fmadd_pd(a4, w2, z1);
769       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
770       z2 = _mm256_fmadd_pd(a5, w2, z2);
771 
772       /* fourth column of a */
773       w3 = _mm256_set1_pd(work[j * 9 + 3]);
774       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
775       z0 = _mm256_fmadd_pd(a0, w3, z0);
776       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
777       z1 = _mm256_fmadd_pd(a1, w3, z1);
778       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
779       z2 = _mm256_fmadd_pd(a2, w3, z2);
780 
781       /* fifth column of a */
782       w0 = _mm256_set1_pd(work[j * 9 + 4]);
783       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
784       z0 = _mm256_fmadd_pd(a3, w0, z0);
785       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
786       z1 = _mm256_fmadd_pd(a4, w0, z1);
787       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
788       z2 = _mm256_fmadd_pd(a5, w0, z2);
789 
790       /* sixth column of a */
791       w1 = _mm256_set1_pd(work[j * 9 + 5]);
792       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
793       z0 = _mm256_fmadd_pd(a0, w1, z0);
794       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
795       z1 = _mm256_fmadd_pd(a1, w1, z1);
796       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
797       z2 = _mm256_fmadd_pd(a2, w1, z2);
798 
799       /* seventh column of a */
800       w2 = _mm256_set1_pd(work[j * 9 + 6]);
801       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
802       z0 = _mm256_fmadd_pd(a0, w2, z0);
803       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
804       z1 = _mm256_fmadd_pd(a1, w2, z1);
805       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
806       z2 = _mm256_fmadd_pd(a2, w2, z2);
807 
808       /* eighth column of a */
809       w3 = _mm256_set1_pd(work[j * 9 + 7]);
810       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
811       z0 = _mm256_fmadd_pd(a3, w3, z0);
812       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
813       z1 = _mm256_fmadd_pd(a4, w3, z1);
814       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
815       z2 = _mm256_fmadd_pd(a5, w3, z2);
816 
817       /* ninth column of a */
818       w0 = _mm256_set1_pd(work[j * 9 + 8]);
819       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
820       z0 = _mm256_fmadd_pd(a0, w0, z0);
821       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
822       z1 = _mm256_fmadd_pd(a1, w0, z1);
823       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
824       z2 = _mm256_fmadd_pd(a2, w0, z2);
825     }
826 
827     _mm256_storeu_pd(&z[0], z0);
828     _mm256_storeu_pd(&z[4], z1);
829     _mm256_maskstore_pd(&z[8], mask1, z2);
830 
831     v += n * bs2;
832     if (!usecprow) z += bs;
833   }
834   PetscCall(VecRestoreArrayRead(xx, &x));
835   PetscCall(VecRestoreArrayWrite(zz, &zarray));
836   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
837   PetscFunctionReturn(0);
838 }
839 #endif
840 
841 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) {
842   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
843   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
844   const PetscScalar *x, *xb;
845   PetscScalar       *zarray, xv;
846   const MatScalar   *v;
847   const PetscInt    *ii, *ij = a->j, *idx;
848   PetscInt           mbs, i, j, k, n, *ridx = NULL;
849   PetscBool          usecprow = a->compressedrow.use;
850 
851   PetscFunctionBegin;
852   PetscCall(VecGetArrayRead(xx, &x));
853   PetscCall(VecGetArrayWrite(zz, &zarray));
854 
855   v = a->a;
856   if (usecprow) {
857     mbs  = a->compressedrow.nrows;
858     ii   = a->compressedrow.i;
859     ridx = a->compressedrow.rindex;
860     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
861   } else {
862     mbs = a->mbs;
863     ii  = a->i;
864     z   = zarray;
865   }
866 
867   for (i = 0; i < mbs; i++) {
868     n     = ii[i + 1] - ii[i];
869     idx   = ij + ii[i];
870     sum1  = 0.0;
871     sum2  = 0.0;
872     sum3  = 0.0;
873     sum4  = 0.0;
874     sum5  = 0.0;
875     sum6  = 0.0;
876     sum7  = 0.0;
877     sum8  = 0.0;
878     sum9  = 0.0;
879     sum10 = 0.0;
880     sum11 = 0.0;
881 
882     for (j = 0; j < n; j++) {
883       xb = x + 11 * (idx[j]);
884 
885       for (k = 0; k < 11; k++) {
886         xv = xb[k];
887         sum1 += v[0] * xv;
888         sum2 += v[1] * xv;
889         sum3 += v[2] * xv;
890         sum4 += v[3] * xv;
891         sum5 += v[4] * xv;
892         sum6 += v[5] * xv;
893         sum7 += v[6] * xv;
894         sum8 += v[7] * xv;
895         sum9 += v[8] * xv;
896         sum10 += v[9] * xv;
897         sum11 += v[10] * xv;
898         v += 11;
899       }
900     }
901     if (usecprow) z = zarray + 11 * ridx[i];
902     z[0]  = sum1;
903     z[1]  = sum2;
904     z[2]  = sum3;
905     z[3]  = sum4;
906     z[4]  = sum5;
907     z[5]  = sum6;
908     z[6]  = sum7;
909     z[7]  = sum8;
910     z[8]  = sum9;
911     z[9]  = sum10;
912     z[10] = sum11;
913 
914     if (!usecprow) z += 11;
915   }
916 
917   PetscCall(VecRestoreArrayRead(xx, &x));
918   PetscCall(VecRestoreArrayWrite(zz, &zarray));
919   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
920   PetscFunctionReturn(0);
921 }
922 
923 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
924 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) {
925   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
926   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
927   const PetscScalar *x, *xb;
928   PetscScalar       *zarray, xv;
929   const MatScalar   *v;
930   const PetscInt    *ii, *ij = a->j, *idx;
931   PetscInt           mbs, i, j, k, n, *ridx = NULL;
932   PetscBool          usecprow = a->compressedrow.use;
933 
934   PetscFunctionBegin;
935   PetscCall(VecGetArrayRead(xx, &x));
936   PetscCall(VecGetArrayWrite(zz, &zarray));
937 
938   v = a->a;
939   if (usecprow) {
940     mbs  = a->compressedrow.nrows;
941     ii   = a->compressedrow.i;
942     ridx = a->compressedrow.rindex;
943     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
944   } else {
945     mbs = a->mbs;
946     ii  = a->i;
947     z   = zarray;
948   }
949 
950   for (i = 0; i < mbs; i++) {
951     n     = ii[i + 1] - ii[i];
952     idx   = ij + ii[i];
953     sum1  = 0.0;
954     sum2  = 0.0;
955     sum3  = 0.0;
956     sum4  = 0.0;
957     sum5  = 0.0;
958     sum6  = 0.0;
959     sum7  = 0.0;
960     sum8  = 0.0;
961     sum9  = 0.0;
962     sum10 = 0.0;
963     sum11 = 0.0;
964     sum12 = 0.0;
965 
966     for (j = 0; j < n; j++) {
967       xb = x + 12 * (idx[j]);
968 
969       for (k = 0; k < 12; k++) {
970         xv = xb[k];
971         sum1 += v[0] * xv;
972         sum2 += v[1] * xv;
973         sum3 += v[2] * xv;
974         sum4 += v[3] * xv;
975         sum5 += v[4] * xv;
976         sum6 += v[5] * xv;
977         sum7 += v[6] * xv;
978         sum8 += v[7] * xv;
979         sum9 += v[8] * xv;
980         sum10 += v[9] * xv;
981         sum11 += v[10] * xv;
982         sum12 += v[11] * xv;
983         v += 12;
984       }
985     }
986     if (usecprow) z = zarray + 12 * ridx[i];
987     z[0]  = sum1;
988     z[1]  = sum2;
989     z[2]  = sum3;
990     z[3]  = sum4;
991     z[4]  = sum5;
992     z[5]  = sum6;
993     z[6]  = sum7;
994     z[7]  = sum8;
995     z[8]  = sum9;
996     z[9]  = sum10;
997     z[10] = sum11;
998     z[11] = sum12;
999     if (!usecprow) z += 12;
1000   }
1001   PetscCall(VecRestoreArrayRead(xx, &x));
1002   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1003   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1004   PetscFunctionReturn(0);
1005 }
1006 
1007 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) {
1008   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1009   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1010   const PetscScalar *x, *xb;
1011   PetscScalar       *zarray, *yarray, xv;
1012   const MatScalar   *v;
1013   const PetscInt    *ii, *ij = a->j, *idx;
1014   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
1015   PetscBool          usecprow = a->compressedrow.use;
1016 
1017   PetscFunctionBegin;
1018   PetscCall(VecGetArrayRead(xx, &x));
1019   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1020 
1021   v = a->a;
1022   if (usecprow) {
1023     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); }
1024     mbs  = a->compressedrow.nrows;
1025     ii   = a->compressedrow.i;
1026     ridx = a->compressedrow.rindex;
1027   } else {
1028     ii = a->i;
1029     y  = yarray;
1030     z  = zarray;
1031   }
1032 
1033   for (i = 0; i < mbs; i++) {
1034     n   = ii[i + 1] - ii[i];
1035     idx = ij + ii[i];
1036 
1037     if (usecprow) {
1038       y = yarray + 12 * ridx[i];
1039       z = zarray + 12 * ridx[i];
1040     }
1041     sum1  = y[0];
1042     sum2  = y[1];
1043     sum3  = y[2];
1044     sum4  = y[3];
1045     sum5  = y[4];
1046     sum6  = y[5];
1047     sum7  = y[6];
1048     sum8  = y[7];
1049     sum9  = y[8];
1050     sum10 = y[9];
1051     sum11 = y[10];
1052     sum12 = y[11];
1053 
1054     for (j = 0; j < n; j++) {
1055       xb = x + 12 * (idx[j]);
1056 
1057       for (k = 0; k < 12; k++) {
1058         xv = xb[k];
1059         sum1 += v[0] * xv;
1060         sum2 += v[1] * xv;
1061         sum3 += v[2] * xv;
1062         sum4 += v[3] * xv;
1063         sum5 += v[4] * xv;
1064         sum6 += v[5] * xv;
1065         sum7 += v[6] * xv;
1066         sum8 += v[7] * xv;
1067         sum9 += v[8] * xv;
1068         sum10 += v[9] * xv;
1069         sum11 += v[10] * xv;
1070         sum12 += v[11] * xv;
1071         v += 12;
1072       }
1073     }
1074 
1075     z[0]  = sum1;
1076     z[1]  = sum2;
1077     z[2]  = sum3;
1078     z[3]  = sum4;
1079     z[4]  = sum5;
1080     z[5]  = sum6;
1081     z[6]  = sum7;
1082     z[7]  = sum8;
1083     z[8]  = sum9;
1084     z[9]  = sum10;
1085     z[10] = sum11;
1086     z[11] = sum12;
1087     if (!usecprow) {
1088       y += 12;
1089       z += 12;
1090     }
1091   }
1092   PetscCall(VecRestoreArrayRead(xx, &x));
1093   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1094   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1095   PetscFunctionReturn(0);
1096 }
1097 
1098 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1099 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) {
1100   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1101   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1102   const PetscScalar *x, *xb;
1103   PetscScalar        x1, x2, x3, x4, *zarray;
1104   const MatScalar   *v;
1105   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1106   PetscInt           mbs, i, j, n;
1107   PetscBool          usecprow = a->compressedrow.use;
1108 
1109   PetscFunctionBegin;
1110   PetscCall(VecGetArrayRead(xx, &x));
1111   PetscCall(VecGetArrayWrite(zz, &zarray));
1112 
1113   v = a->a;
1114   if (usecprow) {
1115     mbs  = a->compressedrow.nrows;
1116     ii   = a->compressedrow.i;
1117     ridx = a->compressedrow.rindex;
1118     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1119   } else {
1120     mbs = a->mbs;
1121     ii  = a->i;
1122     z   = zarray;
1123   }
1124 
1125   for (i = 0; i < mbs; i++) {
1126     n   = ii[i + 1] - ii[i];
1127     idx = ij + ii[i];
1128 
1129     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1130     for (j = 0; j < n; j++) {
1131       xb = x + 12 * (idx[j]);
1132       x1 = xb[0];
1133       x2 = xb[1];
1134       x3 = xb[2];
1135       x4 = xb[3];
1136 
1137       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1138       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1139       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1140       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1141       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1142       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1143       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1144       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1145       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1146       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1147       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1148       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1149       v += 48;
1150 
1151       x1 = xb[4];
1152       x2 = xb[5];
1153       x3 = xb[6];
1154       x4 = xb[7];
1155 
1156       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1157       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1158       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1159       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1160       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1161       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1162       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1163       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1164       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1165       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1166       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1167       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1168       v += 48;
1169 
1170       x1 = xb[8];
1171       x2 = xb[9];
1172       x3 = xb[10];
1173       x4 = xb[11];
1174       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1175       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1176       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1177       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1178       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1179       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1180       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1181       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1182       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1183       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1184       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1185       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1186       v += 48;
1187     }
1188     if (usecprow) z = zarray + 12 * ridx[i];
1189     z[0]  = sum1;
1190     z[1]  = sum2;
1191     z[2]  = sum3;
1192     z[3]  = sum4;
1193     z[4]  = sum5;
1194     z[5]  = sum6;
1195     z[6]  = sum7;
1196     z[7]  = sum8;
1197     z[8]  = sum9;
1198     z[9]  = sum10;
1199     z[10] = sum11;
1200     z[11] = sum12;
1201     if (!usecprow) z += 12;
1202   }
1203   PetscCall(VecRestoreArrayRead(xx, &x));
1204   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1205   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1206   PetscFunctionReturn(0);
1207 }
1208 
1209 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1210 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) {
1211   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1212   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1213   const PetscScalar *x, *xb;
1214   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1215   const MatScalar   *v;
1216   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1217   PetscInt           mbs      = a->mbs, i, j, n;
1218   PetscBool          usecprow = a->compressedrow.use;
1219 
1220   PetscFunctionBegin;
1221   PetscCall(VecGetArrayRead(xx, &x));
1222   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1223 
1224   v = a->a;
1225   if (usecprow) {
1226     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); }
1227     mbs  = a->compressedrow.nrows;
1228     ii   = a->compressedrow.i;
1229     ridx = a->compressedrow.rindex;
1230   } else {
1231     ii = a->i;
1232     y  = yarray;
1233     z  = zarray;
1234   }
1235 
1236   for (i = 0; i < mbs; i++) {
1237     n   = ii[i + 1] - ii[i];
1238     idx = ij + ii[i];
1239 
1240     if (usecprow) {
1241       y = yarray + 12 * ridx[i];
1242       z = zarray + 12 * ridx[i];
1243     }
1244     sum1  = y[0];
1245     sum2  = y[1];
1246     sum3  = y[2];
1247     sum4  = y[3];
1248     sum5  = y[4];
1249     sum6  = y[5];
1250     sum7  = y[6];
1251     sum8  = y[7];
1252     sum9  = y[8];
1253     sum10 = y[9];
1254     sum11 = y[10];
1255     sum12 = y[11];
1256 
1257     for (j = 0; j < n; j++) {
1258       xb = x + 12 * (idx[j]);
1259       x1 = xb[0];
1260       x2 = xb[1];
1261       x3 = xb[2];
1262       x4 = xb[3];
1263 
1264       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1265       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1266       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1267       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1268       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1269       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1270       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1271       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1272       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1273       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1274       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1275       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1276       v += 48;
1277 
1278       x1 = xb[4];
1279       x2 = xb[5];
1280       x3 = xb[6];
1281       x4 = xb[7];
1282 
1283       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1284       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1285       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1286       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1287       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1288       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1289       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1290       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1291       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1292       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1293       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1294       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1295       v += 48;
1296 
1297       x1 = xb[8];
1298       x2 = xb[9];
1299       x3 = xb[10];
1300       x4 = xb[11];
1301       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1302       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1303       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1304       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1305       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1306       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1307       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1308       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1309       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1310       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1311       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1312       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1313       v += 48;
1314     }
1315     z[0]  = sum1;
1316     z[1]  = sum2;
1317     z[2]  = sum3;
1318     z[3]  = sum4;
1319     z[4]  = sum5;
1320     z[5]  = sum6;
1321     z[6]  = sum7;
1322     z[7]  = sum8;
1323     z[8]  = sum9;
1324     z[9]  = sum10;
1325     z[10] = sum11;
1326     z[11] = sum12;
1327     if (!usecprow) {
1328       y += 12;
1329       z += 12;
1330     }
1331   }
1332   PetscCall(VecRestoreArrayRead(xx, &x));
1333   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1334   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1335   PetscFunctionReturn(0);
1336 }
1337 
1338 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1339 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) {
1340   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1341   PetscScalar       *z = NULL, *zarray;
1342   const PetscScalar *x, *work;
1343   const MatScalar   *v = a->a;
1344   PetscInt           mbs, i, j, n;
1345   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1346   PetscBool          usecprow = a->compressedrow.use;
1347   const PetscInt     bs = 12, bs2 = 144;
1348 
1349   __m256d a0, a1, a2, a3, a4, a5;
1350   __m256d w0, w1, w2, w3;
1351   __m256d z0, z1, z2;
1352 
1353   PetscFunctionBegin;
1354   PetscCall(VecGetArrayRead(xx, &x));
1355   PetscCall(VecGetArrayWrite(zz, &zarray));
1356 
1357   if (usecprow) {
1358     mbs  = a->compressedrow.nrows;
1359     ii   = a->compressedrow.i;
1360     ridx = a->compressedrow.rindex;
1361     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1362   } else {
1363     mbs = a->mbs;
1364     ii  = a->i;
1365     z   = zarray;
1366   }
1367 
1368   for (i = 0; i < mbs; i++) {
1369     z0 = _mm256_setzero_pd();
1370     z1 = _mm256_setzero_pd();
1371     z2 = _mm256_setzero_pd();
1372 
1373     n = ii[1] - ii[0];
1374     ii++;
1375     for (j = 0; j < n; j++) {
1376       work = x + bs * (*idx++);
1377 
1378       /* first column of a */
1379       w0 = _mm256_set1_pd(work[0]);
1380       a0 = _mm256_loadu_pd(v + 0);
1381       z0 = _mm256_fmadd_pd(a0, w0, z0);
1382       a1 = _mm256_loadu_pd(v + 4);
1383       z1 = _mm256_fmadd_pd(a1, w0, z1);
1384       a2 = _mm256_loadu_pd(v + 8);
1385       z2 = _mm256_fmadd_pd(a2, w0, z2);
1386 
1387       /* second column of a */
1388       w1 = _mm256_set1_pd(work[1]);
1389       a3 = _mm256_loadu_pd(v + 12);
1390       z0 = _mm256_fmadd_pd(a3, w1, z0);
1391       a4 = _mm256_loadu_pd(v + 16);
1392       z1 = _mm256_fmadd_pd(a4, w1, z1);
1393       a5 = _mm256_loadu_pd(v + 20);
1394       z2 = _mm256_fmadd_pd(a5, w1, z2);
1395 
1396       /* third column of a */
1397       w2 = _mm256_set1_pd(work[2]);
1398       a0 = _mm256_loadu_pd(v + 24);
1399       z0 = _mm256_fmadd_pd(a0, w2, z0);
1400       a1 = _mm256_loadu_pd(v + 28);
1401       z1 = _mm256_fmadd_pd(a1, w2, z1);
1402       a2 = _mm256_loadu_pd(v + 32);
1403       z2 = _mm256_fmadd_pd(a2, w2, z2);
1404 
1405       /* fourth column of a */
1406       w3 = _mm256_set1_pd(work[3]);
1407       a3 = _mm256_loadu_pd(v + 36);
1408       z0 = _mm256_fmadd_pd(a3, w3, z0);
1409       a4 = _mm256_loadu_pd(v + 40);
1410       z1 = _mm256_fmadd_pd(a4, w3, z1);
1411       a5 = _mm256_loadu_pd(v + 44);
1412       z2 = _mm256_fmadd_pd(a5, w3, z2);
1413 
1414       /* fifth column of a */
1415       w0 = _mm256_set1_pd(work[4]);
1416       a0 = _mm256_loadu_pd(v + 48);
1417       z0 = _mm256_fmadd_pd(a0, w0, z0);
1418       a1 = _mm256_loadu_pd(v + 52);
1419       z1 = _mm256_fmadd_pd(a1, w0, z1);
1420       a2 = _mm256_loadu_pd(v + 56);
1421       z2 = _mm256_fmadd_pd(a2, w0, z2);
1422 
1423       /* sixth column of a */
1424       w1 = _mm256_set1_pd(work[5]);
1425       a3 = _mm256_loadu_pd(v + 60);
1426       z0 = _mm256_fmadd_pd(a3, w1, z0);
1427       a4 = _mm256_loadu_pd(v + 64);
1428       z1 = _mm256_fmadd_pd(a4, w1, z1);
1429       a5 = _mm256_loadu_pd(v + 68);
1430       z2 = _mm256_fmadd_pd(a5, w1, z2);
1431 
1432       /* seventh column of a */
1433       w2 = _mm256_set1_pd(work[6]);
1434       a0 = _mm256_loadu_pd(v + 72);
1435       z0 = _mm256_fmadd_pd(a0, w2, z0);
1436       a1 = _mm256_loadu_pd(v + 76);
1437       z1 = _mm256_fmadd_pd(a1, w2, z1);
1438       a2 = _mm256_loadu_pd(v + 80);
1439       z2 = _mm256_fmadd_pd(a2, w2, z2);
1440 
1441       /* eighth column of a */
1442       w3 = _mm256_set1_pd(work[7]);
1443       a3 = _mm256_loadu_pd(v + 84);
1444       z0 = _mm256_fmadd_pd(a3, w3, z0);
1445       a4 = _mm256_loadu_pd(v + 88);
1446       z1 = _mm256_fmadd_pd(a4, w3, z1);
1447       a5 = _mm256_loadu_pd(v + 92);
1448       z2 = _mm256_fmadd_pd(a5, w3, z2);
1449 
1450       /* ninth column of a */
1451       w0 = _mm256_set1_pd(work[8]);
1452       a0 = _mm256_loadu_pd(v + 96);
1453       z0 = _mm256_fmadd_pd(a0, w0, z0);
1454       a1 = _mm256_loadu_pd(v + 100);
1455       z1 = _mm256_fmadd_pd(a1, w0, z1);
1456       a2 = _mm256_loadu_pd(v + 104);
1457       z2 = _mm256_fmadd_pd(a2, w0, z2);
1458 
1459       /* tenth column of a */
1460       w1 = _mm256_set1_pd(work[9]);
1461       a3 = _mm256_loadu_pd(v + 108);
1462       z0 = _mm256_fmadd_pd(a3, w1, z0);
1463       a4 = _mm256_loadu_pd(v + 112);
1464       z1 = _mm256_fmadd_pd(a4, w1, z1);
1465       a5 = _mm256_loadu_pd(v + 116);
1466       z2 = _mm256_fmadd_pd(a5, w1, z2);
1467 
1468       /* eleventh column of a */
1469       w2 = _mm256_set1_pd(work[10]);
1470       a0 = _mm256_loadu_pd(v + 120);
1471       z0 = _mm256_fmadd_pd(a0, w2, z0);
1472       a1 = _mm256_loadu_pd(v + 124);
1473       z1 = _mm256_fmadd_pd(a1, w2, z1);
1474       a2 = _mm256_loadu_pd(v + 128);
1475       z2 = _mm256_fmadd_pd(a2, w2, z2);
1476 
1477       /* twelveth column of a */
1478       w3 = _mm256_set1_pd(work[11]);
1479       a3 = _mm256_loadu_pd(v + 132);
1480       z0 = _mm256_fmadd_pd(a3, w3, z0);
1481       a4 = _mm256_loadu_pd(v + 136);
1482       z1 = _mm256_fmadd_pd(a4, w3, z1);
1483       a5 = _mm256_loadu_pd(v + 140);
1484       z2 = _mm256_fmadd_pd(a5, w3, z2);
1485 
1486       v += bs2;
1487     }
1488     if (usecprow) z = zarray + bs * ridx[i];
1489     _mm256_storeu_pd(&z[0], z0);
1490     _mm256_storeu_pd(&z[4], z1);
1491     _mm256_storeu_pd(&z[8], z2);
1492     if (!usecprow) z += bs;
1493   }
1494   PetscCall(VecRestoreArrayRead(xx, &x));
1495   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1496   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1497   PetscFunctionReturn(0);
1498 }
1499 #endif
1500 
1501 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1502 /* Default MatMult for block size 15 */
1503 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) {
1504   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1505   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1506   const PetscScalar *x, *xb;
1507   PetscScalar       *zarray, xv;
1508   const MatScalar   *v;
1509   const PetscInt    *ii, *ij = a->j, *idx;
1510   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1511   PetscBool          usecprow = a->compressedrow.use;
1512 
1513   PetscFunctionBegin;
1514   PetscCall(VecGetArrayRead(xx, &x));
1515   PetscCall(VecGetArrayWrite(zz, &zarray));
1516 
1517   v = a->a;
1518   if (usecprow) {
1519     mbs  = a->compressedrow.nrows;
1520     ii   = a->compressedrow.i;
1521     ridx = a->compressedrow.rindex;
1522     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1523   } else {
1524     mbs = a->mbs;
1525     ii  = a->i;
1526     z   = zarray;
1527   }
1528 
1529   for (i = 0; i < mbs; i++) {
1530     n     = ii[i + 1] - ii[i];
1531     idx   = ij + ii[i];
1532     sum1  = 0.0;
1533     sum2  = 0.0;
1534     sum3  = 0.0;
1535     sum4  = 0.0;
1536     sum5  = 0.0;
1537     sum6  = 0.0;
1538     sum7  = 0.0;
1539     sum8  = 0.0;
1540     sum9  = 0.0;
1541     sum10 = 0.0;
1542     sum11 = 0.0;
1543     sum12 = 0.0;
1544     sum13 = 0.0;
1545     sum14 = 0.0;
1546     sum15 = 0.0;
1547 
1548     for (j = 0; j < n; j++) {
1549       xb = x + 15 * (idx[j]);
1550 
1551       for (k = 0; k < 15; k++) {
1552         xv = xb[k];
1553         sum1 += v[0] * xv;
1554         sum2 += v[1] * xv;
1555         sum3 += v[2] * xv;
1556         sum4 += v[3] * xv;
1557         sum5 += v[4] * xv;
1558         sum6 += v[5] * xv;
1559         sum7 += v[6] * xv;
1560         sum8 += v[7] * xv;
1561         sum9 += v[8] * xv;
1562         sum10 += v[9] * xv;
1563         sum11 += v[10] * xv;
1564         sum12 += v[11] * xv;
1565         sum13 += v[12] * xv;
1566         sum14 += v[13] * xv;
1567         sum15 += v[14] * xv;
1568         v += 15;
1569       }
1570     }
1571     if (usecprow) z = zarray + 15 * ridx[i];
1572     z[0]  = sum1;
1573     z[1]  = sum2;
1574     z[2]  = sum3;
1575     z[3]  = sum4;
1576     z[4]  = sum5;
1577     z[5]  = sum6;
1578     z[6]  = sum7;
1579     z[7]  = sum8;
1580     z[8]  = sum9;
1581     z[9]  = sum10;
1582     z[10] = sum11;
1583     z[11] = sum12;
1584     z[12] = sum13;
1585     z[13] = sum14;
1586     z[14] = sum15;
1587 
1588     if (!usecprow) z += 15;
1589   }
1590 
1591   PetscCall(VecRestoreArrayRead(xx, &x));
1592   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1593   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1594   PetscFunctionReturn(0);
1595 }
1596 
1597 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1598 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) {
1599   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1600   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1601   const PetscScalar *x, *xb;
1602   PetscScalar        x1, x2, x3, x4, *zarray;
1603   const MatScalar   *v;
1604   const PetscInt    *ii, *ij = a->j, *idx;
1605   PetscInt           mbs, i, j, n, *ridx = NULL;
1606   PetscBool          usecprow = a->compressedrow.use;
1607 
1608   PetscFunctionBegin;
1609   PetscCall(VecGetArrayRead(xx, &x));
1610   PetscCall(VecGetArrayWrite(zz, &zarray));
1611 
1612   v = a->a;
1613   if (usecprow) {
1614     mbs  = a->compressedrow.nrows;
1615     ii   = a->compressedrow.i;
1616     ridx = a->compressedrow.rindex;
1617     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1618   } else {
1619     mbs = a->mbs;
1620     ii  = a->i;
1621     z   = zarray;
1622   }
1623 
1624   for (i = 0; i < mbs; i++) {
1625     n     = ii[i + 1] - ii[i];
1626     idx   = ij + ii[i];
1627     sum1  = 0.0;
1628     sum2  = 0.0;
1629     sum3  = 0.0;
1630     sum4  = 0.0;
1631     sum5  = 0.0;
1632     sum6  = 0.0;
1633     sum7  = 0.0;
1634     sum8  = 0.0;
1635     sum9  = 0.0;
1636     sum10 = 0.0;
1637     sum11 = 0.0;
1638     sum12 = 0.0;
1639     sum13 = 0.0;
1640     sum14 = 0.0;
1641     sum15 = 0.0;
1642 
1643     for (j = 0; j < n; j++) {
1644       xb = x + 15 * (idx[j]);
1645       x1 = xb[0];
1646       x2 = xb[1];
1647       x3 = xb[2];
1648       x4 = xb[3];
1649 
1650       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1651       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1652       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1653       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1654       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1655       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1656       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1657       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1658       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1659       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1660       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1661       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1662       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1663       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1664       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1665 
1666       v += 60;
1667 
1668       x1 = xb[4];
1669       x2 = xb[5];
1670       x3 = xb[6];
1671       x4 = xb[7];
1672 
1673       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1674       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1675       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1676       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1677       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1678       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1679       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1680       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1681       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1682       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1683       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1684       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1685       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1686       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1687       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1688       v += 60;
1689 
1690       x1 = xb[8];
1691       x2 = xb[9];
1692       x3 = xb[10];
1693       x4 = xb[11];
1694       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1695       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1696       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1697       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1698       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1699       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1700       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1701       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1702       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1703       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1704       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1705       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1706       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1707       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1708       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1709       v += 60;
1710 
1711       x1 = xb[12];
1712       x2 = xb[13];
1713       x3 = xb[14];
1714       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1715       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1716       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1717       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1718       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1719       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1720       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1721       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1722       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1723       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1724       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1725       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1726       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1727       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1728       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1729       v += 45;
1730     }
1731     if (usecprow) z = zarray + 15 * ridx[i];
1732     z[0]  = sum1;
1733     z[1]  = sum2;
1734     z[2]  = sum3;
1735     z[3]  = sum4;
1736     z[4]  = sum5;
1737     z[5]  = sum6;
1738     z[6]  = sum7;
1739     z[7]  = sum8;
1740     z[8]  = sum9;
1741     z[9]  = sum10;
1742     z[10] = sum11;
1743     z[11] = sum12;
1744     z[12] = sum13;
1745     z[13] = sum14;
1746     z[14] = sum15;
1747 
1748     if (!usecprow) z += 15;
1749   }
1750 
1751   PetscCall(VecRestoreArrayRead(xx, &x));
1752   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1753   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1754   PetscFunctionReturn(0);
1755 }
1756 
1757 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1758 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) {
1759   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1760   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1761   const PetscScalar *x, *xb;
1762   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1763   const MatScalar   *v;
1764   const PetscInt    *ii, *ij = a->j, *idx;
1765   PetscInt           mbs, i, j, n, *ridx = NULL;
1766   PetscBool          usecprow = a->compressedrow.use;
1767 
1768   PetscFunctionBegin;
1769   PetscCall(VecGetArrayRead(xx, &x));
1770   PetscCall(VecGetArrayWrite(zz, &zarray));
1771 
1772   v = a->a;
1773   if (usecprow) {
1774     mbs  = a->compressedrow.nrows;
1775     ii   = a->compressedrow.i;
1776     ridx = a->compressedrow.rindex;
1777     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1778   } else {
1779     mbs = a->mbs;
1780     ii  = a->i;
1781     z   = zarray;
1782   }
1783 
1784   for (i = 0; i < mbs; i++) {
1785     n     = ii[i + 1] - ii[i];
1786     idx   = ij + ii[i];
1787     sum1  = 0.0;
1788     sum2  = 0.0;
1789     sum3  = 0.0;
1790     sum4  = 0.0;
1791     sum5  = 0.0;
1792     sum6  = 0.0;
1793     sum7  = 0.0;
1794     sum8  = 0.0;
1795     sum9  = 0.0;
1796     sum10 = 0.0;
1797     sum11 = 0.0;
1798     sum12 = 0.0;
1799     sum13 = 0.0;
1800     sum14 = 0.0;
1801     sum15 = 0.0;
1802 
1803     for (j = 0; j < n; j++) {
1804       xb = x + 15 * (idx[j]);
1805       x1 = xb[0];
1806       x2 = xb[1];
1807       x3 = xb[2];
1808       x4 = xb[3];
1809       x5 = xb[4];
1810       x6 = xb[5];
1811       x7 = xb[6];
1812       x8 = xb[7];
1813 
1814       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1815       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1816       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1817       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1818       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1819       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1820       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1821       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1822       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1823       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1824       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1825       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1826       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1827       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1828       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1829       v += 120;
1830 
1831       x1 = xb[8];
1832       x2 = xb[9];
1833       x3 = xb[10];
1834       x4 = xb[11];
1835       x5 = xb[12];
1836       x6 = xb[13];
1837       x7 = xb[14];
1838 
1839       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1840       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1841       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1842       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1843       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1844       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1845       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1846       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1847       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1848       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1849       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1850       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1851       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1852       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1853       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1854       v += 105;
1855     }
1856     if (usecprow) z = zarray + 15 * ridx[i];
1857     z[0]  = sum1;
1858     z[1]  = sum2;
1859     z[2]  = sum3;
1860     z[3]  = sum4;
1861     z[4]  = sum5;
1862     z[5]  = sum6;
1863     z[6]  = sum7;
1864     z[7]  = sum8;
1865     z[8]  = sum9;
1866     z[9]  = sum10;
1867     z[10] = sum11;
1868     z[11] = sum12;
1869     z[12] = sum13;
1870     z[13] = sum14;
1871     z[14] = sum15;
1872 
1873     if (!usecprow) z += 15;
1874   }
1875 
1876   PetscCall(VecRestoreArrayRead(xx, &x));
1877   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1878   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1879   PetscFunctionReturn(0);
1880 }
1881 
1882 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1883 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) {
1884   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1885   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1886   const PetscScalar *x, *xb;
1887   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1888   const MatScalar   *v;
1889   const PetscInt    *ii, *ij = a->j, *idx;
1890   PetscInt           mbs, i, j, n, *ridx = NULL;
1891   PetscBool          usecprow = a->compressedrow.use;
1892 
1893   PetscFunctionBegin;
1894   PetscCall(VecGetArrayRead(xx, &x));
1895   PetscCall(VecGetArrayWrite(zz, &zarray));
1896 
1897   v = a->a;
1898   if (usecprow) {
1899     mbs  = a->compressedrow.nrows;
1900     ii   = a->compressedrow.i;
1901     ridx = a->compressedrow.rindex;
1902     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1903   } else {
1904     mbs = a->mbs;
1905     ii  = a->i;
1906     z   = zarray;
1907   }
1908 
1909   for (i = 0; i < mbs; i++) {
1910     n     = ii[i + 1] - ii[i];
1911     idx   = ij + ii[i];
1912     sum1  = 0.0;
1913     sum2  = 0.0;
1914     sum3  = 0.0;
1915     sum4  = 0.0;
1916     sum5  = 0.0;
1917     sum6  = 0.0;
1918     sum7  = 0.0;
1919     sum8  = 0.0;
1920     sum9  = 0.0;
1921     sum10 = 0.0;
1922     sum11 = 0.0;
1923     sum12 = 0.0;
1924     sum13 = 0.0;
1925     sum14 = 0.0;
1926     sum15 = 0.0;
1927 
1928     for (j = 0; j < n; j++) {
1929       xb  = x + 15 * (idx[j]);
1930       x1  = xb[0];
1931       x2  = xb[1];
1932       x3  = xb[2];
1933       x4  = xb[3];
1934       x5  = xb[4];
1935       x6  = xb[5];
1936       x7  = xb[6];
1937       x8  = xb[7];
1938       x9  = xb[8];
1939       x10 = xb[9];
1940       x11 = xb[10];
1941       x12 = xb[11];
1942       x13 = xb[12];
1943       x14 = xb[13];
1944       x15 = xb[14];
1945 
1946       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1947       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1948       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1949       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1950       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1951       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1952       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1953       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1954       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1955       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1956       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1957       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1958       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1959       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1960       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1961       v += 225;
1962     }
1963     if (usecprow) z = zarray + 15 * ridx[i];
1964     z[0]  = sum1;
1965     z[1]  = sum2;
1966     z[2]  = sum3;
1967     z[3]  = sum4;
1968     z[4]  = sum5;
1969     z[5]  = sum6;
1970     z[6]  = sum7;
1971     z[7]  = sum8;
1972     z[8]  = sum9;
1973     z[9]  = sum10;
1974     z[10] = sum11;
1975     z[11] = sum12;
1976     z[12] = sum13;
1977     z[13] = sum14;
1978     z[14] = sum15;
1979 
1980     if (!usecprow) z += 15;
1981   }
1982 
1983   PetscCall(VecRestoreArrayRead(xx, &x));
1984   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1985   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1986   PetscFunctionReturn(0);
1987 }
1988 
1989 /*
1990     This will not work with MatScalar == float because it calls the BLAS
1991 */
1992 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) {
1993   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1994   PetscScalar       *z = NULL, *work, *workt, *zarray;
1995   const PetscScalar *x, *xb;
1996   const MatScalar   *v;
1997   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1998   const PetscInt    *idx, *ii, *ridx = NULL;
1999   PetscInt           ncols, k;
2000   PetscBool          usecprow = a->compressedrow.use;
2001 
2002   PetscFunctionBegin;
2003   PetscCall(VecGetArrayRead(xx, &x));
2004   PetscCall(VecGetArrayWrite(zz, &zarray));
2005 
2006   idx = a->j;
2007   v   = a->a;
2008   if (usecprow) {
2009     mbs  = a->compressedrow.nrows;
2010     ii   = a->compressedrow.i;
2011     ridx = a->compressedrow.rindex;
2012     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2013   } else {
2014     mbs = a->mbs;
2015     ii  = a->i;
2016     z   = zarray;
2017   }
2018 
2019   if (!a->mult_work) {
2020     k = PetscMax(A->rmap->n, A->cmap->n);
2021     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2022   }
2023   work = a->mult_work;
2024   for (i = 0; i < mbs; i++) {
2025     n = ii[1] - ii[0];
2026     ii++;
2027     ncols = n * bs;
2028     workt = work;
2029     for (j = 0; j < n; j++) {
2030       xb = x + bs * (*idx++);
2031       for (k = 0; k < bs; k++) workt[k] = xb[k];
2032       workt += bs;
2033     }
2034     if (usecprow) z = zarray + bs * ridx[i];
2035     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2036     v += n * bs2;
2037     if (!usecprow) z += bs;
2038   }
2039   PetscCall(VecRestoreArrayRead(xx, &x));
2040   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2041   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2042   PetscFunctionReturn(0);
2043 }
2044 
2045 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) {
2046   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2047   const PetscScalar *x;
2048   PetscScalar       *y, *z, sum;
2049   const MatScalar   *v;
2050   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2051   const PetscInt    *idx, *ii;
2052   PetscBool          usecprow = a->compressedrow.use;
2053 
2054   PetscFunctionBegin;
2055   PetscCall(VecGetArrayRead(xx, &x));
2056   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
2057 
2058   idx = a->j;
2059   v   = a->a;
2060   if (usecprow) {
2061     if (zz != yy) { PetscCall(PetscArraycpy(z, y, mbs)); }
2062     mbs  = a->compressedrow.nrows;
2063     ii   = a->compressedrow.i;
2064     ridx = a->compressedrow.rindex;
2065   } else {
2066     ii = a->i;
2067   }
2068 
2069   for (i = 0; i < mbs; i++) {
2070     n = ii[1] - ii[0];
2071     ii++;
2072     if (!usecprow) {
2073       sum = y[i];
2074     } else {
2075       sum = y[ridx[i]];
2076     }
2077     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2078     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2079     PetscSparseDensePlusDot(sum, x, v, idx, n);
2080     v += n;
2081     idx += n;
2082     if (usecprow) {
2083       z[ridx[i]] = sum;
2084     } else {
2085       z[i] = sum;
2086     }
2087   }
2088   PetscCall(VecRestoreArrayRead(xx, &x));
2089   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2090   PetscCall(PetscLogFlops(2.0 * a->nz));
2091   PetscFunctionReturn(0);
2092 }
2093 
2094 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) {
2095   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2096   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2097   const PetscScalar *x, *xb;
2098   PetscScalar        x1, x2, *yarray, *zarray;
2099   const MatScalar   *v;
2100   PetscInt           mbs = a->mbs, i, n, j;
2101   const PetscInt    *idx, *ii, *ridx = NULL;
2102   PetscBool          usecprow = a->compressedrow.use;
2103 
2104   PetscFunctionBegin;
2105   PetscCall(VecGetArrayRead(xx, &x));
2106   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2107 
2108   idx = a->j;
2109   v   = a->a;
2110   if (usecprow) {
2111     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); }
2112     mbs  = a->compressedrow.nrows;
2113     ii   = a->compressedrow.i;
2114     ridx = a->compressedrow.rindex;
2115   } else {
2116     ii = a->i;
2117     y  = yarray;
2118     z  = zarray;
2119   }
2120 
2121   for (i = 0; i < mbs; i++) {
2122     n = ii[1] - ii[0];
2123     ii++;
2124     if (usecprow) {
2125       z = zarray + 2 * ridx[i];
2126       y = yarray + 2 * ridx[i];
2127     }
2128     sum1 = y[0];
2129     sum2 = y[1];
2130     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2131     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2132     for (j = 0; j < n; j++) {
2133       xb = x + 2 * (*idx++);
2134       x1 = xb[0];
2135       x2 = xb[1];
2136 
2137       sum1 += v[0] * x1 + v[2] * x2;
2138       sum2 += v[1] * x1 + v[3] * x2;
2139       v += 4;
2140     }
2141     z[0] = sum1;
2142     z[1] = sum2;
2143     if (!usecprow) {
2144       z += 2;
2145       y += 2;
2146     }
2147   }
2148   PetscCall(VecRestoreArrayRead(xx, &x));
2149   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2150   PetscCall(PetscLogFlops(4.0 * a->nz));
2151   PetscFunctionReturn(0);
2152 }
2153 
2154 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) {
2155   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2156   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2157   const PetscScalar *x, *xb;
2158   const MatScalar   *v;
2159   PetscInt           mbs = a->mbs, i, j, n;
2160   const PetscInt    *idx, *ii, *ridx = NULL;
2161   PetscBool          usecprow = a->compressedrow.use;
2162 
2163   PetscFunctionBegin;
2164   PetscCall(VecGetArrayRead(xx, &x));
2165   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2166 
2167   idx = a->j;
2168   v   = a->a;
2169   if (usecprow) {
2170     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); }
2171     mbs  = a->compressedrow.nrows;
2172     ii   = a->compressedrow.i;
2173     ridx = a->compressedrow.rindex;
2174   } else {
2175     ii = a->i;
2176     y  = yarray;
2177     z  = zarray;
2178   }
2179 
2180   for (i = 0; i < mbs; i++) {
2181     n = ii[1] - ii[0];
2182     ii++;
2183     if (usecprow) {
2184       z = zarray + 3 * ridx[i];
2185       y = yarray + 3 * ridx[i];
2186     }
2187     sum1 = y[0];
2188     sum2 = y[1];
2189     sum3 = y[2];
2190     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2191     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2192     for (j = 0; j < n; j++) {
2193       xb = x + 3 * (*idx++);
2194       x1 = xb[0];
2195       x2 = xb[1];
2196       x3 = xb[2];
2197       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2198       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2199       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2200       v += 9;
2201     }
2202     z[0] = sum1;
2203     z[1] = sum2;
2204     z[2] = sum3;
2205     if (!usecprow) {
2206       z += 3;
2207       y += 3;
2208     }
2209   }
2210   PetscCall(VecRestoreArrayRead(xx, &x));
2211   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2212   PetscCall(PetscLogFlops(18.0 * a->nz));
2213   PetscFunctionReturn(0);
2214 }
2215 
2216 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) {
2217   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2218   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2219   const PetscScalar *x, *xb;
2220   const MatScalar   *v;
2221   PetscInt           mbs = a->mbs, i, j, n;
2222   const PetscInt    *idx, *ii, *ridx = NULL;
2223   PetscBool          usecprow = a->compressedrow.use;
2224 
2225   PetscFunctionBegin;
2226   PetscCall(VecGetArrayRead(xx, &x));
2227   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2228 
2229   idx = a->j;
2230   v   = a->a;
2231   if (usecprow) {
2232     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); }
2233     mbs  = a->compressedrow.nrows;
2234     ii   = a->compressedrow.i;
2235     ridx = a->compressedrow.rindex;
2236   } else {
2237     ii = a->i;
2238     y  = yarray;
2239     z  = zarray;
2240   }
2241 
2242   for (i = 0; i < mbs; i++) {
2243     n = ii[1] - ii[0];
2244     ii++;
2245     if (usecprow) {
2246       z = zarray + 4 * ridx[i];
2247       y = yarray + 4 * ridx[i];
2248     }
2249     sum1 = y[0];
2250     sum2 = y[1];
2251     sum3 = y[2];
2252     sum4 = y[3];
2253     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2254     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2255     for (j = 0; j < n; j++) {
2256       xb = x + 4 * (*idx++);
2257       x1 = xb[0];
2258       x2 = xb[1];
2259       x3 = xb[2];
2260       x4 = xb[3];
2261       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2262       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2263       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2264       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2265       v += 16;
2266     }
2267     z[0] = sum1;
2268     z[1] = sum2;
2269     z[2] = sum3;
2270     z[3] = sum4;
2271     if (!usecprow) {
2272       z += 4;
2273       y += 4;
2274     }
2275   }
2276   PetscCall(VecRestoreArrayRead(xx, &x));
2277   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2278   PetscCall(PetscLogFlops(32.0 * a->nz));
2279   PetscFunctionReturn(0);
2280 }
2281 
2282 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) {
2283   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2284   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2285   const PetscScalar *x, *xb;
2286   PetscScalar       *yarray, *zarray;
2287   const MatScalar   *v;
2288   PetscInt           mbs = a->mbs, i, j, n;
2289   const PetscInt    *idx, *ii, *ridx = NULL;
2290   PetscBool          usecprow = a->compressedrow.use;
2291 
2292   PetscFunctionBegin;
2293   PetscCall(VecGetArrayRead(xx, &x));
2294   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2295 
2296   idx = a->j;
2297   v   = a->a;
2298   if (usecprow) {
2299     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); }
2300     mbs  = a->compressedrow.nrows;
2301     ii   = a->compressedrow.i;
2302     ridx = a->compressedrow.rindex;
2303   } else {
2304     ii = a->i;
2305     y  = yarray;
2306     z  = zarray;
2307   }
2308 
2309   for (i = 0; i < mbs; i++) {
2310     n = ii[1] - ii[0];
2311     ii++;
2312     if (usecprow) {
2313       z = zarray + 5 * ridx[i];
2314       y = yarray + 5 * ridx[i];
2315     }
2316     sum1 = y[0];
2317     sum2 = y[1];
2318     sum3 = y[2];
2319     sum4 = y[3];
2320     sum5 = y[4];
2321     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2322     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2323     for (j = 0; j < n; j++) {
2324       xb = x + 5 * (*idx++);
2325       x1 = xb[0];
2326       x2 = xb[1];
2327       x3 = xb[2];
2328       x4 = xb[3];
2329       x5 = xb[4];
2330       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2331       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2332       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2333       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2334       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2335       v += 25;
2336     }
2337     z[0] = sum1;
2338     z[1] = sum2;
2339     z[2] = sum3;
2340     z[3] = sum4;
2341     z[4] = sum5;
2342     if (!usecprow) {
2343       z += 5;
2344       y += 5;
2345     }
2346   }
2347   PetscCall(VecRestoreArrayRead(xx, &x));
2348   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2349   PetscCall(PetscLogFlops(50.0 * a->nz));
2350   PetscFunctionReturn(0);
2351 }
2352 
2353 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) {
2354   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2355   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2356   const PetscScalar *x, *xb;
2357   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2358   const MatScalar   *v;
2359   PetscInt           mbs = a->mbs, i, j, n;
2360   const PetscInt    *idx, *ii, *ridx = NULL;
2361   PetscBool          usecprow = a->compressedrow.use;
2362 
2363   PetscFunctionBegin;
2364   PetscCall(VecGetArrayRead(xx, &x));
2365   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2366 
2367   idx = a->j;
2368   v   = a->a;
2369   if (usecprow) {
2370     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); }
2371     mbs  = a->compressedrow.nrows;
2372     ii   = a->compressedrow.i;
2373     ridx = a->compressedrow.rindex;
2374   } else {
2375     ii = a->i;
2376     y  = yarray;
2377     z  = zarray;
2378   }
2379 
2380   for (i = 0; i < mbs; i++) {
2381     n = ii[1] - ii[0];
2382     ii++;
2383     if (usecprow) {
2384       z = zarray + 6 * ridx[i];
2385       y = yarray + 6 * ridx[i];
2386     }
2387     sum1 = y[0];
2388     sum2 = y[1];
2389     sum3 = y[2];
2390     sum4 = y[3];
2391     sum5 = y[4];
2392     sum6 = y[5];
2393     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2394     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2395     for (j = 0; j < n; j++) {
2396       xb = x + 6 * (*idx++);
2397       x1 = xb[0];
2398       x2 = xb[1];
2399       x3 = xb[2];
2400       x4 = xb[3];
2401       x5 = xb[4];
2402       x6 = xb[5];
2403       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2404       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2405       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2406       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2407       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2408       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2409       v += 36;
2410     }
2411     z[0] = sum1;
2412     z[1] = sum2;
2413     z[2] = sum3;
2414     z[3] = sum4;
2415     z[4] = sum5;
2416     z[5] = sum6;
2417     if (!usecprow) {
2418       z += 6;
2419       y += 6;
2420     }
2421   }
2422   PetscCall(VecRestoreArrayRead(xx, &x));
2423   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2424   PetscCall(PetscLogFlops(72.0 * a->nz));
2425   PetscFunctionReturn(0);
2426 }
2427 
2428 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) {
2429   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2430   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2431   const PetscScalar *x, *xb;
2432   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2433   const MatScalar   *v;
2434   PetscInt           mbs = a->mbs, i, j, n;
2435   const PetscInt    *idx, *ii, *ridx = NULL;
2436   PetscBool          usecprow = a->compressedrow.use;
2437 
2438   PetscFunctionBegin;
2439   PetscCall(VecGetArrayRead(xx, &x));
2440   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2441 
2442   idx = a->j;
2443   v   = a->a;
2444   if (usecprow) {
2445     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); }
2446     mbs  = a->compressedrow.nrows;
2447     ii   = a->compressedrow.i;
2448     ridx = a->compressedrow.rindex;
2449   } else {
2450     ii = a->i;
2451     y  = yarray;
2452     z  = zarray;
2453   }
2454 
2455   for (i = 0; i < mbs; i++) {
2456     n = ii[1] - ii[0];
2457     ii++;
2458     if (usecprow) {
2459       z = zarray + 7 * ridx[i];
2460       y = yarray + 7 * ridx[i];
2461     }
2462     sum1 = y[0];
2463     sum2 = y[1];
2464     sum3 = y[2];
2465     sum4 = y[3];
2466     sum5 = y[4];
2467     sum6 = y[5];
2468     sum7 = y[6];
2469     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2470     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2471     for (j = 0; j < n; j++) {
2472       xb = x + 7 * (*idx++);
2473       x1 = xb[0];
2474       x2 = xb[1];
2475       x3 = xb[2];
2476       x4 = xb[3];
2477       x5 = xb[4];
2478       x6 = xb[5];
2479       x7 = xb[6];
2480       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2481       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2482       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2483       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2484       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2485       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2486       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2487       v += 49;
2488     }
2489     z[0] = sum1;
2490     z[1] = sum2;
2491     z[2] = sum3;
2492     z[3] = sum4;
2493     z[4] = sum5;
2494     z[5] = sum6;
2495     z[6] = sum7;
2496     if (!usecprow) {
2497       z += 7;
2498       y += 7;
2499     }
2500   }
2501   PetscCall(VecRestoreArrayRead(xx, &x));
2502   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2503   PetscCall(PetscLogFlops(98.0 * a->nz));
2504   PetscFunctionReturn(0);
2505 }
2506 
2507 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2508 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) {
2509   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2510   PetscScalar       *z = NULL, *work, *workt, *zarray;
2511   const PetscScalar *x, *xb;
2512   const MatScalar   *v;
2513   PetscInt           mbs, i, j, n;
2514   PetscInt           k;
2515   PetscBool          usecprow = a->compressedrow.use;
2516   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2517 
2518   __m256d a0, a1, a2, a3, a4, a5;
2519   __m256d w0, w1, w2, w3;
2520   __m256d z0, z1, z2;
2521   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2522 
2523   PetscFunctionBegin;
2524   PetscCall(VecCopy(yy, zz));
2525   PetscCall(VecGetArrayRead(xx, &x));
2526   PetscCall(VecGetArray(zz, &zarray));
2527 
2528   idx = a->j;
2529   v   = a->a;
2530   if (usecprow) {
2531     mbs  = a->compressedrow.nrows;
2532     ii   = a->compressedrow.i;
2533     ridx = a->compressedrow.rindex;
2534   } else {
2535     mbs = a->mbs;
2536     ii  = a->i;
2537     z   = zarray;
2538   }
2539 
2540   if (!a->mult_work) {
2541     k = PetscMax(A->rmap->n, A->cmap->n);
2542     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2543   }
2544 
2545   work = a->mult_work;
2546   for (i = 0; i < mbs; i++) {
2547     n = ii[1] - ii[0];
2548     ii++;
2549     workt = work;
2550     for (j = 0; j < n; j++) {
2551       xb = x + bs * (*idx++);
2552       for (k = 0; k < bs; k++) workt[k] = xb[k];
2553       workt += bs;
2554     }
2555     if (usecprow) z = zarray + bs * ridx[i];
2556 
2557     z0 = _mm256_loadu_pd(&z[0]);
2558     z1 = _mm256_loadu_pd(&z[4]);
2559     z2 = _mm256_set1_pd(z[8]);
2560 
2561     for (j = 0; j < n; j++) {
2562       /* first column of a */
2563       w0 = _mm256_set1_pd(work[j * 9]);
2564       a0 = _mm256_loadu_pd(&v[j * 81]);
2565       z0 = _mm256_fmadd_pd(a0, w0, z0);
2566       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2567       z1 = _mm256_fmadd_pd(a1, w0, z1);
2568       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2569       z2 = _mm256_fmadd_pd(a2, w0, z2);
2570 
2571       /* second column of a */
2572       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2573       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2574       z0 = _mm256_fmadd_pd(a0, w1, z0);
2575       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2576       z1 = _mm256_fmadd_pd(a1, w1, z1);
2577       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2578       z2 = _mm256_fmadd_pd(a2, w1, z2);
2579 
2580       /* third column of a */
2581       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2582       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2583       z0 = _mm256_fmadd_pd(a3, w2, z0);
2584       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2585       z1 = _mm256_fmadd_pd(a4, w2, z1);
2586       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2587       z2 = _mm256_fmadd_pd(a5, w2, z2);
2588 
2589       /* fourth column of a */
2590       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2591       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2592       z0 = _mm256_fmadd_pd(a0, w3, z0);
2593       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2594       z1 = _mm256_fmadd_pd(a1, w3, z1);
2595       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2596       z2 = _mm256_fmadd_pd(a2, w3, z2);
2597 
2598       /* fifth column of a */
2599       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2600       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2601       z0 = _mm256_fmadd_pd(a3, w0, z0);
2602       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2603       z1 = _mm256_fmadd_pd(a4, w0, z1);
2604       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2605       z2 = _mm256_fmadd_pd(a5, w0, z2);
2606 
2607       /* sixth column of a */
2608       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2609       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2610       z0 = _mm256_fmadd_pd(a0, w1, z0);
2611       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2612       z1 = _mm256_fmadd_pd(a1, w1, z1);
2613       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2614       z2 = _mm256_fmadd_pd(a2, w1, z2);
2615 
2616       /* seventh column of a */
2617       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2618       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2619       z0 = _mm256_fmadd_pd(a0, w2, z0);
2620       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2621       z1 = _mm256_fmadd_pd(a1, w2, z1);
2622       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2623       z2 = _mm256_fmadd_pd(a2, w2, z2);
2624 
2625       /* eighth column of a */
2626       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2627       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2628       z0 = _mm256_fmadd_pd(a3, w3, z0);
2629       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2630       z1 = _mm256_fmadd_pd(a4, w3, z1);
2631       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2632       z2 = _mm256_fmadd_pd(a5, w3, z2);
2633 
2634       /* ninth column of a */
2635       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2636       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2637       z0 = _mm256_fmadd_pd(a0, w0, z0);
2638       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2639       z1 = _mm256_fmadd_pd(a1, w0, z1);
2640       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2641       z2 = _mm256_fmadd_pd(a2, w0, z2);
2642     }
2643 
2644     _mm256_storeu_pd(&z[0], z0);
2645     _mm256_storeu_pd(&z[4], z1);
2646     _mm256_maskstore_pd(&z[8], mask1, z2);
2647 
2648     v += n * bs2;
2649     if (!usecprow) z += bs;
2650   }
2651   PetscCall(VecRestoreArrayRead(xx, &x));
2652   PetscCall(VecRestoreArray(zz, &zarray));
2653   PetscCall(PetscLogFlops(162.0 * a->nz));
2654   PetscFunctionReturn(0);
2655 }
2656 #endif
2657 
2658 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) {
2659   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2660   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2661   const PetscScalar *x, *xb;
2662   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2663   const MatScalar   *v;
2664   PetscInt           mbs = a->mbs, i, j, n;
2665   const PetscInt    *idx, *ii, *ridx = NULL;
2666   PetscBool          usecprow = a->compressedrow.use;
2667 
2668   PetscFunctionBegin;
2669   PetscCall(VecGetArrayRead(xx, &x));
2670   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2671 
2672   idx = a->j;
2673   v   = a->a;
2674   if (usecprow) {
2675     if (zz != yy) { PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); }
2676     mbs  = a->compressedrow.nrows;
2677     ii   = a->compressedrow.i;
2678     ridx = a->compressedrow.rindex;
2679   } else {
2680     ii = a->i;
2681     y  = yarray;
2682     z  = zarray;
2683   }
2684 
2685   for (i = 0; i < mbs; i++) {
2686     n = ii[1] - ii[0];
2687     ii++;
2688     if (usecprow) {
2689       z = zarray + 11 * ridx[i];
2690       y = yarray + 11 * ridx[i];
2691     }
2692     sum1  = y[0];
2693     sum2  = y[1];
2694     sum3  = y[2];
2695     sum4  = y[3];
2696     sum5  = y[4];
2697     sum6  = y[5];
2698     sum7  = y[6];
2699     sum8  = y[7];
2700     sum9  = y[8];
2701     sum10 = y[9];
2702     sum11 = y[10];
2703     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2704     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2705     for (j = 0; j < n; j++) {
2706       xb  = x + 11 * (*idx++);
2707       x1  = xb[0];
2708       x2  = xb[1];
2709       x3  = xb[2];
2710       x4  = xb[3];
2711       x5  = xb[4];
2712       x6  = xb[5];
2713       x7  = xb[6];
2714       x8  = xb[7];
2715       x9  = xb[8];
2716       x10 = xb[9];
2717       x11 = xb[10];
2718       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2719       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2720       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2721       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2722       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2723       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2724       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2725       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2726       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2727       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2728       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2729       v += 121;
2730     }
2731     z[0]  = sum1;
2732     z[1]  = sum2;
2733     z[2]  = sum3;
2734     z[3]  = sum4;
2735     z[4]  = sum5;
2736     z[5]  = sum6;
2737     z[6]  = sum7;
2738     z[7]  = sum8;
2739     z[8]  = sum9;
2740     z[9]  = sum10;
2741     z[10] = sum11;
2742     if (!usecprow) {
2743       z += 11;
2744       y += 11;
2745     }
2746   }
2747   PetscCall(VecRestoreArrayRead(xx, &x));
2748   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2749   PetscCall(PetscLogFlops(242.0 * a->nz));
2750   PetscFunctionReturn(0);
2751 }
2752 
2753 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) {
2754   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2755   PetscScalar       *z = NULL, *work, *workt, *zarray;
2756   const PetscScalar *x, *xb;
2757   const MatScalar   *v;
2758   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2759   PetscInt           ncols, k;
2760   const PetscInt    *ridx     = NULL, *idx, *ii;
2761   PetscBool          usecprow = a->compressedrow.use;
2762 
2763   PetscFunctionBegin;
2764   PetscCall(VecCopy(yy, zz));
2765   PetscCall(VecGetArrayRead(xx, &x));
2766   PetscCall(VecGetArray(zz, &zarray));
2767 
2768   idx = a->j;
2769   v   = a->a;
2770   if (usecprow) {
2771     mbs  = a->compressedrow.nrows;
2772     ii   = a->compressedrow.i;
2773     ridx = a->compressedrow.rindex;
2774   } else {
2775     mbs = a->mbs;
2776     ii  = a->i;
2777     z   = zarray;
2778   }
2779 
2780   if (!a->mult_work) {
2781     k = PetscMax(A->rmap->n, A->cmap->n);
2782     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2783   }
2784   work = a->mult_work;
2785   for (i = 0; i < mbs; i++) {
2786     n = ii[1] - ii[0];
2787     ii++;
2788     ncols = n * bs;
2789     workt = work;
2790     for (j = 0; j < n; j++) {
2791       xb = x + bs * (*idx++);
2792       for (k = 0; k < bs; k++) workt[k] = xb[k];
2793       workt += bs;
2794     }
2795     if (usecprow) z = zarray + bs * ridx[i];
2796     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2797     v += n * bs2;
2798     if (!usecprow) z += bs;
2799   }
2800   PetscCall(VecRestoreArrayRead(xx, &x));
2801   PetscCall(VecRestoreArray(zz, &zarray));
2802   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2803   PetscFunctionReturn(0);
2804 }
2805 
2806 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) {
2807   PetscScalar zero = 0.0;
2808 
2809   PetscFunctionBegin;
2810   PetscCall(VecSet(zz, zero));
2811   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2812   PetscFunctionReturn(0);
2813 }
2814 
2815 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) {
2816   PetscScalar zero = 0.0;
2817 
2818   PetscFunctionBegin;
2819   PetscCall(VecSet(zz, zero));
2820   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2821   PetscFunctionReturn(0);
2822 }
2823 
2824 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) {
2825   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2826   PetscScalar       *z, x1, x2, x3, x4, x5;
2827   const PetscScalar *x, *xb = NULL;
2828   const MatScalar   *v;
2829   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2830   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2831   Mat_CompressedRow  cprow    = a->compressedrow;
2832   PetscBool          usecprow = cprow.use;
2833 
2834   PetscFunctionBegin;
2835   if (yy != zz) PetscCall(VecCopy(yy, zz));
2836   PetscCall(VecGetArrayRead(xx, &x));
2837   PetscCall(VecGetArray(zz, &z));
2838 
2839   idx = a->j;
2840   v   = a->a;
2841   if (usecprow) {
2842     mbs  = cprow.nrows;
2843     ii   = cprow.i;
2844     ridx = cprow.rindex;
2845   } else {
2846     mbs = a->mbs;
2847     ii  = a->i;
2848     xb  = x;
2849   }
2850 
2851   switch (bs) {
2852   case 1:
2853     for (i = 0; i < mbs; i++) {
2854       if (usecprow) xb = x + ridx[i];
2855       x1 = xb[0];
2856       ib = idx + ii[0];
2857       n  = ii[1] - ii[0];
2858       ii++;
2859       for (j = 0; j < n; j++) {
2860         rval = ib[j];
2861         z[rval] += PetscConj(*v) * x1;
2862         v++;
2863       }
2864       if (!usecprow) xb++;
2865     }
2866     break;
2867   case 2:
2868     for (i = 0; i < mbs; i++) {
2869       if (usecprow) xb = x + 2 * ridx[i];
2870       x1 = xb[0];
2871       x2 = xb[1];
2872       ib = idx + ii[0];
2873       n  = ii[1] - ii[0];
2874       ii++;
2875       for (j = 0; j < n; j++) {
2876         rval = ib[j] * 2;
2877         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2878         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2879         v += 4;
2880       }
2881       if (!usecprow) xb += 2;
2882     }
2883     break;
2884   case 3:
2885     for (i = 0; i < mbs; i++) {
2886       if (usecprow) xb = x + 3 * ridx[i];
2887       x1 = xb[0];
2888       x2 = xb[1];
2889       x3 = xb[2];
2890       ib = idx + ii[0];
2891       n  = ii[1] - ii[0];
2892       ii++;
2893       for (j = 0; j < n; j++) {
2894         rval = ib[j] * 3;
2895         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2896         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2897         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2898         v += 9;
2899       }
2900       if (!usecprow) xb += 3;
2901     }
2902     break;
2903   case 4:
2904     for (i = 0; i < mbs; i++) {
2905       if (usecprow) xb = x + 4 * ridx[i];
2906       x1 = xb[0];
2907       x2 = xb[1];
2908       x3 = xb[2];
2909       x4 = xb[3];
2910       ib = idx + ii[0];
2911       n  = ii[1] - ii[0];
2912       ii++;
2913       for (j = 0; j < n; j++) {
2914         rval = ib[j] * 4;
2915         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2916         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2917         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2918         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2919         v += 16;
2920       }
2921       if (!usecprow) xb += 4;
2922     }
2923     break;
2924   case 5:
2925     for (i = 0; i < mbs; i++) {
2926       if (usecprow) xb = x + 5 * ridx[i];
2927       x1 = xb[0];
2928       x2 = xb[1];
2929       x3 = xb[2];
2930       x4 = xb[3];
2931       x5 = xb[4];
2932       ib = idx + ii[0];
2933       n  = ii[1] - ii[0];
2934       ii++;
2935       for (j = 0; j < n; j++) {
2936         rval = ib[j] * 5;
2937         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2938         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2939         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2940         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2941         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2942         v += 25;
2943       }
2944       if (!usecprow) xb += 5;
2945     }
2946     break;
2947   default: /* block sizes larger than 5 by 5 are handled by BLAS */ SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2948 #if 0
2949     {
2950       PetscInt          ncols,k,bs2=a->bs2;
2951       PetscScalar       *work,*workt,zb;
2952       const PetscScalar *xtmp;
2953       if (!a->mult_work) {
2954         k    = PetscMax(A->rmap->n,A->cmap->n);
2955         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2956       }
2957       work = a->mult_work;
2958       xtmp = x;
2959       for (i=0; i<mbs; i++) {
2960         n     = ii[1] - ii[0]; ii++;
2961         ncols = n*bs;
2962         PetscCall(PetscArrayzero(work,ncols));
2963         if (usecprow) xtmp = x + bs*ridx[i];
2964         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2965         v += n*bs2;
2966         if (!usecprow) xtmp += bs;
2967         workt = work;
2968         for (j=0; j<n; j++) {
2969           zb = z + bs*(*idx++);
2970           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2971           workt += bs;
2972         }
2973       }
2974     }
2975 #endif
2976   }
2977   PetscCall(VecRestoreArrayRead(xx, &x));
2978   PetscCall(VecRestoreArray(zz, &z));
2979   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2980   PetscFunctionReturn(0);
2981 }
2982 
2983 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) {
2984   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2985   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
2986   const PetscScalar *x, *xb = NULL;
2987   const MatScalar   *v;
2988   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2989   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2990   Mat_CompressedRow  cprow    = a->compressedrow;
2991   PetscBool          usecprow = cprow.use;
2992 
2993   PetscFunctionBegin;
2994   if (yy != zz) PetscCall(VecCopy(yy, zz));
2995   PetscCall(VecGetArrayRead(xx, &x));
2996   PetscCall(VecGetArray(zz, &z));
2997 
2998   idx = a->j;
2999   v   = a->a;
3000   if (usecprow) {
3001     mbs  = cprow.nrows;
3002     ii   = cprow.i;
3003     ridx = cprow.rindex;
3004   } else {
3005     mbs = a->mbs;
3006     ii  = a->i;
3007     xb  = x;
3008   }
3009 
3010   switch (bs) {
3011   case 1:
3012     for (i = 0; i < mbs; i++) {
3013       if (usecprow) xb = x + ridx[i];
3014       x1 = xb[0];
3015       ib = idx + ii[0];
3016       n  = ii[1] - ii[0];
3017       ii++;
3018       for (j = 0; j < n; j++) {
3019         rval = ib[j];
3020         z[rval] += *v * x1;
3021         v++;
3022       }
3023       if (!usecprow) xb++;
3024     }
3025     break;
3026   case 2:
3027     for (i = 0; i < mbs; i++) {
3028       if (usecprow) xb = x + 2 * ridx[i];
3029       x1 = xb[0];
3030       x2 = xb[1];
3031       ib = idx + ii[0];
3032       n  = ii[1] - ii[0];
3033       ii++;
3034       for (j = 0; j < n; j++) {
3035         rval = ib[j] * 2;
3036         z[rval++] += v[0] * x1 + v[1] * x2;
3037         z[rval++] += v[2] * x1 + v[3] * x2;
3038         v += 4;
3039       }
3040       if (!usecprow) xb += 2;
3041     }
3042     break;
3043   case 3:
3044     for (i = 0; i < mbs; i++) {
3045       if (usecprow) xb = x + 3 * ridx[i];
3046       x1 = xb[0];
3047       x2 = xb[1];
3048       x3 = xb[2];
3049       ib = idx + ii[0];
3050       n  = ii[1] - ii[0];
3051       ii++;
3052       for (j = 0; j < n; j++) {
3053         rval = ib[j] * 3;
3054         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3055         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3056         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3057         v += 9;
3058       }
3059       if (!usecprow) xb += 3;
3060     }
3061     break;
3062   case 4:
3063     for (i = 0; i < mbs; i++) {
3064       if (usecprow) xb = x + 4 * ridx[i];
3065       x1 = xb[0];
3066       x2 = xb[1];
3067       x3 = xb[2];
3068       x4 = xb[3];
3069       ib = idx + ii[0];
3070       n  = ii[1] - ii[0];
3071       ii++;
3072       for (j = 0; j < n; j++) {
3073         rval = ib[j] * 4;
3074         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3075         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3076         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3077         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3078         v += 16;
3079       }
3080       if (!usecprow) xb += 4;
3081     }
3082     break;
3083   case 5:
3084     for (i = 0; i < mbs; i++) {
3085       if (usecprow) xb = x + 5 * ridx[i];
3086       x1 = xb[0];
3087       x2 = xb[1];
3088       x3 = xb[2];
3089       x4 = xb[3];
3090       x5 = xb[4];
3091       ib = idx + ii[0];
3092       n  = ii[1] - ii[0];
3093       ii++;
3094       for (j = 0; j < n; j++) {
3095         rval = ib[j] * 5;
3096         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3097         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3098         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3099         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3100         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3101         v += 25;
3102       }
3103       if (!usecprow) xb += 5;
3104     }
3105     break;
3106   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3107     PetscInt           ncols, k;
3108     PetscScalar       *work, *workt;
3109     const PetscScalar *xtmp;
3110     if (!a->mult_work) {
3111       k = PetscMax(A->rmap->n, A->cmap->n);
3112       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3113     }
3114     work = a->mult_work;
3115     xtmp = x;
3116     for (i = 0; i < mbs; i++) {
3117       n = ii[1] - ii[0];
3118       ii++;
3119       ncols = n * bs;
3120       PetscCall(PetscArrayzero(work, ncols));
3121       if (usecprow) xtmp = x + bs * ridx[i];
3122       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3123       v += n * bs2;
3124       if (!usecprow) xtmp += bs;
3125       workt = work;
3126       for (j = 0; j < n; j++) {
3127         zb = z + bs * (*idx++);
3128         for (k = 0; k < bs; k++) zb[k] += workt[k];
3129         workt += bs;
3130       }
3131     }
3132   }
3133   }
3134   PetscCall(VecRestoreArrayRead(xx, &x));
3135   PetscCall(VecRestoreArray(zz, &z));
3136   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3137   PetscFunctionReturn(0);
3138 }
3139 
3140 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) {
3141   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3142   PetscInt     totalnz = a->bs2 * a->nz;
3143   PetscScalar  oalpha  = alpha;
3144   PetscBLASInt one     = 1, tnz;
3145 
3146   PetscFunctionBegin;
3147   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3148   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3149   PetscCall(PetscLogFlops(totalnz));
3150   PetscFunctionReturn(0);
3151 }
3152 
3153 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) {
3154   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3155   MatScalar   *v   = a->a;
3156   PetscReal    sum = 0.0;
3157   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3158 
3159   PetscFunctionBegin;
3160   if (type == NORM_FROBENIUS) {
3161 #if defined(PETSC_USE_REAL___FP16)
3162     PetscBLASInt one = 1, cnt = bs2 * nz;
3163     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3164 #else
3165     for (i = 0; i < bs2 * nz; i++) {
3166       sum += PetscRealPart(PetscConj(*v) * (*v));
3167       v++;
3168     }
3169 #endif
3170     *norm = PetscSqrtReal(sum);
3171     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3172   } else if (type == NORM_1) { /* maximum column sum */
3173     PetscReal *tmp;
3174     PetscInt  *bcol = a->j;
3175     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3176     for (i = 0; i < nz; i++) {
3177       for (j = 0; j < bs; j++) {
3178         k1 = bs * (*bcol) + j; /* column index */
3179         for (k = 0; k < bs; k++) {
3180           tmp[k1] += PetscAbsScalar(*v);
3181           v++;
3182         }
3183       }
3184       bcol++;
3185     }
3186     *norm = 0.0;
3187     for (j = 0; j < A->cmap->n; j++) {
3188       if (tmp[j] > *norm) *norm = tmp[j];
3189     }
3190     PetscCall(PetscFree(tmp));
3191     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3192   } else if (type == NORM_INFINITY) { /* maximum row sum */
3193     *norm = 0.0;
3194     for (k = 0; k < bs; k++) {
3195       for (j = 0; j < a->mbs; j++) {
3196         v   = a->a + bs2 * a->i[j] + k;
3197         sum = 0.0;
3198         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3199           for (k1 = 0; k1 < bs; k1++) {
3200             sum += PetscAbsScalar(*v);
3201             v += bs;
3202           }
3203         }
3204         if (sum > *norm) *norm = sum;
3205       }
3206     }
3207     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3208   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3209   PetscFunctionReturn(0);
3210 }
3211 
3212 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg) {
3213   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
3214 
3215   PetscFunctionBegin;
3216   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3217   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3218     *flg = PETSC_FALSE;
3219     PetscFunctionReturn(0);
3220   }
3221 
3222   /* if the a->i are the same */
3223   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3224   if (!*flg) PetscFunctionReturn(0);
3225 
3226   /* if a->j are the same */
3227   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3228   if (!*flg) PetscFunctionReturn(0);
3229 
3230   /* if a->a are the same */
3231   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3232   PetscFunctionReturn(0);
3233 }
3234 
3235 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) {
3236   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3237   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3238   PetscScalar *x, zero = 0.0;
3239   MatScalar   *aa, *aa_j;
3240 
3241   PetscFunctionBegin;
3242   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3243   bs   = A->rmap->bs;
3244   aa   = a->a;
3245   ai   = a->i;
3246   aj   = a->j;
3247   ambs = a->mbs;
3248   bs2  = a->bs2;
3249 
3250   PetscCall(VecSet(v, zero));
3251   PetscCall(VecGetArray(v, &x));
3252   PetscCall(VecGetLocalSize(v, &n));
3253   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3254   for (i = 0; i < ambs; i++) {
3255     for (j = ai[i]; j < ai[i + 1]; j++) {
3256       if (aj[j] == i) {
3257         row  = i * bs;
3258         aa_j = aa + j * bs2;
3259         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3260         break;
3261       }
3262     }
3263   }
3264   PetscCall(VecRestoreArray(v, &x));
3265   PetscFunctionReturn(0);
3266 }
3267 
3268 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) {
3269   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3270   const PetscScalar *l, *r, *li, *ri;
3271   PetscScalar        x;
3272   MatScalar         *aa, *v;
3273   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3274   const PetscInt    *ai, *aj;
3275 
3276   PetscFunctionBegin;
3277   ai  = a->i;
3278   aj  = a->j;
3279   aa  = a->a;
3280   m   = A->rmap->n;
3281   n   = A->cmap->n;
3282   bs  = A->rmap->bs;
3283   mbs = a->mbs;
3284   bs2 = a->bs2;
3285   if (ll) {
3286     PetscCall(VecGetArrayRead(ll, &l));
3287     PetscCall(VecGetLocalSize(ll, &lm));
3288     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3289     for (i = 0; i < mbs; i++) { /* for each block row */
3290       M  = ai[i + 1] - ai[i];
3291       li = l + i * bs;
3292       v  = aa + bs2 * ai[i];
3293       for (j = 0; j < M; j++) { /* for each block */
3294         for (k = 0; k < bs2; k++) { (*v++) *= li[k % bs]; }
3295       }
3296     }
3297     PetscCall(VecRestoreArrayRead(ll, &l));
3298     PetscCall(PetscLogFlops(a->nz));
3299   }
3300 
3301   if (rr) {
3302     PetscCall(VecGetArrayRead(rr, &r));
3303     PetscCall(VecGetLocalSize(rr, &rn));
3304     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3305     for (i = 0; i < mbs; i++) { /* for each block row */
3306       iai = ai[i];
3307       M   = ai[i + 1] - iai;
3308       v   = aa + bs2 * iai;
3309       for (j = 0; j < M; j++) { /* for each block */
3310         ri = r + bs * aj[iai + j];
3311         for (k = 0; k < bs; k++) {
3312           x = ri[k];
3313           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3314           v += bs;
3315         }
3316       }
3317     }
3318     PetscCall(VecRestoreArrayRead(rr, &r));
3319     PetscCall(PetscLogFlops(a->nz));
3320   }
3321   PetscFunctionReturn(0);
3322 }
3323 
3324 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) {
3325   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3326 
3327   PetscFunctionBegin;
3328   info->block_size   = a->bs2;
3329   info->nz_allocated = a->bs2 * a->maxnz;
3330   info->nz_used      = a->bs2 * a->nz;
3331   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3332   info->assemblies   = A->num_ass;
3333   info->mallocs      = A->info.mallocs;
3334   info->memory       = ((PetscObject)A)->mem;
3335   if (A->factortype) {
3336     info->fill_ratio_given  = A->info.fill_ratio_given;
3337     info->fill_ratio_needed = A->info.fill_ratio_needed;
3338     info->factor_mallocs    = A->info.factor_mallocs;
3339   } else {
3340     info->fill_ratio_given  = 0;
3341     info->fill_ratio_needed = 0;
3342     info->factor_mallocs    = 0;
3343   }
3344   PetscFunctionReturn(0);
3345 }
3346 
3347 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) {
3348   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3349 
3350   PetscFunctionBegin;
3351   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3352   PetscFunctionReturn(0);
3353 }
3354 
3355 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) {
3356   PetscFunctionBegin;
3357   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3358   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3359   PetscFunctionReturn(0);
3360 }
3361 
3362 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
3363   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3364   PetscScalar       *z = NULL, sum1;
3365   const PetscScalar *xb;
3366   PetscScalar        x1;
3367   const MatScalar   *v, *vv;
3368   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3369   PetscBool          usecprow = a->compressedrow.use;
3370 
3371   PetscFunctionBegin;
3372   idx = a->j;
3373   v   = a->a;
3374   if (usecprow) {
3375     mbs  = a->compressedrow.nrows;
3376     ii   = a->compressedrow.i;
3377     ridx = a->compressedrow.rindex;
3378   } else {
3379     mbs = a->mbs;
3380     ii  = a->i;
3381     z   = c;
3382   }
3383 
3384   for (i = 0; i < mbs; i++) {
3385     n = ii[1] - ii[0];
3386     ii++;
3387     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3388     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3389     if (usecprow) z = c + ridx[i];
3390     jj = idx;
3391     vv = v;
3392     for (k = 0; k < cn; k++) {
3393       idx  = jj;
3394       v    = vv;
3395       sum1 = 0.0;
3396       for (j = 0; j < n; j++) {
3397         xb = b + (*idx++);
3398         x1 = xb[0 + k * bm];
3399         sum1 += v[0] * x1;
3400         v += 1;
3401       }
3402       z[0 + k * cm] = sum1;
3403     }
3404     if (!usecprow) z += 1;
3405   }
3406   PetscFunctionReturn(0);
3407 }
3408 
3409 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
3410   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3411   PetscScalar       *z = NULL, sum1, sum2;
3412   const PetscScalar *xb;
3413   PetscScalar        x1, x2;
3414   const MatScalar   *v, *vv;
3415   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3416   PetscBool          usecprow = a->compressedrow.use;
3417 
3418   PetscFunctionBegin;
3419   idx = a->j;
3420   v   = a->a;
3421   if (usecprow) {
3422     mbs  = a->compressedrow.nrows;
3423     ii   = a->compressedrow.i;
3424     ridx = a->compressedrow.rindex;
3425   } else {
3426     mbs = a->mbs;
3427     ii  = a->i;
3428     z   = c;
3429   }
3430 
3431   for (i = 0; i < mbs; i++) {
3432     n = ii[1] - ii[0];
3433     ii++;
3434     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3435     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3436     if (usecprow) z = c + 2 * ridx[i];
3437     jj = idx;
3438     vv = v;
3439     for (k = 0; k < cn; k++) {
3440       idx  = jj;
3441       v    = vv;
3442       sum1 = 0.0;
3443       sum2 = 0.0;
3444       for (j = 0; j < n; j++) {
3445         xb = b + 2 * (*idx++);
3446         x1 = xb[0 + k * bm];
3447         x2 = xb[1 + k * bm];
3448         sum1 += v[0] * x1 + v[2] * x2;
3449         sum2 += v[1] * x1 + v[3] * x2;
3450         v += 4;
3451       }
3452       z[0 + k * cm] = sum1;
3453       z[1 + k * cm] = sum2;
3454     }
3455     if (!usecprow) z += 2;
3456   }
3457   PetscFunctionReturn(0);
3458 }
3459 
3460 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
3461   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3462   PetscScalar       *z = NULL, sum1, sum2, sum3;
3463   const PetscScalar *xb;
3464   PetscScalar        x1, x2, x3;
3465   const MatScalar   *v, *vv;
3466   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3467   PetscBool          usecprow = a->compressedrow.use;
3468 
3469   PetscFunctionBegin;
3470   idx = a->j;
3471   v   = a->a;
3472   if (usecprow) {
3473     mbs  = a->compressedrow.nrows;
3474     ii   = a->compressedrow.i;
3475     ridx = a->compressedrow.rindex;
3476   } else {
3477     mbs = a->mbs;
3478     ii  = a->i;
3479     z   = c;
3480   }
3481 
3482   for (i = 0; i < mbs; i++) {
3483     n = ii[1] - ii[0];
3484     ii++;
3485     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3486     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3487     if (usecprow) z = c + 3 * ridx[i];
3488     jj = idx;
3489     vv = v;
3490     for (k = 0; k < cn; k++) {
3491       idx  = jj;
3492       v    = vv;
3493       sum1 = 0.0;
3494       sum2 = 0.0;
3495       sum3 = 0.0;
3496       for (j = 0; j < n; j++) {
3497         xb = b + 3 * (*idx++);
3498         x1 = xb[0 + k * bm];
3499         x2 = xb[1 + k * bm];
3500         x3 = xb[2 + k * bm];
3501         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3502         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3503         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3504         v += 9;
3505       }
3506       z[0 + k * cm] = sum1;
3507       z[1 + k * cm] = sum2;
3508       z[2 + k * cm] = sum3;
3509     }
3510     if (!usecprow) z += 3;
3511   }
3512   PetscFunctionReturn(0);
3513 }
3514 
3515 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
3516   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3517   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3518   const PetscScalar *xb;
3519   PetscScalar        x1, x2, x3, x4;
3520   const MatScalar   *v, *vv;
3521   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3522   PetscBool          usecprow = a->compressedrow.use;
3523 
3524   PetscFunctionBegin;
3525   idx = a->j;
3526   v   = a->a;
3527   if (usecprow) {
3528     mbs  = a->compressedrow.nrows;
3529     ii   = a->compressedrow.i;
3530     ridx = a->compressedrow.rindex;
3531   } else {
3532     mbs = a->mbs;
3533     ii  = a->i;
3534     z   = c;
3535   }
3536 
3537   for (i = 0; i < mbs; i++) {
3538     n = ii[1] - ii[0];
3539     ii++;
3540     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3541     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3542     if (usecprow) z = c + 4 * ridx[i];
3543     jj = idx;
3544     vv = v;
3545     for (k = 0; k < cn; k++) {
3546       idx  = jj;
3547       v    = vv;
3548       sum1 = 0.0;
3549       sum2 = 0.0;
3550       sum3 = 0.0;
3551       sum4 = 0.0;
3552       for (j = 0; j < n; j++) {
3553         xb = b + 4 * (*idx++);
3554         x1 = xb[0 + k * bm];
3555         x2 = xb[1 + k * bm];
3556         x3 = xb[2 + k * bm];
3557         x4 = xb[3 + k * bm];
3558         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3559         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3560         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3561         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3562         v += 16;
3563       }
3564       z[0 + k * cm] = sum1;
3565       z[1 + k * cm] = sum2;
3566       z[2 + k * cm] = sum3;
3567       z[3 + k * cm] = sum4;
3568     }
3569     if (!usecprow) z += 4;
3570   }
3571   PetscFunctionReturn(0);
3572 }
3573 
3574 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) {
3575   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3576   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3577   const PetscScalar *xb;
3578   PetscScalar        x1, x2, x3, x4, x5;
3579   const MatScalar   *v, *vv;
3580   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3581   PetscBool          usecprow = a->compressedrow.use;
3582 
3583   PetscFunctionBegin;
3584   idx = a->j;
3585   v   = a->a;
3586   if (usecprow) {
3587     mbs  = a->compressedrow.nrows;
3588     ii   = a->compressedrow.i;
3589     ridx = a->compressedrow.rindex;
3590   } else {
3591     mbs = a->mbs;
3592     ii  = a->i;
3593     z   = c;
3594   }
3595 
3596   for (i = 0; i < mbs; i++) {
3597     n = ii[1] - ii[0];
3598     ii++;
3599     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3600     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3601     if (usecprow) z = c + 5 * ridx[i];
3602     jj = idx;
3603     vv = v;
3604     for (k = 0; k < cn; k++) {
3605       idx  = jj;
3606       v    = vv;
3607       sum1 = 0.0;
3608       sum2 = 0.0;
3609       sum3 = 0.0;
3610       sum4 = 0.0;
3611       sum5 = 0.0;
3612       for (j = 0; j < n; j++) {
3613         xb = b + 5 * (*idx++);
3614         x1 = xb[0 + k * bm];
3615         x2 = xb[1 + k * bm];
3616         x3 = xb[2 + k * bm];
3617         x4 = xb[3 + k * bm];
3618         x5 = xb[4 + k * bm];
3619         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3620         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3621         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3622         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3623         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3624         v += 25;
3625       }
3626       z[0 + k * cm] = sum1;
3627       z[1 + k * cm] = sum2;
3628       z[2 + k * cm] = sum3;
3629       z[3 + k * cm] = sum4;
3630       z[4 + k * cm] = sum5;
3631     }
3632     if (!usecprow) z += 5;
3633   }
3634   PetscFunctionReturn(0);
3635 }
3636 
3637 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) {
3638   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3639   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3640   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3641   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3642   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3643   PetscBLASInt     bbs, bcn, bbm, bcm;
3644   PetscScalar     *z = NULL;
3645   PetscScalar     *c, *b;
3646   const MatScalar *v;
3647   const PetscInt  *idx, *ii, *ridx = NULL;
3648   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3649   PetscBool        usecprow = a->compressedrow.use;
3650 
3651   PetscFunctionBegin;
3652   if (!cm || !cn) PetscFunctionReturn(0);
3653   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3654   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3655   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3656   b = bd->v;
3657   if (a->nonzerorowcnt != A->rmap->n) { PetscCall(MatZeroEntries(C)); }
3658   PetscCall(MatDenseGetArray(C, &c));
3659   switch (bs) {
3660   case 1: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); break;
3661   case 2: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); break;
3662   case 3: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); break;
3663   case 4: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); break;
3664   case 5: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); break;
3665   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3666     PetscCall(PetscBLASIntCast(bs, &bbs));
3667     PetscCall(PetscBLASIntCast(cn, &bcn));
3668     PetscCall(PetscBLASIntCast(bm, &bbm));
3669     PetscCall(PetscBLASIntCast(cm, &bcm));
3670     idx = a->j;
3671     v   = a->a;
3672     if (usecprow) {
3673       mbs  = a->compressedrow.nrows;
3674       ii   = a->compressedrow.i;
3675       ridx = a->compressedrow.rindex;
3676     } else {
3677       mbs = a->mbs;
3678       ii  = a->i;
3679       z   = c;
3680     }
3681     for (i = 0; i < mbs; i++) {
3682       n = ii[1] - ii[0];
3683       ii++;
3684       if (usecprow) z = c + bs * ridx[i];
3685       if (n) {
3686         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3687         v += bs2;
3688       }
3689       for (j = 1; j < n; j++) {
3690         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3691         v += bs2;
3692       }
3693       if (!usecprow) z += bs;
3694     }
3695   }
3696   PetscCall(MatDenseRestoreArray(C, &c));
3697   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3698   PetscFunctionReturn(0);
3699 }
3700