xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision d5b43468fb8780a8feea140ccd6fa3e6a50411cc)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8   #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
12 {
13   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
14   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
15   const PetscInt *idx;
16   PetscInt        start, end, *ai, *aj, bs, *nidx2;
17   PetscBT         table;
18 
19   PetscFunctionBegin;
20   m  = a->mbs;
21   ai = a->i;
22   aj = a->j;
23   bs = A->rmap->bs;
24 
25   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
26 
27   PetscCall(PetscBTCreate(m, &table));
28   PetscCall(PetscMalloc1(m + 1, &nidx));
29   PetscCall(PetscMalloc1(A->rmap->N + 1, &nidx2));
30 
31   for (i = 0; i < is_max; i++) {
32     /* Initialise the two local arrays */
33     isz = 0;
34     PetscCall(PetscBTMemzero(m, table));
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     PetscCall(ISGetIndices(is[i], &idx));
38     PetscCall(ISGetLocalSize(is[i], &n));
39 
40     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
41     for (j = 0; j < n; ++j) {
42       ival = idx[j] / bs; /* convert the indices into block indices */
43       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
44       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
45     }
46     PetscCall(ISRestoreIndices(is[i], &idx));
47     PetscCall(ISDestroy(&is[i]));
48 
49     k = 0;
50     for (j = 0; j < ov; j++) { /* for each overlap*/
51       n = isz;
52       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
53         row   = nidx[k];
54         start = ai[row];
55         end   = ai[row + 1];
56         for (l = start; l < end; l++) {
57           val = aj[l];
58           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
59         }
60       }
61     }
62     /* expand the Index Set */
63     for (j = 0; j < isz; j++) {
64       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
65     }
66     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i));
67   }
68   PetscCall(PetscBTDestroy(&table));
69   PetscCall(PetscFree(nidx));
70   PetscCall(PetscFree(nidx2));
71   PetscFunctionReturn(0);
72 }
73 
74 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
75 {
76   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
77   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
78   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
79   const PetscInt *irow, *icol;
80   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
81   PetscInt       *aj = a->j, *ai = a->i;
82   MatScalar      *mat_a;
83   Mat             C;
84   PetscBool       flag;
85 
86   PetscFunctionBegin;
87   PetscCall(ISGetIndices(isrow, &irow));
88   PetscCall(ISGetIndices(iscol, &icol));
89   PetscCall(ISGetLocalSize(isrow, &nrows));
90   PetscCall(ISGetLocalSize(iscol, &ncols));
91 
92   PetscCall(PetscCalloc1(1 + oldcols, &smap));
93   ssmap = smap;
94   PetscCall(PetscMalloc1(1 + nrows, &lens));
95   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
96   /* determine lens of each row */
97   for (i = 0; i < nrows; i++) {
98     kstart  = ai[irow[i]];
99     kend    = kstart + a->ilen[irow[i]];
100     lens[i] = 0;
101     for (k = kstart; k < kend; k++) {
102       if (ssmap[aj[k]]) lens[i]++;
103     }
104   }
105   /* Create and fill new matrix */
106   if (scall == MAT_REUSE_MATRIX) {
107     c = (Mat_SeqBAIJ *)((*B)->data);
108 
109     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
110     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
111     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
112     PetscCall(PetscArrayzero(c->ilen, c->mbs));
113     C = *B;
114   } else {
115     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
116     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
117     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
118     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
119   }
120   c = (Mat_SeqBAIJ *)(C->data);
121   for (i = 0; i < nrows; i++) {
122     row      = irow[i];
123     kstart   = ai[row];
124     kend     = kstart + a->ilen[row];
125     mat_i    = c->i[i];
126     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
127     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
128     mat_ilen = c->ilen + i;
129     for (k = kstart; k < kend; k++) {
130       if ((tcol = ssmap[a->j[k]])) {
131         *mat_j++ = tcol - 1;
132         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
133         mat_a += bs2;
134         (*mat_ilen)++;
135       }
136     }
137   }
138   /* sort */
139   if (c->j && c->a) {
140     MatScalar *work;
141     PetscCall(PetscMalloc1(bs2, &work));
142     for (i = 0; i < nrows; i++) {
143       PetscInt ilen;
144       mat_i = c->i[i];
145       mat_j = c->j + mat_i;
146       mat_a = c->a + mat_i * bs2;
147       ilen  = c->ilen[i];
148       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
149     }
150     PetscCall(PetscFree(work));
151   }
152 
153   /* Free work space */
154   PetscCall(ISRestoreIndices(iscol, &icol));
155   PetscCall(PetscFree(smap));
156   PetscCall(PetscFree(lens));
157   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
158   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
159 
160   PetscCall(ISRestoreIndices(isrow, &irow));
161   *B = C;
162   PetscFunctionReturn(0);
163 }
164 
165 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
166 {
167   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
168   IS              is1, is2;
169   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
170   const PetscInt *irow, *icol;
171 
172   PetscFunctionBegin;
173   PetscCall(ISGetIndices(isrow, &irow));
174   PetscCall(ISGetIndices(iscol, &icol));
175   PetscCall(ISGetLocalSize(isrow, &nrows));
176   PetscCall(ISGetLocalSize(iscol, &ncols));
177 
178   /* Verify if the indices corespond to each element in a block
179    and form the IS with compressed IS */
180   maxmnbs = PetscMax(a->mbs, a->nbs);
181   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
182   PetscCall(PetscArrayzero(vary, a->mbs));
183   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
184   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
185   count = 0;
186   for (i = 0; i < nrows; i++) {
187     j = irow[i] / bs;
188     if ((vary[j]--) == bs) iary[count++] = j;
189   }
190   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
191 
192   PetscCall(PetscArrayzero(vary, a->nbs));
193   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
194   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
195   count = 0;
196   for (i = 0; i < ncols; i++) {
197     j = icol[i] / bs;
198     if ((vary[j]--) == bs) iary[count++] = j;
199   }
200   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
201   PetscCall(ISRestoreIndices(isrow, &irow));
202   PetscCall(ISRestoreIndices(iscol, &icol));
203   PetscCall(PetscFree2(vary, iary));
204 
205   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
206   PetscCall(ISDestroy(&is1));
207   PetscCall(ISDestroy(&is2));
208   PetscFunctionReturn(0);
209 }
210 
211 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
212 {
213   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
214   Mat_SubSppt *submatj = c->submatis1;
215 
216   PetscFunctionBegin;
217   PetscCall((*submatj->destroy)(C));
218   PetscCall(MatDestroySubMatrix_Private(submatj));
219   PetscFunctionReturn(0);
220 }
221 
222 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
223 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
224 {
225   PetscInt     i;
226   Mat          C;
227   Mat_SeqBAIJ *c;
228   Mat_SubSppt *submatj;
229 
230   PetscFunctionBegin;
231   for (i = 0; i < n; i++) {
232     C       = (*mat)[i];
233     c       = (Mat_SeqBAIJ *)C->data;
234     submatj = c->submatis1;
235     if (submatj) {
236       if (--((PetscObject)C)->refct <= 0) {
237         PetscCall(PetscFree(C->factorprefix));
238         PetscCall((*submatj->destroy)(C));
239         PetscCall(MatDestroySubMatrix_Private(submatj));
240         PetscCall(PetscFree(C->defaultvectype));
241         PetscCall(PetscFree(C->defaultrandtype));
242         PetscCall(PetscLayoutDestroy(&C->rmap));
243         PetscCall(PetscLayoutDestroy(&C->cmap));
244         PetscCall(PetscHeaderDestroy(&C));
245       }
246     } else {
247       PetscCall(MatDestroy(&C));
248     }
249   }
250 
251   /* Destroy Dummy submatrices created for reuse */
252   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
253 
254   PetscCall(PetscFree(*mat));
255   PetscFunctionReturn(0);
256 }
257 
258 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
259 {
260   PetscInt i;
261 
262   PetscFunctionBegin;
263   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
264 
265   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
266   PetscFunctionReturn(0);
267 }
268 
269 /* -------------------------------------------------------*/
270 /* Should check that shapes of vectors and matrices match */
271 /* -------------------------------------------------------*/
272 
273 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
274 {
275   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
276   PetscScalar       *z, sum;
277   const PetscScalar *x;
278   const MatScalar   *v;
279   PetscInt           mbs, i, n;
280   const PetscInt    *idx, *ii, *ridx = NULL;
281   PetscBool          usecprow = a->compressedrow.use;
282 
283   PetscFunctionBegin;
284   PetscCall(VecGetArrayRead(xx, &x));
285   PetscCall(VecGetArrayWrite(zz, &z));
286 
287   if (usecprow) {
288     mbs  = a->compressedrow.nrows;
289     ii   = a->compressedrow.i;
290     ridx = a->compressedrow.rindex;
291     PetscCall(PetscArrayzero(z, a->mbs));
292   } else {
293     mbs = a->mbs;
294     ii  = a->i;
295   }
296 
297   for (i = 0; i < mbs; i++) {
298     n   = ii[1] - ii[0];
299     v   = a->a + ii[0];
300     idx = a->j + ii[0];
301     ii++;
302     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
303     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
304     sum = 0.0;
305     PetscSparseDensePlusDot(sum, x, v, idx, n);
306     if (usecprow) {
307       z[ridx[i]] = sum;
308     } else {
309       z[i] = sum;
310     }
311   }
312   PetscCall(VecRestoreArrayRead(xx, &x));
313   PetscCall(VecRestoreArrayWrite(zz, &z));
314   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
315   PetscFunctionReturn(0);
316 }
317 
318 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
319 {
320   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
321   PetscScalar       *z = NULL, sum1, sum2, *zarray;
322   const PetscScalar *x, *xb;
323   PetscScalar        x1, x2;
324   const MatScalar   *v;
325   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
326   PetscBool          usecprow = a->compressedrow.use;
327 
328   PetscFunctionBegin;
329   PetscCall(VecGetArrayRead(xx, &x));
330   PetscCall(VecGetArrayWrite(zz, &zarray));
331 
332   idx = a->j;
333   v   = a->a;
334   if (usecprow) {
335     mbs  = a->compressedrow.nrows;
336     ii   = a->compressedrow.i;
337     ridx = a->compressedrow.rindex;
338     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
339   } else {
340     mbs = a->mbs;
341     ii  = a->i;
342     z   = zarray;
343   }
344 
345   for (i = 0; i < mbs; i++) {
346     n = ii[1] - ii[0];
347     ii++;
348     sum1 = 0.0;
349     sum2 = 0.0;
350     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
351     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
352     for (j = 0; j < n; j++) {
353       xb = x + 2 * (*idx++);
354       x1 = xb[0];
355       x2 = xb[1];
356       sum1 += v[0] * x1 + v[2] * x2;
357       sum2 += v[1] * x1 + v[3] * x2;
358       v += 4;
359     }
360     if (usecprow) z = zarray + 2 * ridx[i];
361     z[0] = sum1;
362     z[1] = sum2;
363     if (!usecprow) z += 2;
364   }
365   PetscCall(VecRestoreArrayRead(xx, &x));
366   PetscCall(VecRestoreArrayWrite(zz, &zarray));
367   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
368   PetscFunctionReturn(0);
369 }
370 
371 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
372 {
373   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
374   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
375   const PetscScalar *x, *xb;
376   const MatScalar   *v;
377   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
378   PetscBool          usecprow = a->compressedrow.use;
379 
380 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
381   #pragma disjoint(*v, *z, *xb)
382 #endif
383 
384   PetscFunctionBegin;
385   PetscCall(VecGetArrayRead(xx, &x));
386   PetscCall(VecGetArrayWrite(zz, &zarray));
387 
388   idx = a->j;
389   v   = a->a;
390   if (usecprow) {
391     mbs  = a->compressedrow.nrows;
392     ii   = a->compressedrow.i;
393     ridx = a->compressedrow.rindex;
394     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
395   } else {
396     mbs = a->mbs;
397     ii  = a->i;
398     z   = zarray;
399   }
400 
401   for (i = 0; i < mbs; i++) {
402     n = ii[1] - ii[0];
403     ii++;
404     sum1 = 0.0;
405     sum2 = 0.0;
406     sum3 = 0.0;
407     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
408     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
409     for (j = 0; j < n; j++) {
410       xb = x + 3 * (*idx++);
411       x1 = xb[0];
412       x2 = xb[1];
413       x3 = xb[2];
414 
415       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
416       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
417       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
418       v += 9;
419     }
420     if (usecprow) z = zarray + 3 * ridx[i];
421     z[0] = sum1;
422     z[1] = sum2;
423     z[2] = sum3;
424     if (!usecprow) z += 3;
425   }
426   PetscCall(VecRestoreArrayRead(xx, &x));
427   PetscCall(VecRestoreArrayWrite(zz, &zarray));
428   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
429   PetscFunctionReturn(0);
430 }
431 
432 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
433 {
434   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
435   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
436   const PetscScalar *x, *xb;
437   const MatScalar   *v;
438   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
439   PetscBool          usecprow = a->compressedrow.use;
440 
441   PetscFunctionBegin;
442   PetscCall(VecGetArrayRead(xx, &x));
443   PetscCall(VecGetArrayWrite(zz, &zarray));
444 
445   idx = a->j;
446   v   = a->a;
447   if (usecprow) {
448     mbs  = a->compressedrow.nrows;
449     ii   = a->compressedrow.i;
450     ridx = a->compressedrow.rindex;
451     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
452   } else {
453     mbs = a->mbs;
454     ii  = a->i;
455     z   = zarray;
456   }
457 
458   for (i = 0; i < mbs; i++) {
459     n = ii[1] - ii[0];
460     ii++;
461     sum1 = 0.0;
462     sum2 = 0.0;
463     sum3 = 0.0;
464     sum4 = 0.0;
465 
466     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
467     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
468     for (j = 0; j < n; j++) {
469       xb = x + 4 * (*idx++);
470       x1 = xb[0];
471       x2 = xb[1];
472       x3 = xb[2];
473       x4 = xb[3];
474       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
475       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
476       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
477       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
478       v += 16;
479     }
480     if (usecprow) z = zarray + 4 * ridx[i];
481     z[0] = sum1;
482     z[1] = sum2;
483     z[2] = sum3;
484     z[3] = sum4;
485     if (!usecprow) z += 4;
486   }
487   PetscCall(VecRestoreArrayRead(xx, &x));
488   PetscCall(VecRestoreArrayWrite(zz, &zarray));
489   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
490   PetscFunctionReturn(0);
491 }
492 
493 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
494 {
495   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
496   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
497   const PetscScalar *xb, *x;
498   const MatScalar   *v;
499   const PetscInt    *idx, *ii, *ridx = NULL;
500   PetscInt           mbs, i, j, n;
501   PetscBool          usecprow = a->compressedrow.use;
502 
503   PetscFunctionBegin;
504   PetscCall(VecGetArrayRead(xx, &x));
505   PetscCall(VecGetArrayWrite(zz, &zarray));
506 
507   idx = a->j;
508   v   = a->a;
509   if (usecprow) {
510     mbs  = a->compressedrow.nrows;
511     ii   = a->compressedrow.i;
512     ridx = a->compressedrow.rindex;
513     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
514   } else {
515     mbs = a->mbs;
516     ii  = a->i;
517     z   = zarray;
518   }
519 
520   for (i = 0; i < mbs; i++) {
521     n = ii[1] - ii[0];
522     ii++;
523     sum1 = 0.0;
524     sum2 = 0.0;
525     sum3 = 0.0;
526     sum4 = 0.0;
527     sum5 = 0.0;
528     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
529     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
530     for (j = 0; j < n; j++) {
531       xb = x + 5 * (*idx++);
532       x1 = xb[0];
533       x2 = xb[1];
534       x3 = xb[2];
535       x4 = xb[3];
536       x5 = xb[4];
537       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
538       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
539       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
540       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
541       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
542       v += 25;
543     }
544     if (usecprow) z = zarray + 5 * ridx[i];
545     z[0] = sum1;
546     z[1] = sum2;
547     z[2] = sum3;
548     z[3] = sum4;
549     z[4] = sum5;
550     if (!usecprow) z += 5;
551   }
552   PetscCall(VecRestoreArrayRead(xx, &x));
553   PetscCall(VecRestoreArrayWrite(zz, &zarray));
554   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
555   PetscFunctionReturn(0);
556 }
557 
558 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
559 {
560   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
561   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
562   const PetscScalar *x, *xb;
563   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
564   const MatScalar   *v;
565   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
566   PetscBool          usecprow = a->compressedrow.use;
567 
568   PetscFunctionBegin;
569   PetscCall(VecGetArrayRead(xx, &x));
570   PetscCall(VecGetArrayWrite(zz, &zarray));
571 
572   idx = a->j;
573   v   = a->a;
574   if (usecprow) {
575     mbs  = a->compressedrow.nrows;
576     ii   = a->compressedrow.i;
577     ridx = a->compressedrow.rindex;
578     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
579   } else {
580     mbs = a->mbs;
581     ii  = a->i;
582     z   = zarray;
583   }
584 
585   for (i = 0; i < mbs; i++) {
586     n = ii[1] - ii[0];
587     ii++;
588     sum1 = 0.0;
589     sum2 = 0.0;
590     sum3 = 0.0;
591     sum4 = 0.0;
592     sum5 = 0.0;
593     sum6 = 0.0;
594 
595     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
596     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
597     for (j = 0; j < n; j++) {
598       xb = x + 6 * (*idx++);
599       x1 = xb[0];
600       x2 = xb[1];
601       x3 = xb[2];
602       x4 = xb[3];
603       x5 = xb[4];
604       x6 = xb[5];
605       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
606       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
607       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
608       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
609       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
610       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
611       v += 36;
612     }
613     if (usecprow) z = zarray + 6 * ridx[i];
614     z[0] = sum1;
615     z[1] = sum2;
616     z[2] = sum3;
617     z[3] = sum4;
618     z[4] = sum5;
619     z[5] = sum6;
620     if (!usecprow) z += 6;
621   }
622 
623   PetscCall(VecRestoreArrayRead(xx, &x));
624   PetscCall(VecRestoreArrayWrite(zz, &zarray));
625   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
626   PetscFunctionReturn(0);
627 }
628 
629 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
630 {
631   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
632   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
633   const PetscScalar *x, *xb;
634   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
635   const MatScalar   *v;
636   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
637   PetscBool          usecprow = a->compressedrow.use;
638 
639   PetscFunctionBegin;
640   PetscCall(VecGetArrayRead(xx, &x));
641   PetscCall(VecGetArrayWrite(zz, &zarray));
642 
643   idx = a->j;
644   v   = a->a;
645   if (usecprow) {
646     mbs  = a->compressedrow.nrows;
647     ii   = a->compressedrow.i;
648     ridx = a->compressedrow.rindex;
649     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
650   } else {
651     mbs = a->mbs;
652     ii  = a->i;
653     z   = zarray;
654   }
655 
656   for (i = 0; i < mbs; i++) {
657     n = ii[1] - ii[0];
658     ii++;
659     sum1 = 0.0;
660     sum2 = 0.0;
661     sum3 = 0.0;
662     sum4 = 0.0;
663     sum5 = 0.0;
664     sum6 = 0.0;
665     sum7 = 0.0;
666 
667     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
668     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
669     for (j = 0; j < n; j++) {
670       xb = x + 7 * (*idx++);
671       x1 = xb[0];
672       x2 = xb[1];
673       x3 = xb[2];
674       x4 = xb[3];
675       x5 = xb[4];
676       x6 = xb[5];
677       x7 = xb[6];
678       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
679       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
680       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
681       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
682       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
683       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
684       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
685       v += 49;
686     }
687     if (usecprow) z = zarray + 7 * ridx[i];
688     z[0] = sum1;
689     z[1] = sum2;
690     z[2] = sum3;
691     z[3] = sum4;
692     z[4] = sum5;
693     z[5] = sum6;
694     z[6] = sum7;
695     if (!usecprow) z += 7;
696   }
697 
698   PetscCall(VecRestoreArrayRead(xx, &x));
699   PetscCall(VecRestoreArrayWrite(zz, &zarray));
700   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
701   PetscFunctionReturn(0);
702 }
703 
704 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
705 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
706 {
707   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
708   PetscScalar       *z = NULL, *work, *workt, *zarray;
709   const PetscScalar *x, *xb;
710   const MatScalar   *v;
711   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
712   const PetscInt    *idx, *ii, *ridx = NULL;
713   PetscInt           k;
714   PetscBool          usecprow = a->compressedrow.use;
715 
716   __m256d a0, a1, a2, a3, a4, a5;
717   __m256d w0, w1, w2, w3;
718   __m256d z0, z1, z2;
719   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
720 
721   PetscFunctionBegin;
722   PetscCall(VecGetArrayRead(xx, &x));
723   PetscCall(VecGetArrayWrite(zz, &zarray));
724 
725   idx = a->j;
726   v   = a->a;
727   if (usecprow) {
728     mbs  = a->compressedrow.nrows;
729     ii   = a->compressedrow.i;
730     ridx = a->compressedrow.rindex;
731     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
732   } else {
733     mbs = a->mbs;
734     ii  = a->i;
735     z   = zarray;
736   }
737 
738   if (!a->mult_work) {
739     k = PetscMax(A->rmap->n, A->cmap->n);
740     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
741   }
742 
743   work = a->mult_work;
744   for (i = 0; i < mbs; i++) {
745     n = ii[1] - ii[0];
746     ii++;
747     workt = work;
748     for (j = 0; j < n; j++) {
749       xb = x + bs * (*idx++);
750       for (k = 0; k < bs; k++) workt[k] = xb[k];
751       workt += bs;
752     }
753     if (usecprow) z = zarray + bs * ridx[i];
754 
755     z0 = _mm256_setzero_pd();
756     z1 = _mm256_setzero_pd();
757     z2 = _mm256_setzero_pd();
758 
759     for (j = 0; j < n; j++) {
760       /* first column of a */
761       w0 = _mm256_set1_pd(work[j * 9]);
762       a0 = _mm256_loadu_pd(&v[j * 81]);
763       z0 = _mm256_fmadd_pd(a0, w0, z0);
764       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
765       z1 = _mm256_fmadd_pd(a1, w0, z1);
766       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
767       z2 = _mm256_fmadd_pd(a2, w0, z2);
768 
769       /* second column of a */
770       w1 = _mm256_set1_pd(work[j * 9 + 1]);
771       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
772       z0 = _mm256_fmadd_pd(a0, w1, z0);
773       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
774       z1 = _mm256_fmadd_pd(a1, w1, z1);
775       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
776       z2 = _mm256_fmadd_pd(a2, w1, z2);
777 
778       /* third column of a */
779       w2 = _mm256_set1_pd(work[j * 9 + 2]);
780       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
781       z0 = _mm256_fmadd_pd(a3, w2, z0);
782       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
783       z1 = _mm256_fmadd_pd(a4, w2, z1);
784       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
785       z2 = _mm256_fmadd_pd(a5, w2, z2);
786 
787       /* fourth column of a */
788       w3 = _mm256_set1_pd(work[j * 9 + 3]);
789       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
790       z0 = _mm256_fmadd_pd(a0, w3, z0);
791       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
792       z1 = _mm256_fmadd_pd(a1, w3, z1);
793       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
794       z2 = _mm256_fmadd_pd(a2, w3, z2);
795 
796       /* fifth column of a */
797       w0 = _mm256_set1_pd(work[j * 9 + 4]);
798       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
799       z0 = _mm256_fmadd_pd(a3, w0, z0);
800       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
801       z1 = _mm256_fmadd_pd(a4, w0, z1);
802       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
803       z2 = _mm256_fmadd_pd(a5, w0, z2);
804 
805       /* sixth column of a */
806       w1 = _mm256_set1_pd(work[j * 9 + 5]);
807       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
808       z0 = _mm256_fmadd_pd(a0, w1, z0);
809       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
810       z1 = _mm256_fmadd_pd(a1, w1, z1);
811       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
812       z2 = _mm256_fmadd_pd(a2, w1, z2);
813 
814       /* seventh column of a */
815       w2 = _mm256_set1_pd(work[j * 9 + 6]);
816       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
817       z0 = _mm256_fmadd_pd(a0, w2, z0);
818       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
819       z1 = _mm256_fmadd_pd(a1, w2, z1);
820       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
821       z2 = _mm256_fmadd_pd(a2, w2, z2);
822 
823       /* eighth column of a */
824       w3 = _mm256_set1_pd(work[j * 9 + 7]);
825       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
826       z0 = _mm256_fmadd_pd(a3, w3, z0);
827       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
828       z1 = _mm256_fmadd_pd(a4, w3, z1);
829       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
830       z2 = _mm256_fmadd_pd(a5, w3, z2);
831 
832       /* ninth column of a */
833       w0 = _mm256_set1_pd(work[j * 9 + 8]);
834       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
835       z0 = _mm256_fmadd_pd(a0, w0, z0);
836       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
837       z1 = _mm256_fmadd_pd(a1, w0, z1);
838       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
839       z2 = _mm256_fmadd_pd(a2, w0, z2);
840     }
841 
842     _mm256_storeu_pd(&z[0], z0);
843     _mm256_storeu_pd(&z[4], z1);
844     _mm256_maskstore_pd(&z[8], mask1, z2);
845 
846     v += n * bs2;
847     if (!usecprow) z += bs;
848   }
849   PetscCall(VecRestoreArrayRead(xx, &x));
850   PetscCall(VecRestoreArrayWrite(zz, &zarray));
851   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
852   PetscFunctionReturn(0);
853 }
854 #endif
855 
856 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
857 {
858   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
859   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
860   const PetscScalar *x, *xb;
861   PetscScalar       *zarray, xv;
862   const MatScalar   *v;
863   const PetscInt    *ii, *ij = a->j, *idx;
864   PetscInt           mbs, i, j, k, n, *ridx = NULL;
865   PetscBool          usecprow = a->compressedrow.use;
866 
867   PetscFunctionBegin;
868   PetscCall(VecGetArrayRead(xx, &x));
869   PetscCall(VecGetArrayWrite(zz, &zarray));
870 
871   v = a->a;
872   if (usecprow) {
873     mbs  = a->compressedrow.nrows;
874     ii   = a->compressedrow.i;
875     ridx = a->compressedrow.rindex;
876     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
877   } else {
878     mbs = a->mbs;
879     ii  = a->i;
880     z   = zarray;
881   }
882 
883   for (i = 0; i < mbs; i++) {
884     n     = ii[i + 1] - ii[i];
885     idx   = ij + ii[i];
886     sum1  = 0.0;
887     sum2  = 0.0;
888     sum3  = 0.0;
889     sum4  = 0.0;
890     sum5  = 0.0;
891     sum6  = 0.0;
892     sum7  = 0.0;
893     sum8  = 0.0;
894     sum9  = 0.0;
895     sum10 = 0.0;
896     sum11 = 0.0;
897 
898     for (j = 0; j < n; j++) {
899       xb = x + 11 * (idx[j]);
900 
901       for (k = 0; k < 11; k++) {
902         xv = xb[k];
903         sum1 += v[0] * xv;
904         sum2 += v[1] * xv;
905         sum3 += v[2] * xv;
906         sum4 += v[3] * xv;
907         sum5 += v[4] * xv;
908         sum6 += v[5] * xv;
909         sum7 += v[6] * xv;
910         sum8 += v[7] * xv;
911         sum9 += v[8] * xv;
912         sum10 += v[9] * xv;
913         sum11 += v[10] * xv;
914         v += 11;
915       }
916     }
917     if (usecprow) z = zarray + 11 * ridx[i];
918     z[0]  = sum1;
919     z[1]  = sum2;
920     z[2]  = sum3;
921     z[3]  = sum4;
922     z[4]  = sum5;
923     z[5]  = sum6;
924     z[6]  = sum7;
925     z[7]  = sum8;
926     z[8]  = sum9;
927     z[9]  = sum10;
928     z[10] = sum11;
929 
930     if (!usecprow) z += 11;
931   }
932 
933   PetscCall(VecRestoreArrayRead(xx, &x));
934   PetscCall(VecRestoreArrayWrite(zz, &zarray));
935   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
936   PetscFunctionReturn(0);
937 }
938 
939 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
940 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
941 {
942   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
943   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
944   const PetscScalar *x, *xb;
945   PetscScalar       *zarray, xv;
946   const MatScalar   *v;
947   const PetscInt    *ii, *ij = a->j, *idx;
948   PetscInt           mbs, i, j, k, n, *ridx = NULL;
949   PetscBool          usecprow = a->compressedrow.use;
950 
951   PetscFunctionBegin;
952   PetscCall(VecGetArrayRead(xx, &x));
953   PetscCall(VecGetArrayWrite(zz, &zarray));
954 
955   v = a->a;
956   if (usecprow) {
957     mbs  = a->compressedrow.nrows;
958     ii   = a->compressedrow.i;
959     ridx = a->compressedrow.rindex;
960     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
961   } else {
962     mbs = a->mbs;
963     ii  = a->i;
964     z   = zarray;
965   }
966 
967   for (i = 0; i < mbs; i++) {
968     n     = ii[i + 1] - ii[i];
969     idx   = ij + ii[i];
970     sum1  = 0.0;
971     sum2  = 0.0;
972     sum3  = 0.0;
973     sum4  = 0.0;
974     sum5  = 0.0;
975     sum6  = 0.0;
976     sum7  = 0.0;
977     sum8  = 0.0;
978     sum9  = 0.0;
979     sum10 = 0.0;
980     sum11 = 0.0;
981     sum12 = 0.0;
982 
983     for (j = 0; j < n; j++) {
984       xb = x + 12 * (idx[j]);
985 
986       for (k = 0; k < 12; k++) {
987         xv = xb[k];
988         sum1 += v[0] * xv;
989         sum2 += v[1] * xv;
990         sum3 += v[2] * xv;
991         sum4 += v[3] * xv;
992         sum5 += v[4] * xv;
993         sum6 += v[5] * xv;
994         sum7 += v[6] * xv;
995         sum8 += v[7] * xv;
996         sum9 += v[8] * xv;
997         sum10 += v[9] * xv;
998         sum11 += v[10] * xv;
999         sum12 += v[11] * xv;
1000         v += 12;
1001       }
1002     }
1003     if (usecprow) z = zarray + 12 * ridx[i];
1004     z[0]  = sum1;
1005     z[1]  = sum2;
1006     z[2]  = sum3;
1007     z[3]  = sum4;
1008     z[4]  = sum5;
1009     z[5]  = sum6;
1010     z[6]  = sum7;
1011     z[7]  = sum8;
1012     z[8]  = sum9;
1013     z[9]  = sum10;
1014     z[10] = sum11;
1015     z[11] = sum12;
1016     if (!usecprow) z += 12;
1017   }
1018   PetscCall(VecRestoreArrayRead(xx, &x));
1019   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1020   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1021   PetscFunctionReturn(0);
1022 }
1023 
1024 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
1025 {
1026   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1027   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1028   const PetscScalar *x, *xb;
1029   PetscScalar       *zarray, *yarray, xv;
1030   const MatScalar   *v;
1031   const PetscInt    *ii, *ij = a->j, *idx;
1032   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
1033   PetscBool          usecprow = a->compressedrow.use;
1034 
1035   PetscFunctionBegin;
1036   PetscCall(VecGetArrayRead(xx, &x));
1037   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1038 
1039   v = a->a;
1040   if (usecprow) {
1041     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1042     mbs  = a->compressedrow.nrows;
1043     ii   = a->compressedrow.i;
1044     ridx = a->compressedrow.rindex;
1045   } else {
1046     ii = a->i;
1047     y  = yarray;
1048     z  = zarray;
1049   }
1050 
1051   for (i = 0; i < mbs; i++) {
1052     n   = ii[i + 1] - ii[i];
1053     idx = ij + ii[i];
1054 
1055     if (usecprow) {
1056       y = yarray + 12 * ridx[i];
1057       z = zarray + 12 * ridx[i];
1058     }
1059     sum1  = y[0];
1060     sum2  = y[1];
1061     sum3  = y[2];
1062     sum4  = y[3];
1063     sum5  = y[4];
1064     sum6  = y[5];
1065     sum7  = y[6];
1066     sum8  = y[7];
1067     sum9  = y[8];
1068     sum10 = y[9];
1069     sum11 = y[10];
1070     sum12 = y[11];
1071 
1072     for (j = 0; j < n; j++) {
1073       xb = x + 12 * (idx[j]);
1074 
1075       for (k = 0; k < 12; k++) {
1076         xv = xb[k];
1077         sum1 += v[0] * xv;
1078         sum2 += v[1] * xv;
1079         sum3 += v[2] * xv;
1080         sum4 += v[3] * xv;
1081         sum5 += v[4] * xv;
1082         sum6 += v[5] * xv;
1083         sum7 += v[6] * xv;
1084         sum8 += v[7] * xv;
1085         sum9 += v[8] * xv;
1086         sum10 += v[9] * xv;
1087         sum11 += v[10] * xv;
1088         sum12 += v[11] * xv;
1089         v += 12;
1090       }
1091     }
1092 
1093     z[0]  = sum1;
1094     z[1]  = sum2;
1095     z[2]  = sum3;
1096     z[3]  = sum4;
1097     z[4]  = sum5;
1098     z[5]  = sum6;
1099     z[6]  = sum7;
1100     z[7]  = sum8;
1101     z[8]  = sum9;
1102     z[9]  = sum10;
1103     z[10] = sum11;
1104     z[11] = sum12;
1105     if (!usecprow) {
1106       y += 12;
1107       z += 12;
1108     }
1109   }
1110   PetscCall(VecRestoreArrayRead(xx, &x));
1111   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1112   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1113   PetscFunctionReturn(0);
1114 }
1115 
1116 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1117 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1118 {
1119   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1120   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1121   const PetscScalar *x, *xb;
1122   PetscScalar        x1, x2, x3, x4, *zarray;
1123   const MatScalar   *v;
1124   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1125   PetscInt           mbs, i, j, n;
1126   PetscBool          usecprow = a->compressedrow.use;
1127 
1128   PetscFunctionBegin;
1129   PetscCall(VecGetArrayRead(xx, &x));
1130   PetscCall(VecGetArrayWrite(zz, &zarray));
1131 
1132   v = a->a;
1133   if (usecprow) {
1134     mbs  = a->compressedrow.nrows;
1135     ii   = a->compressedrow.i;
1136     ridx = a->compressedrow.rindex;
1137     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1138   } else {
1139     mbs = a->mbs;
1140     ii  = a->i;
1141     z   = zarray;
1142   }
1143 
1144   for (i = 0; i < mbs; i++) {
1145     n   = ii[i + 1] - ii[i];
1146     idx = ij + ii[i];
1147 
1148     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1149     for (j = 0; j < n; j++) {
1150       xb = x + 12 * (idx[j]);
1151       x1 = xb[0];
1152       x2 = xb[1];
1153       x3 = xb[2];
1154       x4 = xb[3];
1155 
1156       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1157       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1158       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1159       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1160       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1161       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1162       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1163       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1164       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1165       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1166       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1167       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1168       v += 48;
1169 
1170       x1 = xb[4];
1171       x2 = xb[5];
1172       x3 = xb[6];
1173       x4 = xb[7];
1174 
1175       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1176       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1177       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1178       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1179       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1180       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1181       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1182       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1183       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1184       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1185       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1186       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1187       v += 48;
1188 
1189       x1 = xb[8];
1190       x2 = xb[9];
1191       x3 = xb[10];
1192       x4 = xb[11];
1193       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1194       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1195       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1196       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1197       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1198       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1199       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1200       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1201       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1202       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1203       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1204       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1205       v += 48;
1206     }
1207     if (usecprow) z = zarray + 12 * ridx[i];
1208     z[0]  = sum1;
1209     z[1]  = sum2;
1210     z[2]  = sum3;
1211     z[3]  = sum4;
1212     z[4]  = sum5;
1213     z[5]  = sum6;
1214     z[6]  = sum7;
1215     z[7]  = sum8;
1216     z[8]  = sum9;
1217     z[9]  = sum10;
1218     z[10] = sum11;
1219     z[11] = sum12;
1220     if (!usecprow) z += 12;
1221   }
1222   PetscCall(VecRestoreArrayRead(xx, &x));
1223   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1224   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1225   PetscFunctionReturn(0);
1226 }
1227 
1228 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1229 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1230 {
1231   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1232   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1233   const PetscScalar *x, *xb;
1234   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1235   const MatScalar   *v;
1236   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1237   PetscInt           mbs      = a->mbs, i, j, n;
1238   PetscBool          usecprow = a->compressedrow.use;
1239 
1240   PetscFunctionBegin;
1241   PetscCall(VecGetArrayRead(xx, &x));
1242   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1243 
1244   v = a->a;
1245   if (usecprow) {
1246     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1247     mbs  = a->compressedrow.nrows;
1248     ii   = a->compressedrow.i;
1249     ridx = a->compressedrow.rindex;
1250   } else {
1251     ii = a->i;
1252     y  = yarray;
1253     z  = zarray;
1254   }
1255 
1256   for (i = 0; i < mbs; i++) {
1257     n   = ii[i + 1] - ii[i];
1258     idx = ij + ii[i];
1259 
1260     if (usecprow) {
1261       y = yarray + 12 * ridx[i];
1262       z = zarray + 12 * ridx[i];
1263     }
1264     sum1  = y[0];
1265     sum2  = y[1];
1266     sum3  = y[2];
1267     sum4  = y[3];
1268     sum5  = y[4];
1269     sum6  = y[5];
1270     sum7  = y[6];
1271     sum8  = y[7];
1272     sum9  = y[8];
1273     sum10 = y[9];
1274     sum11 = y[10];
1275     sum12 = y[11];
1276 
1277     for (j = 0; j < n; j++) {
1278       xb = x + 12 * (idx[j]);
1279       x1 = xb[0];
1280       x2 = xb[1];
1281       x3 = xb[2];
1282       x4 = xb[3];
1283 
1284       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1285       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1286       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1287       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1288       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1289       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1290       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1291       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1292       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1293       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1294       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1295       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1296       v += 48;
1297 
1298       x1 = xb[4];
1299       x2 = xb[5];
1300       x3 = xb[6];
1301       x4 = xb[7];
1302 
1303       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1304       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1305       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1306       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1307       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1308       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1309       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1310       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1311       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1312       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1313       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1314       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1315       v += 48;
1316 
1317       x1 = xb[8];
1318       x2 = xb[9];
1319       x3 = xb[10];
1320       x4 = xb[11];
1321       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1322       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1323       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1324       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1325       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1326       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1327       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1328       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1329       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1330       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1331       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1332       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1333       v += 48;
1334     }
1335     z[0]  = sum1;
1336     z[1]  = sum2;
1337     z[2]  = sum3;
1338     z[3]  = sum4;
1339     z[4]  = sum5;
1340     z[5]  = sum6;
1341     z[6]  = sum7;
1342     z[7]  = sum8;
1343     z[8]  = sum9;
1344     z[9]  = sum10;
1345     z[10] = sum11;
1346     z[11] = sum12;
1347     if (!usecprow) {
1348       y += 12;
1349       z += 12;
1350     }
1351   }
1352   PetscCall(VecRestoreArrayRead(xx, &x));
1353   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1354   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1355   PetscFunctionReturn(0);
1356 }
1357 
1358 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1359 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1360 {
1361   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1362   PetscScalar       *z = NULL, *zarray;
1363   const PetscScalar *x, *work;
1364   const MatScalar   *v = a->a;
1365   PetscInt           mbs, i, j, n;
1366   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1367   PetscBool          usecprow = a->compressedrow.use;
1368   const PetscInt     bs = 12, bs2 = 144;
1369 
1370   __m256d a0, a1, a2, a3, a4, a5;
1371   __m256d w0, w1, w2, w3;
1372   __m256d z0, z1, z2;
1373 
1374   PetscFunctionBegin;
1375   PetscCall(VecGetArrayRead(xx, &x));
1376   PetscCall(VecGetArrayWrite(zz, &zarray));
1377 
1378   if (usecprow) {
1379     mbs  = a->compressedrow.nrows;
1380     ii   = a->compressedrow.i;
1381     ridx = a->compressedrow.rindex;
1382     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1383   } else {
1384     mbs = a->mbs;
1385     ii  = a->i;
1386     z   = zarray;
1387   }
1388 
1389   for (i = 0; i < mbs; i++) {
1390     z0 = _mm256_setzero_pd();
1391     z1 = _mm256_setzero_pd();
1392     z2 = _mm256_setzero_pd();
1393 
1394     n = ii[1] - ii[0];
1395     ii++;
1396     for (j = 0; j < n; j++) {
1397       work = x + bs * (*idx++);
1398 
1399       /* first column of a */
1400       w0 = _mm256_set1_pd(work[0]);
1401       a0 = _mm256_loadu_pd(v + 0);
1402       z0 = _mm256_fmadd_pd(a0, w0, z0);
1403       a1 = _mm256_loadu_pd(v + 4);
1404       z1 = _mm256_fmadd_pd(a1, w0, z1);
1405       a2 = _mm256_loadu_pd(v + 8);
1406       z2 = _mm256_fmadd_pd(a2, w0, z2);
1407 
1408       /* second column of a */
1409       w1 = _mm256_set1_pd(work[1]);
1410       a3 = _mm256_loadu_pd(v + 12);
1411       z0 = _mm256_fmadd_pd(a3, w1, z0);
1412       a4 = _mm256_loadu_pd(v + 16);
1413       z1 = _mm256_fmadd_pd(a4, w1, z1);
1414       a5 = _mm256_loadu_pd(v + 20);
1415       z2 = _mm256_fmadd_pd(a5, w1, z2);
1416 
1417       /* third column of a */
1418       w2 = _mm256_set1_pd(work[2]);
1419       a0 = _mm256_loadu_pd(v + 24);
1420       z0 = _mm256_fmadd_pd(a0, w2, z0);
1421       a1 = _mm256_loadu_pd(v + 28);
1422       z1 = _mm256_fmadd_pd(a1, w2, z1);
1423       a2 = _mm256_loadu_pd(v + 32);
1424       z2 = _mm256_fmadd_pd(a2, w2, z2);
1425 
1426       /* fourth column of a */
1427       w3 = _mm256_set1_pd(work[3]);
1428       a3 = _mm256_loadu_pd(v + 36);
1429       z0 = _mm256_fmadd_pd(a3, w3, z0);
1430       a4 = _mm256_loadu_pd(v + 40);
1431       z1 = _mm256_fmadd_pd(a4, w3, z1);
1432       a5 = _mm256_loadu_pd(v + 44);
1433       z2 = _mm256_fmadd_pd(a5, w3, z2);
1434 
1435       /* fifth column of a */
1436       w0 = _mm256_set1_pd(work[4]);
1437       a0 = _mm256_loadu_pd(v + 48);
1438       z0 = _mm256_fmadd_pd(a0, w0, z0);
1439       a1 = _mm256_loadu_pd(v + 52);
1440       z1 = _mm256_fmadd_pd(a1, w0, z1);
1441       a2 = _mm256_loadu_pd(v + 56);
1442       z2 = _mm256_fmadd_pd(a2, w0, z2);
1443 
1444       /* sixth column of a */
1445       w1 = _mm256_set1_pd(work[5]);
1446       a3 = _mm256_loadu_pd(v + 60);
1447       z0 = _mm256_fmadd_pd(a3, w1, z0);
1448       a4 = _mm256_loadu_pd(v + 64);
1449       z1 = _mm256_fmadd_pd(a4, w1, z1);
1450       a5 = _mm256_loadu_pd(v + 68);
1451       z2 = _mm256_fmadd_pd(a5, w1, z2);
1452 
1453       /* seventh column of a */
1454       w2 = _mm256_set1_pd(work[6]);
1455       a0 = _mm256_loadu_pd(v + 72);
1456       z0 = _mm256_fmadd_pd(a0, w2, z0);
1457       a1 = _mm256_loadu_pd(v + 76);
1458       z1 = _mm256_fmadd_pd(a1, w2, z1);
1459       a2 = _mm256_loadu_pd(v + 80);
1460       z2 = _mm256_fmadd_pd(a2, w2, z2);
1461 
1462       /* eighth column of a */
1463       w3 = _mm256_set1_pd(work[7]);
1464       a3 = _mm256_loadu_pd(v + 84);
1465       z0 = _mm256_fmadd_pd(a3, w3, z0);
1466       a4 = _mm256_loadu_pd(v + 88);
1467       z1 = _mm256_fmadd_pd(a4, w3, z1);
1468       a5 = _mm256_loadu_pd(v + 92);
1469       z2 = _mm256_fmadd_pd(a5, w3, z2);
1470 
1471       /* ninth column of a */
1472       w0 = _mm256_set1_pd(work[8]);
1473       a0 = _mm256_loadu_pd(v + 96);
1474       z0 = _mm256_fmadd_pd(a0, w0, z0);
1475       a1 = _mm256_loadu_pd(v + 100);
1476       z1 = _mm256_fmadd_pd(a1, w0, z1);
1477       a2 = _mm256_loadu_pd(v + 104);
1478       z2 = _mm256_fmadd_pd(a2, w0, z2);
1479 
1480       /* tenth column of a */
1481       w1 = _mm256_set1_pd(work[9]);
1482       a3 = _mm256_loadu_pd(v + 108);
1483       z0 = _mm256_fmadd_pd(a3, w1, z0);
1484       a4 = _mm256_loadu_pd(v + 112);
1485       z1 = _mm256_fmadd_pd(a4, w1, z1);
1486       a5 = _mm256_loadu_pd(v + 116);
1487       z2 = _mm256_fmadd_pd(a5, w1, z2);
1488 
1489       /* eleventh column of a */
1490       w2 = _mm256_set1_pd(work[10]);
1491       a0 = _mm256_loadu_pd(v + 120);
1492       z0 = _mm256_fmadd_pd(a0, w2, z0);
1493       a1 = _mm256_loadu_pd(v + 124);
1494       z1 = _mm256_fmadd_pd(a1, w2, z1);
1495       a2 = _mm256_loadu_pd(v + 128);
1496       z2 = _mm256_fmadd_pd(a2, w2, z2);
1497 
1498       /* twelveth column of a */
1499       w3 = _mm256_set1_pd(work[11]);
1500       a3 = _mm256_loadu_pd(v + 132);
1501       z0 = _mm256_fmadd_pd(a3, w3, z0);
1502       a4 = _mm256_loadu_pd(v + 136);
1503       z1 = _mm256_fmadd_pd(a4, w3, z1);
1504       a5 = _mm256_loadu_pd(v + 140);
1505       z2 = _mm256_fmadd_pd(a5, w3, z2);
1506 
1507       v += bs2;
1508     }
1509     if (usecprow) z = zarray + bs * ridx[i];
1510     _mm256_storeu_pd(&z[0], z0);
1511     _mm256_storeu_pd(&z[4], z1);
1512     _mm256_storeu_pd(&z[8], z2);
1513     if (!usecprow) z += bs;
1514   }
1515   PetscCall(VecRestoreArrayRead(xx, &x));
1516   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1517   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1518   PetscFunctionReturn(0);
1519 }
1520 #endif
1521 
1522 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1523 /* Default MatMult for block size 15 */
1524 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1525 {
1526   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1527   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1528   const PetscScalar *x, *xb;
1529   PetscScalar       *zarray, xv;
1530   const MatScalar   *v;
1531   const PetscInt    *ii, *ij = a->j, *idx;
1532   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1533   PetscBool          usecprow = a->compressedrow.use;
1534 
1535   PetscFunctionBegin;
1536   PetscCall(VecGetArrayRead(xx, &x));
1537   PetscCall(VecGetArrayWrite(zz, &zarray));
1538 
1539   v = a->a;
1540   if (usecprow) {
1541     mbs  = a->compressedrow.nrows;
1542     ii   = a->compressedrow.i;
1543     ridx = a->compressedrow.rindex;
1544     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1545   } else {
1546     mbs = a->mbs;
1547     ii  = a->i;
1548     z   = zarray;
1549   }
1550 
1551   for (i = 0; i < mbs; i++) {
1552     n     = ii[i + 1] - ii[i];
1553     idx   = ij + ii[i];
1554     sum1  = 0.0;
1555     sum2  = 0.0;
1556     sum3  = 0.0;
1557     sum4  = 0.0;
1558     sum5  = 0.0;
1559     sum6  = 0.0;
1560     sum7  = 0.0;
1561     sum8  = 0.0;
1562     sum9  = 0.0;
1563     sum10 = 0.0;
1564     sum11 = 0.0;
1565     sum12 = 0.0;
1566     sum13 = 0.0;
1567     sum14 = 0.0;
1568     sum15 = 0.0;
1569 
1570     for (j = 0; j < n; j++) {
1571       xb = x + 15 * (idx[j]);
1572 
1573       for (k = 0; k < 15; k++) {
1574         xv = xb[k];
1575         sum1 += v[0] * xv;
1576         sum2 += v[1] * xv;
1577         sum3 += v[2] * xv;
1578         sum4 += v[3] * xv;
1579         sum5 += v[4] * xv;
1580         sum6 += v[5] * xv;
1581         sum7 += v[6] * xv;
1582         sum8 += v[7] * xv;
1583         sum9 += v[8] * xv;
1584         sum10 += v[9] * xv;
1585         sum11 += v[10] * xv;
1586         sum12 += v[11] * xv;
1587         sum13 += v[12] * xv;
1588         sum14 += v[13] * xv;
1589         sum15 += v[14] * xv;
1590         v += 15;
1591       }
1592     }
1593     if (usecprow) z = zarray + 15 * ridx[i];
1594     z[0]  = sum1;
1595     z[1]  = sum2;
1596     z[2]  = sum3;
1597     z[3]  = sum4;
1598     z[4]  = sum5;
1599     z[5]  = sum6;
1600     z[6]  = sum7;
1601     z[7]  = sum8;
1602     z[8]  = sum9;
1603     z[9]  = sum10;
1604     z[10] = sum11;
1605     z[11] = sum12;
1606     z[12] = sum13;
1607     z[13] = sum14;
1608     z[14] = sum15;
1609 
1610     if (!usecprow) z += 15;
1611   }
1612 
1613   PetscCall(VecRestoreArrayRead(xx, &x));
1614   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1615   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1616   PetscFunctionReturn(0);
1617 }
1618 
1619 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1620 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1621 {
1622   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1623   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1624   const PetscScalar *x, *xb;
1625   PetscScalar        x1, x2, x3, x4, *zarray;
1626   const MatScalar   *v;
1627   const PetscInt    *ii, *ij = a->j, *idx;
1628   PetscInt           mbs, i, j, n, *ridx = NULL;
1629   PetscBool          usecprow = a->compressedrow.use;
1630 
1631   PetscFunctionBegin;
1632   PetscCall(VecGetArrayRead(xx, &x));
1633   PetscCall(VecGetArrayWrite(zz, &zarray));
1634 
1635   v = a->a;
1636   if (usecprow) {
1637     mbs  = a->compressedrow.nrows;
1638     ii   = a->compressedrow.i;
1639     ridx = a->compressedrow.rindex;
1640     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1641   } else {
1642     mbs = a->mbs;
1643     ii  = a->i;
1644     z   = zarray;
1645   }
1646 
1647   for (i = 0; i < mbs; i++) {
1648     n     = ii[i + 1] - ii[i];
1649     idx   = ij + ii[i];
1650     sum1  = 0.0;
1651     sum2  = 0.0;
1652     sum3  = 0.0;
1653     sum4  = 0.0;
1654     sum5  = 0.0;
1655     sum6  = 0.0;
1656     sum7  = 0.0;
1657     sum8  = 0.0;
1658     sum9  = 0.0;
1659     sum10 = 0.0;
1660     sum11 = 0.0;
1661     sum12 = 0.0;
1662     sum13 = 0.0;
1663     sum14 = 0.0;
1664     sum15 = 0.0;
1665 
1666     for (j = 0; j < n; j++) {
1667       xb = x + 15 * (idx[j]);
1668       x1 = xb[0];
1669       x2 = xb[1];
1670       x3 = xb[2];
1671       x4 = xb[3];
1672 
1673       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1674       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1675       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1676       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1677       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1678       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1679       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1680       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1681       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1682       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1683       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1684       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1685       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1686       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1687       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1688 
1689       v += 60;
1690 
1691       x1 = xb[4];
1692       x2 = xb[5];
1693       x3 = xb[6];
1694       x4 = xb[7];
1695 
1696       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1697       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1698       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1699       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1700       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1701       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1702       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1703       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1704       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1705       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1706       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1707       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1708       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1709       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1710       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1711       v += 60;
1712 
1713       x1 = xb[8];
1714       x2 = xb[9];
1715       x3 = xb[10];
1716       x4 = xb[11];
1717       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1718       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1719       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1720       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1721       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1722       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1723       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1724       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1725       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1726       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1727       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1728       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1729       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1730       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1731       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1732       v += 60;
1733 
1734       x1 = xb[12];
1735       x2 = xb[13];
1736       x3 = xb[14];
1737       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1738       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1739       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1740       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1741       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1742       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1743       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1744       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1745       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1746       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1747       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1748       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1749       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1750       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1751       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1752       v += 45;
1753     }
1754     if (usecprow) z = zarray + 15 * ridx[i];
1755     z[0]  = sum1;
1756     z[1]  = sum2;
1757     z[2]  = sum3;
1758     z[3]  = sum4;
1759     z[4]  = sum5;
1760     z[5]  = sum6;
1761     z[6]  = sum7;
1762     z[7]  = sum8;
1763     z[8]  = sum9;
1764     z[9]  = sum10;
1765     z[10] = sum11;
1766     z[11] = sum12;
1767     z[12] = sum13;
1768     z[13] = sum14;
1769     z[14] = sum15;
1770 
1771     if (!usecprow) z += 15;
1772   }
1773 
1774   PetscCall(VecRestoreArrayRead(xx, &x));
1775   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1776   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1777   PetscFunctionReturn(0);
1778 }
1779 
1780 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1781 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1782 {
1783   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1784   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1785   const PetscScalar *x, *xb;
1786   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1787   const MatScalar   *v;
1788   const PetscInt    *ii, *ij = a->j, *idx;
1789   PetscInt           mbs, i, j, n, *ridx = NULL;
1790   PetscBool          usecprow = a->compressedrow.use;
1791 
1792   PetscFunctionBegin;
1793   PetscCall(VecGetArrayRead(xx, &x));
1794   PetscCall(VecGetArrayWrite(zz, &zarray));
1795 
1796   v = a->a;
1797   if (usecprow) {
1798     mbs  = a->compressedrow.nrows;
1799     ii   = a->compressedrow.i;
1800     ridx = a->compressedrow.rindex;
1801     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1802   } else {
1803     mbs = a->mbs;
1804     ii  = a->i;
1805     z   = zarray;
1806   }
1807 
1808   for (i = 0; i < mbs; i++) {
1809     n     = ii[i + 1] - ii[i];
1810     idx   = ij + ii[i];
1811     sum1  = 0.0;
1812     sum2  = 0.0;
1813     sum3  = 0.0;
1814     sum4  = 0.0;
1815     sum5  = 0.0;
1816     sum6  = 0.0;
1817     sum7  = 0.0;
1818     sum8  = 0.0;
1819     sum9  = 0.0;
1820     sum10 = 0.0;
1821     sum11 = 0.0;
1822     sum12 = 0.0;
1823     sum13 = 0.0;
1824     sum14 = 0.0;
1825     sum15 = 0.0;
1826 
1827     for (j = 0; j < n; j++) {
1828       xb = x + 15 * (idx[j]);
1829       x1 = xb[0];
1830       x2 = xb[1];
1831       x3 = xb[2];
1832       x4 = xb[3];
1833       x5 = xb[4];
1834       x6 = xb[5];
1835       x7 = xb[6];
1836       x8 = xb[7];
1837 
1838       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1839       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1840       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1841       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1842       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1843       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1844       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1845       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1846       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1847       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1848       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1849       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1850       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1851       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1852       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1853       v += 120;
1854 
1855       x1 = xb[8];
1856       x2 = xb[9];
1857       x3 = xb[10];
1858       x4 = xb[11];
1859       x5 = xb[12];
1860       x6 = xb[13];
1861       x7 = xb[14];
1862 
1863       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1864       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1865       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1866       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1867       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1868       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1869       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1870       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1871       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1872       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1873       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1874       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1875       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1876       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1877       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1878       v += 105;
1879     }
1880     if (usecprow) z = zarray + 15 * ridx[i];
1881     z[0]  = sum1;
1882     z[1]  = sum2;
1883     z[2]  = sum3;
1884     z[3]  = sum4;
1885     z[4]  = sum5;
1886     z[5]  = sum6;
1887     z[6]  = sum7;
1888     z[7]  = sum8;
1889     z[8]  = sum9;
1890     z[9]  = sum10;
1891     z[10] = sum11;
1892     z[11] = sum12;
1893     z[12] = sum13;
1894     z[13] = sum14;
1895     z[14] = sum15;
1896 
1897     if (!usecprow) z += 15;
1898   }
1899 
1900   PetscCall(VecRestoreArrayRead(xx, &x));
1901   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1902   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1903   PetscFunctionReturn(0);
1904 }
1905 
1906 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1907 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1908 {
1909   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1910   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1911   const PetscScalar *x, *xb;
1912   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1913   const MatScalar   *v;
1914   const PetscInt    *ii, *ij = a->j, *idx;
1915   PetscInt           mbs, i, j, n, *ridx = NULL;
1916   PetscBool          usecprow = a->compressedrow.use;
1917 
1918   PetscFunctionBegin;
1919   PetscCall(VecGetArrayRead(xx, &x));
1920   PetscCall(VecGetArrayWrite(zz, &zarray));
1921 
1922   v = a->a;
1923   if (usecprow) {
1924     mbs  = a->compressedrow.nrows;
1925     ii   = a->compressedrow.i;
1926     ridx = a->compressedrow.rindex;
1927     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1928   } else {
1929     mbs = a->mbs;
1930     ii  = a->i;
1931     z   = zarray;
1932   }
1933 
1934   for (i = 0; i < mbs; i++) {
1935     n     = ii[i + 1] - ii[i];
1936     idx   = ij + ii[i];
1937     sum1  = 0.0;
1938     sum2  = 0.0;
1939     sum3  = 0.0;
1940     sum4  = 0.0;
1941     sum5  = 0.0;
1942     sum6  = 0.0;
1943     sum7  = 0.0;
1944     sum8  = 0.0;
1945     sum9  = 0.0;
1946     sum10 = 0.0;
1947     sum11 = 0.0;
1948     sum12 = 0.0;
1949     sum13 = 0.0;
1950     sum14 = 0.0;
1951     sum15 = 0.0;
1952 
1953     for (j = 0; j < n; j++) {
1954       xb  = x + 15 * (idx[j]);
1955       x1  = xb[0];
1956       x2  = xb[1];
1957       x3  = xb[2];
1958       x4  = xb[3];
1959       x5  = xb[4];
1960       x6  = xb[5];
1961       x7  = xb[6];
1962       x8  = xb[7];
1963       x9  = xb[8];
1964       x10 = xb[9];
1965       x11 = xb[10];
1966       x12 = xb[11];
1967       x13 = xb[12];
1968       x14 = xb[13];
1969       x15 = xb[14];
1970 
1971       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1972       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1973       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1974       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1975       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1976       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1977       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1978       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1979       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1980       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1981       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1982       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1983       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1984       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1985       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1986       v += 225;
1987     }
1988     if (usecprow) z = zarray + 15 * ridx[i];
1989     z[0]  = sum1;
1990     z[1]  = sum2;
1991     z[2]  = sum3;
1992     z[3]  = sum4;
1993     z[4]  = sum5;
1994     z[5]  = sum6;
1995     z[6]  = sum7;
1996     z[7]  = sum8;
1997     z[8]  = sum9;
1998     z[9]  = sum10;
1999     z[10] = sum11;
2000     z[11] = sum12;
2001     z[12] = sum13;
2002     z[13] = sum14;
2003     z[14] = sum15;
2004 
2005     if (!usecprow) z += 15;
2006   }
2007 
2008   PetscCall(VecRestoreArrayRead(xx, &x));
2009   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2010   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
2011   PetscFunctionReturn(0);
2012 }
2013 
2014 /*
2015     This will not work with MatScalar == float because it calls the BLAS
2016 */
2017 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
2018 {
2019   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2020   PetscScalar       *z = NULL, *work, *workt, *zarray;
2021   const PetscScalar *x, *xb;
2022   const MatScalar   *v;
2023   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2024   const PetscInt    *idx, *ii, *ridx = NULL;
2025   PetscInt           ncols, k;
2026   PetscBool          usecprow = a->compressedrow.use;
2027 
2028   PetscFunctionBegin;
2029   PetscCall(VecGetArrayRead(xx, &x));
2030   PetscCall(VecGetArrayWrite(zz, &zarray));
2031 
2032   idx = a->j;
2033   v   = a->a;
2034   if (usecprow) {
2035     mbs  = a->compressedrow.nrows;
2036     ii   = a->compressedrow.i;
2037     ridx = a->compressedrow.rindex;
2038     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2039   } else {
2040     mbs = a->mbs;
2041     ii  = a->i;
2042     z   = zarray;
2043   }
2044 
2045   if (!a->mult_work) {
2046     k = PetscMax(A->rmap->n, A->cmap->n);
2047     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2048   }
2049   work = a->mult_work;
2050   for (i = 0; i < mbs; i++) {
2051     n = ii[1] - ii[0];
2052     ii++;
2053     ncols = n * bs;
2054     workt = work;
2055     for (j = 0; j < n; j++) {
2056       xb = x + bs * (*idx++);
2057       for (k = 0; k < bs; k++) workt[k] = xb[k];
2058       workt += bs;
2059     }
2060     if (usecprow) z = zarray + bs * ridx[i];
2061     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2062     v += n * bs2;
2063     if (!usecprow) z += bs;
2064   }
2065   PetscCall(VecRestoreArrayRead(xx, &x));
2066   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2067   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2068   PetscFunctionReturn(0);
2069 }
2070 
2071 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2072 {
2073   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2074   const PetscScalar *x;
2075   PetscScalar       *y, *z, sum;
2076   const MatScalar   *v;
2077   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2078   const PetscInt    *idx, *ii;
2079   PetscBool          usecprow = a->compressedrow.use;
2080 
2081   PetscFunctionBegin;
2082   PetscCall(VecGetArrayRead(xx, &x));
2083   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
2084 
2085   idx = a->j;
2086   v   = a->a;
2087   if (usecprow) {
2088     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2089     mbs  = a->compressedrow.nrows;
2090     ii   = a->compressedrow.i;
2091     ridx = a->compressedrow.rindex;
2092   } else {
2093     ii = a->i;
2094   }
2095 
2096   for (i = 0; i < mbs; i++) {
2097     n = ii[1] - ii[0];
2098     ii++;
2099     if (!usecprow) {
2100       sum = y[i];
2101     } else {
2102       sum = y[ridx[i]];
2103     }
2104     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2105     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2106     PetscSparseDensePlusDot(sum, x, v, idx, n);
2107     v += n;
2108     idx += n;
2109     if (usecprow) {
2110       z[ridx[i]] = sum;
2111     } else {
2112       z[i] = sum;
2113     }
2114   }
2115   PetscCall(VecRestoreArrayRead(xx, &x));
2116   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2117   PetscCall(PetscLogFlops(2.0 * a->nz));
2118   PetscFunctionReturn(0);
2119 }
2120 
2121 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2122 {
2123   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2124   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2125   const PetscScalar *x, *xb;
2126   PetscScalar        x1, x2, *yarray, *zarray;
2127   const MatScalar   *v;
2128   PetscInt           mbs = a->mbs, i, n, j;
2129   const PetscInt    *idx, *ii, *ridx = NULL;
2130   PetscBool          usecprow = a->compressedrow.use;
2131 
2132   PetscFunctionBegin;
2133   PetscCall(VecGetArrayRead(xx, &x));
2134   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2135 
2136   idx = a->j;
2137   v   = a->a;
2138   if (usecprow) {
2139     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2140     mbs  = a->compressedrow.nrows;
2141     ii   = a->compressedrow.i;
2142     ridx = a->compressedrow.rindex;
2143   } else {
2144     ii = a->i;
2145     y  = yarray;
2146     z  = zarray;
2147   }
2148 
2149   for (i = 0; i < mbs; i++) {
2150     n = ii[1] - ii[0];
2151     ii++;
2152     if (usecprow) {
2153       z = zarray + 2 * ridx[i];
2154       y = yarray + 2 * ridx[i];
2155     }
2156     sum1 = y[0];
2157     sum2 = y[1];
2158     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2159     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2160     for (j = 0; j < n; j++) {
2161       xb = x + 2 * (*idx++);
2162       x1 = xb[0];
2163       x2 = xb[1];
2164 
2165       sum1 += v[0] * x1 + v[2] * x2;
2166       sum2 += v[1] * x1 + v[3] * x2;
2167       v += 4;
2168     }
2169     z[0] = sum1;
2170     z[1] = sum2;
2171     if (!usecprow) {
2172       z += 2;
2173       y += 2;
2174     }
2175   }
2176   PetscCall(VecRestoreArrayRead(xx, &x));
2177   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2178   PetscCall(PetscLogFlops(4.0 * a->nz));
2179   PetscFunctionReturn(0);
2180 }
2181 
2182 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2183 {
2184   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2185   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2186   const PetscScalar *x, *xb;
2187   const MatScalar   *v;
2188   PetscInt           mbs = a->mbs, i, j, n;
2189   const PetscInt    *idx, *ii, *ridx = NULL;
2190   PetscBool          usecprow = a->compressedrow.use;
2191 
2192   PetscFunctionBegin;
2193   PetscCall(VecGetArrayRead(xx, &x));
2194   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2195 
2196   idx = a->j;
2197   v   = a->a;
2198   if (usecprow) {
2199     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2200     mbs  = a->compressedrow.nrows;
2201     ii   = a->compressedrow.i;
2202     ridx = a->compressedrow.rindex;
2203   } else {
2204     ii = a->i;
2205     y  = yarray;
2206     z  = zarray;
2207   }
2208 
2209   for (i = 0; i < mbs; i++) {
2210     n = ii[1] - ii[0];
2211     ii++;
2212     if (usecprow) {
2213       z = zarray + 3 * ridx[i];
2214       y = yarray + 3 * ridx[i];
2215     }
2216     sum1 = y[0];
2217     sum2 = y[1];
2218     sum3 = y[2];
2219     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2220     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2221     for (j = 0; j < n; j++) {
2222       xb = x + 3 * (*idx++);
2223       x1 = xb[0];
2224       x2 = xb[1];
2225       x3 = xb[2];
2226       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2227       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2228       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2229       v += 9;
2230     }
2231     z[0] = sum1;
2232     z[1] = sum2;
2233     z[2] = sum3;
2234     if (!usecprow) {
2235       z += 3;
2236       y += 3;
2237     }
2238   }
2239   PetscCall(VecRestoreArrayRead(xx, &x));
2240   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2241   PetscCall(PetscLogFlops(18.0 * a->nz));
2242   PetscFunctionReturn(0);
2243 }
2244 
2245 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2246 {
2247   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2248   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2249   const PetscScalar *x, *xb;
2250   const MatScalar   *v;
2251   PetscInt           mbs = a->mbs, i, j, n;
2252   const PetscInt    *idx, *ii, *ridx = NULL;
2253   PetscBool          usecprow = a->compressedrow.use;
2254 
2255   PetscFunctionBegin;
2256   PetscCall(VecGetArrayRead(xx, &x));
2257   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2258 
2259   idx = a->j;
2260   v   = a->a;
2261   if (usecprow) {
2262     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2263     mbs  = a->compressedrow.nrows;
2264     ii   = a->compressedrow.i;
2265     ridx = a->compressedrow.rindex;
2266   } else {
2267     ii = a->i;
2268     y  = yarray;
2269     z  = zarray;
2270   }
2271 
2272   for (i = 0; i < mbs; i++) {
2273     n = ii[1] - ii[0];
2274     ii++;
2275     if (usecprow) {
2276       z = zarray + 4 * ridx[i];
2277       y = yarray + 4 * ridx[i];
2278     }
2279     sum1 = y[0];
2280     sum2 = y[1];
2281     sum3 = y[2];
2282     sum4 = y[3];
2283     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2284     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2285     for (j = 0; j < n; j++) {
2286       xb = x + 4 * (*idx++);
2287       x1 = xb[0];
2288       x2 = xb[1];
2289       x3 = xb[2];
2290       x4 = xb[3];
2291       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2292       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2293       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2294       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2295       v += 16;
2296     }
2297     z[0] = sum1;
2298     z[1] = sum2;
2299     z[2] = sum3;
2300     z[3] = sum4;
2301     if (!usecprow) {
2302       z += 4;
2303       y += 4;
2304     }
2305   }
2306   PetscCall(VecRestoreArrayRead(xx, &x));
2307   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2308   PetscCall(PetscLogFlops(32.0 * a->nz));
2309   PetscFunctionReturn(0);
2310 }
2311 
2312 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2313 {
2314   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2315   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2316   const PetscScalar *x, *xb;
2317   PetscScalar       *yarray, *zarray;
2318   const MatScalar   *v;
2319   PetscInt           mbs = a->mbs, i, j, n;
2320   const PetscInt    *idx, *ii, *ridx = NULL;
2321   PetscBool          usecprow = a->compressedrow.use;
2322 
2323   PetscFunctionBegin;
2324   PetscCall(VecGetArrayRead(xx, &x));
2325   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2326 
2327   idx = a->j;
2328   v   = a->a;
2329   if (usecprow) {
2330     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2331     mbs  = a->compressedrow.nrows;
2332     ii   = a->compressedrow.i;
2333     ridx = a->compressedrow.rindex;
2334   } else {
2335     ii = a->i;
2336     y  = yarray;
2337     z  = zarray;
2338   }
2339 
2340   for (i = 0; i < mbs; i++) {
2341     n = ii[1] - ii[0];
2342     ii++;
2343     if (usecprow) {
2344       z = zarray + 5 * ridx[i];
2345       y = yarray + 5 * ridx[i];
2346     }
2347     sum1 = y[0];
2348     sum2 = y[1];
2349     sum3 = y[2];
2350     sum4 = y[3];
2351     sum5 = y[4];
2352     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2353     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2354     for (j = 0; j < n; j++) {
2355       xb = x + 5 * (*idx++);
2356       x1 = xb[0];
2357       x2 = xb[1];
2358       x3 = xb[2];
2359       x4 = xb[3];
2360       x5 = xb[4];
2361       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2362       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2363       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2364       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2365       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2366       v += 25;
2367     }
2368     z[0] = sum1;
2369     z[1] = sum2;
2370     z[2] = sum3;
2371     z[3] = sum4;
2372     z[4] = sum5;
2373     if (!usecprow) {
2374       z += 5;
2375       y += 5;
2376     }
2377   }
2378   PetscCall(VecRestoreArrayRead(xx, &x));
2379   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2380   PetscCall(PetscLogFlops(50.0 * a->nz));
2381   PetscFunctionReturn(0);
2382 }
2383 
2384 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2385 {
2386   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2387   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2388   const PetscScalar *x, *xb;
2389   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2390   const MatScalar   *v;
2391   PetscInt           mbs = a->mbs, i, j, n;
2392   const PetscInt    *idx, *ii, *ridx = NULL;
2393   PetscBool          usecprow = a->compressedrow.use;
2394 
2395   PetscFunctionBegin;
2396   PetscCall(VecGetArrayRead(xx, &x));
2397   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2398 
2399   idx = a->j;
2400   v   = a->a;
2401   if (usecprow) {
2402     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2403     mbs  = a->compressedrow.nrows;
2404     ii   = a->compressedrow.i;
2405     ridx = a->compressedrow.rindex;
2406   } else {
2407     ii = a->i;
2408     y  = yarray;
2409     z  = zarray;
2410   }
2411 
2412   for (i = 0; i < mbs; i++) {
2413     n = ii[1] - ii[0];
2414     ii++;
2415     if (usecprow) {
2416       z = zarray + 6 * ridx[i];
2417       y = yarray + 6 * ridx[i];
2418     }
2419     sum1 = y[0];
2420     sum2 = y[1];
2421     sum3 = y[2];
2422     sum4 = y[3];
2423     sum5 = y[4];
2424     sum6 = y[5];
2425     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2426     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2427     for (j = 0; j < n; j++) {
2428       xb = x + 6 * (*idx++);
2429       x1 = xb[0];
2430       x2 = xb[1];
2431       x3 = xb[2];
2432       x4 = xb[3];
2433       x5 = xb[4];
2434       x6 = xb[5];
2435       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2436       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2437       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2438       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2439       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2440       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2441       v += 36;
2442     }
2443     z[0] = sum1;
2444     z[1] = sum2;
2445     z[2] = sum3;
2446     z[3] = sum4;
2447     z[4] = sum5;
2448     z[5] = sum6;
2449     if (!usecprow) {
2450       z += 6;
2451       y += 6;
2452     }
2453   }
2454   PetscCall(VecRestoreArrayRead(xx, &x));
2455   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2456   PetscCall(PetscLogFlops(72.0 * a->nz));
2457   PetscFunctionReturn(0);
2458 }
2459 
2460 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2461 {
2462   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2463   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2464   const PetscScalar *x, *xb;
2465   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2466   const MatScalar   *v;
2467   PetscInt           mbs = a->mbs, i, j, n;
2468   const PetscInt    *idx, *ii, *ridx = NULL;
2469   PetscBool          usecprow = a->compressedrow.use;
2470 
2471   PetscFunctionBegin;
2472   PetscCall(VecGetArrayRead(xx, &x));
2473   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2474 
2475   idx = a->j;
2476   v   = a->a;
2477   if (usecprow) {
2478     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2479     mbs  = a->compressedrow.nrows;
2480     ii   = a->compressedrow.i;
2481     ridx = a->compressedrow.rindex;
2482   } else {
2483     ii = a->i;
2484     y  = yarray;
2485     z  = zarray;
2486   }
2487 
2488   for (i = 0; i < mbs; i++) {
2489     n = ii[1] - ii[0];
2490     ii++;
2491     if (usecprow) {
2492       z = zarray + 7 * ridx[i];
2493       y = yarray + 7 * ridx[i];
2494     }
2495     sum1 = y[0];
2496     sum2 = y[1];
2497     sum3 = y[2];
2498     sum4 = y[3];
2499     sum5 = y[4];
2500     sum6 = y[5];
2501     sum7 = y[6];
2502     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2503     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2504     for (j = 0; j < n; j++) {
2505       xb = x + 7 * (*idx++);
2506       x1 = xb[0];
2507       x2 = xb[1];
2508       x3 = xb[2];
2509       x4 = xb[3];
2510       x5 = xb[4];
2511       x6 = xb[5];
2512       x7 = xb[6];
2513       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2514       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2515       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2516       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2517       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2518       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2519       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2520       v += 49;
2521     }
2522     z[0] = sum1;
2523     z[1] = sum2;
2524     z[2] = sum3;
2525     z[3] = sum4;
2526     z[4] = sum5;
2527     z[5] = sum6;
2528     z[6] = sum7;
2529     if (!usecprow) {
2530       z += 7;
2531       y += 7;
2532     }
2533   }
2534   PetscCall(VecRestoreArrayRead(xx, &x));
2535   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2536   PetscCall(PetscLogFlops(98.0 * a->nz));
2537   PetscFunctionReturn(0);
2538 }
2539 
2540 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2541 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2542 {
2543   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2544   PetscScalar       *z = NULL, *work, *workt, *zarray;
2545   const PetscScalar *x, *xb;
2546   const MatScalar   *v;
2547   PetscInt           mbs, i, j, n;
2548   PetscInt           k;
2549   PetscBool          usecprow = a->compressedrow.use;
2550   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2551 
2552   __m256d a0, a1, a2, a3, a4, a5;
2553   __m256d w0, w1, w2, w3;
2554   __m256d z0, z1, z2;
2555   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2556 
2557   PetscFunctionBegin;
2558   PetscCall(VecCopy(yy, zz));
2559   PetscCall(VecGetArrayRead(xx, &x));
2560   PetscCall(VecGetArray(zz, &zarray));
2561 
2562   idx = a->j;
2563   v   = a->a;
2564   if (usecprow) {
2565     mbs  = a->compressedrow.nrows;
2566     ii   = a->compressedrow.i;
2567     ridx = a->compressedrow.rindex;
2568   } else {
2569     mbs = a->mbs;
2570     ii  = a->i;
2571     z   = zarray;
2572   }
2573 
2574   if (!a->mult_work) {
2575     k = PetscMax(A->rmap->n, A->cmap->n);
2576     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2577   }
2578 
2579   work = a->mult_work;
2580   for (i = 0; i < mbs; i++) {
2581     n = ii[1] - ii[0];
2582     ii++;
2583     workt = work;
2584     for (j = 0; j < n; j++) {
2585       xb = x + bs * (*idx++);
2586       for (k = 0; k < bs; k++) workt[k] = xb[k];
2587       workt += bs;
2588     }
2589     if (usecprow) z = zarray + bs * ridx[i];
2590 
2591     z0 = _mm256_loadu_pd(&z[0]);
2592     z1 = _mm256_loadu_pd(&z[4]);
2593     z2 = _mm256_set1_pd(z[8]);
2594 
2595     for (j = 0; j < n; j++) {
2596       /* first column of a */
2597       w0 = _mm256_set1_pd(work[j * 9]);
2598       a0 = _mm256_loadu_pd(&v[j * 81]);
2599       z0 = _mm256_fmadd_pd(a0, w0, z0);
2600       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2601       z1 = _mm256_fmadd_pd(a1, w0, z1);
2602       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2603       z2 = _mm256_fmadd_pd(a2, w0, z2);
2604 
2605       /* second column of a */
2606       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2607       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2608       z0 = _mm256_fmadd_pd(a0, w1, z0);
2609       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2610       z1 = _mm256_fmadd_pd(a1, w1, z1);
2611       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2612       z2 = _mm256_fmadd_pd(a2, w1, z2);
2613 
2614       /* third column of a */
2615       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2616       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2617       z0 = _mm256_fmadd_pd(a3, w2, z0);
2618       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2619       z1 = _mm256_fmadd_pd(a4, w2, z1);
2620       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2621       z2 = _mm256_fmadd_pd(a5, w2, z2);
2622 
2623       /* fourth column of a */
2624       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2625       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2626       z0 = _mm256_fmadd_pd(a0, w3, z0);
2627       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2628       z1 = _mm256_fmadd_pd(a1, w3, z1);
2629       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2630       z2 = _mm256_fmadd_pd(a2, w3, z2);
2631 
2632       /* fifth column of a */
2633       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2634       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2635       z0 = _mm256_fmadd_pd(a3, w0, z0);
2636       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2637       z1 = _mm256_fmadd_pd(a4, w0, z1);
2638       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2639       z2 = _mm256_fmadd_pd(a5, w0, z2);
2640 
2641       /* sixth column of a */
2642       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2643       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2644       z0 = _mm256_fmadd_pd(a0, w1, z0);
2645       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2646       z1 = _mm256_fmadd_pd(a1, w1, z1);
2647       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2648       z2 = _mm256_fmadd_pd(a2, w1, z2);
2649 
2650       /* seventh column of a */
2651       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2652       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2653       z0 = _mm256_fmadd_pd(a0, w2, z0);
2654       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2655       z1 = _mm256_fmadd_pd(a1, w2, z1);
2656       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2657       z2 = _mm256_fmadd_pd(a2, w2, z2);
2658 
2659       /* eighth column of a */
2660       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2661       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2662       z0 = _mm256_fmadd_pd(a3, w3, z0);
2663       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2664       z1 = _mm256_fmadd_pd(a4, w3, z1);
2665       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2666       z2 = _mm256_fmadd_pd(a5, w3, z2);
2667 
2668       /* ninth column of a */
2669       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2670       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2671       z0 = _mm256_fmadd_pd(a0, w0, z0);
2672       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2673       z1 = _mm256_fmadd_pd(a1, w0, z1);
2674       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2675       z2 = _mm256_fmadd_pd(a2, w0, z2);
2676     }
2677 
2678     _mm256_storeu_pd(&z[0], z0);
2679     _mm256_storeu_pd(&z[4], z1);
2680     _mm256_maskstore_pd(&z[8], mask1, z2);
2681 
2682     v += n * bs2;
2683     if (!usecprow) z += bs;
2684   }
2685   PetscCall(VecRestoreArrayRead(xx, &x));
2686   PetscCall(VecRestoreArray(zz, &zarray));
2687   PetscCall(PetscLogFlops(162.0 * a->nz));
2688   PetscFunctionReturn(0);
2689 }
2690 #endif
2691 
2692 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2693 {
2694   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2695   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2696   const PetscScalar *x, *xb;
2697   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2698   const MatScalar   *v;
2699   PetscInt           mbs = a->mbs, i, j, n;
2700   const PetscInt    *idx, *ii, *ridx = NULL;
2701   PetscBool          usecprow = a->compressedrow.use;
2702 
2703   PetscFunctionBegin;
2704   PetscCall(VecGetArrayRead(xx, &x));
2705   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2706 
2707   idx = a->j;
2708   v   = a->a;
2709   if (usecprow) {
2710     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2711     mbs  = a->compressedrow.nrows;
2712     ii   = a->compressedrow.i;
2713     ridx = a->compressedrow.rindex;
2714   } else {
2715     ii = a->i;
2716     y  = yarray;
2717     z  = zarray;
2718   }
2719 
2720   for (i = 0; i < mbs; i++) {
2721     n = ii[1] - ii[0];
2722     ii++;
2723     if (usecprow) {
2724       z = zarray + 11 * ridx[i];
2725       y = yarray + 11 * ridx[i];
2726     }
2727     sum1  = y[0];
2728     sum2  = y[1];
2729     sum3  = y[2];
2730     sum4  = y[3];
2731     sum5  = y[4];
2732     sum6  = y[5];
2733     sum7  = y[6];
2734     sum8  = y[7];
2735     sum9  = y[8];
2736     sum10 = y[9];
2737     sum11 = y[10];
2738     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2739     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2740     for (j = 0; j < n; j++) {
2741       xb  = x + 11 * (*idx++);
2742       x1  = xb[0];
2743       x2  = xb[1];
2744       x3  = xb[2];
2745       x4  = xb[3];
2746       x5  = xb[4];
2747       x6  = xb[5];
2748       x7  = xb[6];
2749       x8  = xb[7];
2750       x9  = xb[8];
2751       x10 = xb[9];
2752       x11 = xb[10];
2753       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2754       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2755       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2756       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2757       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2758       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2759       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2760       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2761       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2762       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2763       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2764       v += 121;
2765     }
2766     z[0]  = sum1;
2767     z[1]  = sum2;
2768     z[2]  = sum3;
2769     z[3]  = sum4;
2770     z[4]  = sum5;
2771     z[5]  = sum6;
2772     z[6]  = sum7;
2773     z[7]  = sum8;
2774     z[8]  = sum9;
2775     z[9]  = sum10;
2776     z[10] = sum11;
2777     if (!usecprow) {
2778       z += 11;
2779       y += 11;
2780     }
2781   }
2782   PetscCall(VecRestoreArrayRead(xx, &x));
2783   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2784   PetscCall(PetscLogFlops(242.0 * a->nz));
2785   PetscFunctionReturn(0);
2786 }
2787 
2788 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2789 {
2790   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2791   PetscScalar       *z = NULL, *work, *workt, *zarray;
2792   const PetscScalar *x, *xb;
2793   const MatScalar   *v;
2794   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2795   PetscInt           ncols, k;
2796   const PetscInt    *ridx     = NULL, *idx, *ii;
2797   PetscBool          usecprow = a->compressedrow.use;
2798 
2799   PetscFunctionBegin;
2800   PetscCall(VecCopy(yy, zz));
2801   PetscCall(VecGetArrayRead(xx, &x));
2802   PetscCall(VecGetArray(zz, &zarray));
2803 
2804   idx = a->j;
2805   v   = a->a;
2806   if (usecprow) {
2807     mbs  = a->compressedrow.nrows;
2808     ii   = a->compressedrow.i;
2809     ridx = a->compressedrow.rindex;
2810   } else {
2811     mbs = a->mbs;
2812     ii  = a->i;
2813     z   = zarray;
2814   }
2815 
2816   if (!a->mult_work) {
2817     k = PetscMax(A->rmap->n, A->cmap->n);
2818     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2819   }
2820   work = a->mult_work;
2821   for (i = 0; i < mbs; i++) {
2822     n = ii[1] - ii[0];
2823     ii++;
2824     ncols = n * bs;
2825     workt = work;
2826     for (j = 0; j < n; j++) {
2827       xb = x + bs * (*idx++);
2828       for (k = 0; k < bs; k++) workt[k] = xb[k];
2829       workt += bs;
2830     }
2831     if (usecprow) z = zarray + bs * ridx[i];
2832     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2833     v += n * bs2;
2834     if (!usecprow) z += bs;
2835   }
2836   PetscCall(VecRestoreArrayRead(xx, &x));
2837   PetscCall(VecRestoreArray(zz, &zarray));
2838   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2839   PetscFunctionReturn(0);
2840 }
2841 
2842 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2843 {
2844   PetscScalar zero = 0.0;
2845 
2846   PetscFunctionBegin;
2847   PetscCall(VecSet(zz, zero));
2848   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2849   PetscFunctionReturn(0);
2850 }
2851 
2852 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2853 {
2854   PetscScalar zero = 0.0;
2855 
2856   PetscFunctionBegin;
2857   PetscCall(VecSet(zz, zero));
2858   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2859   PetscFunctionReturn(0);
2860 }
2861 
2862 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2863 {
2864   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2865   PetscScalar       *z, x1, x2, x3, x4, x5;
2866   const PetscScalar *x, *xb = NULL;
2867   const MatScalar   *v;
2868   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2869   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2870   Mat_CompressedRow  cprow    = a->compressedrow;
2871   PetscBool          usecprow = cprow.use;
2872 
2873   PetscFunctionBegin;
2874   if (yy != zz) PetscCall(VecCopy(yy, zz));
2875   PetscCall(VecGetArrayRead(xx, &x));
2876   PetscCall(VecGetArray(zz, &z));
2877 
2878   idx = a->j;
2879   v   = a->a;
2880   if (usecprow) {
2881     mbs  = cprow.nrows;
2882     ii   = cprow.i;
2883     ridx = cprow.rindex;
2884   } else {
2885     mbs = a->mbs;
2886     ii  = a->i;
2887     xb  = x;
2888   }
2889 
2890   switch (bs) {
2891   case 1:
2892     for (i = 0; i < mbs; i++) {
2893       if (usecprow) xb = x + ridx[i];
2894       x1 = xb[0];
2895       ib = idx + ii[0];
2896       n  = ii[1] - ii[0];
2897       ii++;
2898       for (j = 0; j < n; j++) {
2899         rval = ib[j];
2900         z[rval] += PetscConj(*v) * x1;
2901         v++;
2902       }
2903       if (!usecprow) xb++;
2904     }
2905     break;
2906   case 2:
2907     for (i = 0; i < mbs; i++) {
2908       if (usecprow) xb = x + 2 * ridx[i];
2909       x1 = xb[0];
2910       x2 = xb[1];
2911       ib = idx + ii[0];
2912       n  = ii[1] - ii[0];
2913       ii++;
2914       for (j = 0; j < n; j++) {
2915         rval = ib[j] * 2;
2916         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2917         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2918         v += 4;
2919       }
2920       if (!usecprow) xb += 2;
2921     }
2922     break;
2923   case 3:
2924     for (i = 0; i < mbs; i++) {
2925       if (usecprow) xb = x + 3 * ridx[i];
2926       x1 = xb[0];
2927       x2 = xb[1];
2928       x3 = xb[2];
2929       ib = idx + ii[0];
2930       n  = ii[1] - ii[0];
2931       ii++;
2932       for (j = 0; j < n; j++) {
2933         rval = ib[j] * 3;
2934         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2935         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2936         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2937         v += 9;
2938       }
2939       if (!usecprow) xb += 3;
2940     }
2941     break;
2942   case 4:
2943     for (i = 0; i < mbs; i++) {
2944       if (usecprow) xb = x + 4 * ridx[i];
2945       x1 = xb[0];
2946       x2 = xb[1];
2947       x3 = xb[2];
2948       x4 = xb[3];
2949       ib = idx + ii[0];
2950       n  = ii[1] - ii[0];
2951       ii++;
2952       for (j = 0; j < n; j++) {
2953         rval = ib[j] * 4;
2954         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2955         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2956         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2957         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2958         v += 16;
2959       }
2960       if (!usecprow) xb += 4;
2961     }
2962     break;
2963   case 5:
2964     for (i = 0; i < mbs; i++) {
2965       if (usecprow) xb = x + 5 * ridx[i];
2966       x1 = xb[0];
2967       x2 = xb[1];
2968       x3 = xb[2];
2969       x4 = xb[3];
2970       x5 = xb[4];
2971       ib = idx + ii[0];
2972       n  = ii[1] - ii[0];
2973       ii++;
2974       for (j = 0; j < n; j++) {
2975         rval = ib[j] * 5;
2976         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2977         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2978         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2979         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2980         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2981         v += 25;
2982       }
2983       if (!usecprow) xb += 5;
2984     }
2985     break;
2986   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2987     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2988 #if 0
2989     {
2990       PetscInt          ncols,k,bs2=a->bs2;
2991       PetscScalar       *work,*workt,zb;
2992       const PetscScalar *xtmp;
2993       if (!a->mult_work) {
2994         k    = PetscMax(A->rmap->n,A->cmap->n);
2995         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2996       }
2997       work = a->mult_work;
2998       xtmp = x;
2999       for (i=0; i<mbs; i++) {
3000         n     = ii[1] - ii[0]; ii++;
3001         ncols = n*bs;
3002         PetscCall(PetscArrayzero(work,ncols));
3003         if (usecprow) xtmp = x + bs*ridx[i];
3004         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
3005         v += n*bs2;
3006         if (!usecprow) xtmp += bs;
3007         workt = work;
3008         for (j=0; j<n; j++) {
3009           zb = z + bs*(*idx++);
3010           for (k=0; k<bs; k++) zb[k] += workt[k] ;
3011           workt += bs;
3012         }
3013       }
3014     }
3015 #endif
3016   }
3017   PetscCall(VecRestoreArrayRead(xx, &x));
3018   PetscCall(VecRestoreArray(zz, &z));
3019   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3020   PetscFunctionReturn(0);
3021 }
3022 
3023 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
3024 {
3025   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3026   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
3027   const PetscScalar *x, *xb = NULL;
3028   const MatScalar   *v;
3029   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3030   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
3031   Mat_CompressedRow  cprow    = a->compressedrow;
3032   PetscBool          usecprow = cprow.use;
3033 
3034   PetscFunctionBegin;
3035   if (yy != zz) PetscCall(VecCopy(yy, zz));
3036   PetscCall(VecGetArrayRead(xx, &x));
3037   PetscCall(VecGetArray(zz, &z));
3038 
3039   idx = a->j;
3040   v   = a->a;
3041   if (usecprow) {
3042     mbs  = cprow.nrows;
3043     ii   = cprow.i;
3044     ridx = cprow.rindex;
3045   } else {
3046     mbs = a->mbs;
3047     ii  = a->i;
3048     xb  = x;
3049   }
3050 
3051   switch (bs) {
3052   case 1:
3053     for (i = 0; i < mbs; i++) {
3054       if (usecprow) xb = x + ridx[i];
3055       x1 = xb[0];
3056       ib = idx + ii[0];
3057       n  = ii[1] - ii[0];
3058       ii++;
3059       for (j = 0; j < n; j++) {
3060         rval = ib[j];
3061         z[rval] += *v * x1;
3062         v++;
3063       }
3064       if (!usecprow) xb++;
3065     }
3066     break;
3067   case 2:
3068     for (i = 0; i < mbs; i++) {
3069       if (usecprow) xb = x + 2 * ridx[i];
3070       x1 = xb[0];
3071       x2 = xb[1];
3072       ib = idx + ii[0];
3073       n  = ii[1] - ii[0];
3074       ii++;
3075       for (j = 0; j < n; j++) {
3076         rval = ib[j] * 2;
3077         z[rval++] += v[0] * x1 + v[1] * x2;
3078         z[rval++] += v[2] * x1 + v[3] * x2;
3079         v += 4;
3080       }
3081       if (!usecprow) xb += 2;
3082     }
3083     break;
3084   case 3:
3085     for (i = 0; i < mbs; i++) {
3086       if (usecprow) xb = x + 3 * ridx[i];
3087       x1 = xb[0];
3088       x2 = xb[1];
3089       x3 = xb[2];
3090       ib = idx + ii[0];
3091       n  = ii[1] - ii[0];
3092       ii++;
3093       for (j = 0; j < n; j++) {
3094         rval = ib[j] * 3;
3095         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3096         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3097         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3098         v += 9;
3099       }
3100       if (!usecprow) xb += 3;
3101     }
3102     break;
3103   case 4:
3104     for (i = 0; i < mbs; i++) {
3105       if (usecprow) xb = x + 4 * ridx[i];
3106       x1 = xb[0];
3107       x2 = xb[1];
3108       x3 = xb[2];
3109       x4 = xb[3];
3110       ib = idx + ii[0];
3111       n  = ii[1] - ii[0];
3112       ii++;
3113       for (j = 0; j < n; j++) {
3114         rval = ib[j] * 4;
3115         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3116         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3117         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3118         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3119         v += 16;
3120       }
3121       if (!usecprow) xb += 4;
3122     }
3123     break;
3124   case 5:
3125     for (i = 0; i < mbs; i++) {
3126       if (usecprow) xb = x + 5 * ridx[i];
3127       x1 = xb[0];
3128       x2 = xb[1];
3129       x3 = xb[2];
3130       x4 = xb[3];
3131       x5 = xb[4];
3132       ib = idx + ii[0];
3133       n  = ii[1] - ii[0];
3134       ii++;
3135       for (j = 0; j < n; j++) {
3136         rval = ib[j] * 5;
3137         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3138         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3139         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3140         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3141         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3142         v += 25;
3143       }
3144       if (!usecprow) xb += 5;
3145     }
3146     break;
3147   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3148     PetscInt           ncols, k;
3149     PetscScalar       *work, *workt;
3150     const PetscScalar *xtmp;
3151     if (!a->mult_work) {
3152       k = PetscMax(A->rmap->n, A->cmap->n);
3153       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3154     }
3155     work = a->mult_work;
3156     xtmp = x;
3157     for (i = 0; i < mbs; i++) {
3158       n = ii[1] - ii[0];
3159       ii++;
3160       ncols = n * bs;
3161       PetscCall(PetscArrayzero(work, ncols));
3162       if (usecprow) xtmp = x + bs * ridx[i];
3163       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3164       v += n * bs2;
3165       if (!usecprow) xtmp += bs;
3166       workt = work;
3167       for (j = 0; j < n; j++) {
3168         zb = z + bs * (*idx++);
3169         for (k = 0; k < bs; k++) zb[k] += workt[k];
3170         workt += bs;
3171       }
3172     }
3173   }
3174   }
3175   PetscCall(VecRestoreArrayRead(xx, &x));
3176   PetscCall(VecRestoreArray(zz, &z));
3177   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3178   PetscFunctionReturn(0);
3179 }
3180 
3181 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3182 {
3183   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3184   PetscInt     totalnz = a->bs2 * a->nz;
3185   PetscScalar  oalpha  = alpha;
3186   PetscBLASInt one     = 1, tnz;
3187 
3188   PetscFunctionBegin;
3189   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3190   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3191   PetscCall(PetscLogFlops(totalnz));
3192   PetscFunctionReturn(0);
3193 }
3194 
3195 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3196 {
3197   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3198   MatScalar   *v   = a->a;
3199   PetscReal    sum = 0.0;
3200   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3201 
3202   PetscFunctionBegin;
3203   if (type == NORM_FROBENIUS) {
3204 #if defined(PETSC_USE_REAL___FP16)
3205     PetscBLASInt one = 1, cnt = bs2 * nz;
3206     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3207 #else
3208     for (i = 0; i < bs2 * nz; i++) {
3209       sum += PetscRealPart(PetscConj(*v) * (*v));
3210       v++;
3211     }
3212 #endif
3213     *norm = PetscSqrtReal(sum);
3214     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3215   } else if (type == NORM_1) { /* maximum column sum */
3216     PetscReal *tmp;
3217     PetscInt  *bcol = a->j;
3218     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3219     for (i = 0; i < nz; i++) {
3220       for (j = 0; j < bs; j++) {
3221         k1 = bs * (*bcol) + j; /* column index */
3222         for (k = 0; k < bs; k++) {
3223           tmp[k1] += PetscAbsScalar(*v);
3224           v++;
3225         }
3226       }
3227       bcol++;
3228     }
3229     *norm = 0.0;
3230     for (j = 0; j < A->cmap->n; j++) {
3231       if (tmp[j] > *norm) *norm = tmp[j];
3232     }
3233     PetscCall(PetscFree(tmp));
3234     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3235   } else if (type == NORM_INFINITY) { /* maximum row sum */
3236     *norm = 0.0;
3237     for (k = 0; k < bs; k++) {
3238       for (j = 0; j < a->mbs; j++) {
3239         v   = a->a + bs2 * a->i[j] + k;
3240         sum = 0.0;
3241         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3242           for (k1 = 0; k1 < bs; k1++) {
3243             sum += PetscAbsScalar(*v);
3244             v += bs;
3245           }
3246         }
3247         if (sum > *norm) *norm = sum;
3248       }
3249     }
3250     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3251   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3252   PetscFunctionReturn(0);
3253 }
3254 
3255 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3256 {
3257   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
3258 
3259   PetscFunctionBegin;
3260   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3261   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3262     *flg = PETSC_FALSE;
3263     PetscFunctionReturn(0);
3264   }
3265 
3266   /* if the a->i are the same */
3267   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3268   if (!*flg) PetscFunctionReturn(0);
3269 
3270   /* if a->j are the same */
3271   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3272   if (!*flg) PetscFunctionReturn(0);
3273 
3274   /* if a->a are the same */
3275   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3276   PetscFunctionReturn(0);
3277 }
3278 
3279 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3280 {
3281   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3282   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3283   PetscScalar *x, zero = 0.0;
3284   MatScalar   *aa, *aa_j;
3285 
3286   PetscFunctionBegin;
3287   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3288   bs   = A->rmap->bs;
3289   aa   = a->a;
3290   ai   = a->i;
3291   aj   = a->j;
3292   ambs = a->mbs;
3293   bs2  = a->bs2;
3294 
3295   PetscCall(VecSet(v, zero));
3296   PetscCall(VecGetArray(v, &x));
3297   PetscCall(VecGetLocalSize(v, &n));
3298   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3299   for (i = 0; i < ambs; i++) {
3300     for (j = ai[i]; j < ai[i + 1]; j++) {
3301       if (aj[j] == i) {
3302         row  = i * bs;
3303         aa_j = aa + j * bs2;
3304         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3305         break;
3306       }
3307     }
3308   }
3309   PetscCall(VecRestoreArray(v, &x));
3310   PetscFunctionReturn(0);
3311 }
3312 
3313 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3314 {
3315   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3316   const PetscScalar *l, *r, *li, *ri;
3317   PetscScalar        x;
3318   MatScalar         *aa, *v;
3319   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3320   const PetscInt    *ai, *aj;
3321 
3322   PetscFunctionBegin;
3323   ai  = a->i;
3324   aj  = a->j;
3325   aa  = a->a;
3326   m   = A->rmap->n;
3327   n   = A->cmap->n;
3328   bs  = A->rmap->bs;
3329   mbs = a->mbs;
3330   bs2 = a->bs2;
3331   if (ll) {
3332     PetscCall(VecGetArrayRead(ll, &l));
3333     PetscCall(VecGetLocalSize(ll, &lm));
3334     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3335     for (i = 0; i < mbs; i++) { /* for each block row */
3336       M  = ai[i + 1] - ai[i];
3337       li = l + i * bs;
3338       v  = aa + bs2 * ai[i];
3339       for (j = 0; j < M; j++) { /* for each block */
3340         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3341       }
3342     }
3343     PetscCall(VecRestoreArrayRead(ll, &l));
3344     PetscCall(PetscLogFlops(a->nz));
3345   }
3346 
3347   if (rr) {
3348     PetscCall(VecGetArrayRead(rr, &r));
3349     PetscCall(VecGetLocalSize(rr, &rn));
3350     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3351     for (i = 0; i < mbs; i++) { /* for each block row */
3352       iai = ai[i];
3353       M   = ai[i + 1] - iai;
3354       v   = aa + bs2 * iai;
3355       for (j = 0; j < M; j++) { /* for each block */
3356         ri = r + bs * aj[iai + j];
3357         for (k = 0; k < bs; k++) {
3358           x = ri[k];
3359           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3360           v += bs;
3361         }
3362       }
3363     }
3364     PetscCall(VecRestoreArrayRead(rr, &r));
3365     PetscCall(PetscLogFlops(a->nz));
3366   }
3367   PetscFunctionReturn(0);
3368 }
3369 
3370 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3371 {
3372   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3373 
3374   PetscFunctionBegin;
3375   info->block_size   = a->bs2;
3376   info->nz_allocated = a->bs2 * a->maxnz;
3377   info->nz_used      = a->bs2 * a->nz;
3378   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3379   info->assemblies   = A->num_ass;
3380   info->mallocs      = A->info.mallocs;
3381   info->memory       = 0; /* REVIEW ME */
3382   if (A->factortype) {
3383     info->fill_ratio_given  = A->info.fill_ratio_given;
3384     info->fill_ratio_needed = A->info.fill_ratio_needed;
3385     info->factor_mallocs    = A->info.factor_mallocs;
3386   } else {
3387     info->fill_ratio_given  = 0;
3388     info->fill_ratio_needed = 0;
3389     info->factor_mallocs    = 0;
3390   }
3391   PetscFunctionReturn(0);
3392 }
3393 
3394 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3395 {
3396   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3397 
3398   PetscFunctionBegin;
3399   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3400   PetscFunctionReturn(0);
3401 }
3402 
3403 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3404 {
3405   PetscFunctionBegin;
3406   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3407   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3408   PetscFunctionReturn(0);
3409 }
3410 
3411 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3412 {
3413   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3414   PetscScalar       *z = NULL, sum1;
3415   const PetscScalar *xb;
3416   PetscScalar        x1;
3417   const MatScalar   *v, *vv;
3418   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3419   PetscBool          usecprow = a->compressedrow.use;
3420 
3421   PetscFunctionBegin;
3422   idx = a->j;
3423   v   = a->a;
3424   if (usecprow) {
3425     mbs  = a->compressedrow.nrows;
3426     ii   = a->compressedrow.i;
3427     ridx = a->compressedrow.rindex;
3428   } else {
3429     mbs = a->mbs;
3430     ii  = a->i;
3431     z   = c;
3432   }
3433 
3434   for (i = 0; i < mbs; i++) {
3435     n = ii[1] - ii[0];
3436     ii++;
3437     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3438     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3439     if (usecprow) z = c + ridx[i];
3440     jj = idx;
3441     vv = v;
3442     for (k = 0; k < cn; k++) {
3443       idx  = jj;
3444       v    = vv;
3445       sum1 = 0.0;
3446       for (j = 0; j < n; j++) {
3447         xb = b + (*idx++);
3448         x1 = xb[0 + k * bm];
3449         sum1 += v[0] * x1;
3450         v += 1;
3451       }
3452       z[0 + k * cm] = sum1;
3453     }
3454     if (!usecprow) z += 1;
3455   }
3456   PetscFunctionReturn(0);
3457 }
3458 
3459 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3460 {
3461   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3462   PetscScalar       *z = NULL, sum1, sum2;
3463   const PetscScalar *xb;
3464   PetscScalar        x1, x2;
3465   const MatScalar   *v, *vv;
3466   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3467   PetscBool          usecprow = a->compressedrow.use;
3468 
3469   PetscFunctionBegin;
3470   idx = a->j;
3471   v   = a->a;
3472   if (usecprow) {
3473     mbs  = a->compressedrow.nrows;
3474     ii   = a->compressedrow.i;
3475     ridx = a->compressedrow.rindex;
3476   } else {
3477     mbs = a->mbs;
3478     ii  = a->i;
3479     z   = c;
3480   }
3481 
3482   for (i = 0; i < mbs; i++) {
3483     n = ii[1] - ii[0];
3484     ii++;
3485     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3486     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3487     if (usecprow) z = c + 2 * ridx[i];
3488     jj = idx;
3489     vv = v;
3490     for (k = 0; k < cn; k++) {
3491       idx  = jj;
3492       v    = vv;
3493       sum1 = 0.0;
3494       sum2 = 0.0;
3495       for (j = 0; j < n; j++) {
3496         xb = b + 2 * (*idx++);
3497         x1 = xb[0 + k * bm];
3498         x2 = xb[1 + k * bm];
3499         sum1 += v[0] * x1 + v[2] * x2;
3500         sum2 += v[1] * x1 + v[3] * x2;
3501         v += 4;
3502       }
3503       z[0 + k * cm] = sum1;
3504       z[1 + k * cm] = sum2;
3505     }
3506     if (!usecprow) z += 2;
3507   }
3508   PetscFunctionReturn(0);
3509 }
3510 
3511 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3512 {
3513   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3514   PetscScalar       *z = NULL, sum1, sum2, sum3;
3515   const PetscScalar *xb;
3516   PetscScalar        x1, x2, x3;
3517   const MatScalar   *v, *vv;
3518   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3519   PetscBool          usecprow = a->compressedrow.use;
3520 
3521   PetscFunctionBegin;
3522   idx = a->j;
3523   v   = a->a;
3524   if (usecprow) {
3525     mbs  = a->compressedrow.nrows;
3526     ii   = a->compressedrow.i;
3527     ridx = a->compressedrow.rindex;
3528   } else {
3529     mbs = a->mbs;
3530     ii  = a->i;
3531     z   = c;
3532   }
3533 
3534   for (i = 0; i < mbs; i++) {
3535     n = ii[1] - ii[0];
3536     ii++;
3537     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3538     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3539     if (usecprow) z = c + 3 * ridx[i];
3540     jj = idx;
3541     vv = v;
3542     for (k = 0; k < cn; k++) {
3543       idx  = jj;
3544       v    = vv;
3545       sum1 = 0.0;
3546       sum2 = 0.0;
3547       sum3 = 0.0;
3548       for (j = 0; j < n; j++) {
3549         xb = b + 3 * (*idx++);
3550         x1 = xb[0 + k * bm];
3551         x2 = xb[1 + k * bm];
3552         x3 = xb[2 + k * bm];
3553         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3554         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3555         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3556         v += 9;
3557       }
3558       z[0 + k * cm] = sum1;
3559       z[1 + k * cm] = sum2;
3560       z[2 + k * cm] = sum3;
3561     }
3562     if (!usecprow) z += 3;
3563   }
3564   PetscFunctionReturn(0);
3565 }
3566 
3567 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3568 {
3569   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3570   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3571   const PetscScalar *xb;
3572   PetscScalar        x1, x2, x3, x4;
3573   const MatScalar   *v, *vv;
3574   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3575   PetscBool          usecprow = a->compressedrow.use;
3576 
3577   PetscFunctionBegin;
3578   idx = a->j;
3579   v   = a->a;
3580   if (usecprow) {
3581     mbs  = a->compressedrow.nrows;
3582     ii   = a->compressedrow.i;
3583     ridx = a->compressedrow.rindex;
3584   } else {
3585     mbs = a->mbs;
3586     ii  = a->i;
3587     z   = c;
3588   }
3589 
3590   for (i = 0; i < mbs; i++) {
3591     n = ii[1] - ii[0];
3592     ii++;
3593     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3594     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3595     if (usecprow) z = c + 4 * ridx[i];
3596     jj = idx;
3597     vv = v;
3598     for (k = 0; k < cn; k++) {
3599       idx  = jj;
3600       v    = vv;
3601       sum1 = 0.0;
3602       sum2 = 0.0;
3603       sum3 = 0.0;
3604       sum4 = 0.0;
3605       for (j = 0; j < n; j++) {
3606         xb = b + 4 * (*idx++);
3607         x1 = xb[0 + k * bm];
3608         x2 = xb[1 + k * bm];
3609         x3 = xb[2 + k * bm];
3610         x4 = xb[3 + k * bm];
3611         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3612         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3613         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3614         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3615         v += 16;
3616       }
3617       z[0 + k * cm] = sum1;
3618       z[1 + k * cm] = sum2;
3619       z[2 + k * cm] = sum3;
3620       z[3 + k * cm] = sum4;
3621     }
3622     if (!usecprow) z += 4;
3623   }
3624   PetscFunctionReturn(0);
3625 }
3626 
3627 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3628 {
3629   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3630   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3631   const PetscScalar *xb;
3632   PetscScalar        x1, x2, x3, x4, x5;
3633   const MatScalar   *v, *vv;
3634   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3635   PetscBool          usecprow = a->compressedrow.use;
3636 
3637   PetscFunctionBegin;
3638   idx = a->j;
3639   v   = a->a;
3640   if (usecprow) {
3641     mbs  = a->compressedrow.nrows;
3642     ii   = a->compressedrow.i;
3643     ridx = a->compressedrow.rindex;
3644   } else {
3645     mbs = a->mbs;
3646     ii  = a->i;
3647     z   = c;
3648   }
3649 
3650   for (i = 0; i < mbs; i++) {
3651     n = ii[1] - ii[0];
3652     ii++;
3653     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3654     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3655     if (usecprow) z = c + 5 * ridx[i];
3656     jj = idx;
3657     vv = v;
3658     for (k = 0; k < cn; k++) {
3659       idx  = jj;
3660       v    = vv;
3661       sum1 = 0.0;
3662       sum2 = 0.0;
3663       sum3 = 0.0;
3664       sum4 = 0.0;
3665       sum5 = 0.0;
3666       for (j = 0; j < n; j++) {
3667         xb = b + 5 * (*idx++);
3668         x1 = xb[0 + k * bm];
3669         x2 = xb[1 + k * bm];
3670         x3 = xb[2 + k * bm];
3671         x4 = xb[3 + k * bm];
3672         x5 = xb[4 + k * bm];
3673         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3674         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3675         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3676         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3677         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3678         v += 25;
3679       }
3680       z[0 + k * cm] = sum1;
3681       z[1 + k * cm] = sum2;
3682       z[2 + k * cm] = sum3;
3683       z[3 + k * cm] = sum4;
3684       z[4 + k * cm] = sum5;
3685     }
3686     if (!usecprow) z += 5;
3687   }
3688   PetscFunctionReturn(0);
3689 }
3690 
3691 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3692 {
3693   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3694   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3695   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3696   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3697   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3698   PetscBLASInt     bbs, bcn, bbm, bcm;
3699   PetscScalar     *z = NULL;
3700   PetscScalar     *c, *b;
3701   const MatScalar *v;
3702   const PetscInt  *idx, *ii, *ridx = NULL;
3703   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3704   PetscBool        usecprow = a->compressedrow.use;
3705 
3706   PetscFunctionBegin;
3707   if (!cm || !cn) PetscFunctionReturn(0);
3708   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3709   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3710   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3711   b = bd->v;
3712   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3713   PetscCall(MatDenseGetArray(C, &c));
3714   switch (bs) {
3715   case 1:
3716     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3717     break;
3718   case 2:
3719     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3720     break;
3721   case 3:
3722     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3723     break;
3724   case 4:
3725     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3726     break;
3727   case 5:
3728     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3729     break;
3730   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3731     PetscCall(PetscBLASIntCast(bs, &bbs));
3732     PetscCall(PetscBLASIntCast(cn, &bcn));
3733     PetscCall(PetscBLASIntCast(bm, &bbm));
3734     PetscCall(PetscBLASIntCast(cm, &bcm));
3735     idx = a->j;
3736     v   = a->a;
3737     if (usecprow) {
3738       mbs  = a->compressedrow.nrows;
3739       ii   = a->compressedrow.i;
3740       ridx = a->compressedrow.rindex;
3741     } else {
3742       mbs = a->mbs;
3743       ii  = a->i;
3744       z   = c;
3745     }
3746     for (i = 0; i < mbs; i++) {
3747       n = ii[1] - ii[0];
3748       ii++;
3749       if (usecprow) z = c + bs * ridx[i];
3750       if (n) {
3751         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3752         v += bs2;
3753       }
3754       for (j = 1; j < n; j++) {
3755         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3756         v += bs2;
3757       }
3758       if (!usecprow) z += bs;
3759     }
3760   }
3761   PetscCall(MatDenseRestoreArray(C, &c));
3762   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3763   PetscFunctionReturn(0);
3764 }
3765