xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision df4cd43f92eaa320656440c40edb1046daee8f75)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8   #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
12 {
13   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
14   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
15   const PetscInt *idx;
16   PetscInt        start, end, *ai, *aj, bs;
17   PetscBT         table;
18 
19   PetscFunctionBegin;
20   m  = a->mbs;
21   ai = a->i;
22   aj = a->j;
23   bs = A->rmap->bs;
24 
25   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
26 
27   PetscCall(PetscBTCreate(m, &table));
28   PetscCall(PetscMalloc1(m + 1, &nidx));
29 
30   for (i = 0; i < is_max; i++) {
31     /* Initialise the two local arrays */
32     isz = 0;
33     PetscCall(PetscBTMemzero(m, table));
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     PetscCall(ISGetIndices(is[i], &idx));
37     PetscCall(ISGetLocalSize(is[i], &n));
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j = 0; j < n; ++j) {
41       ival = idx[j] / bs; /* convert the indices into block indices */
42       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
43       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44     }
45     PetscCall(ISRestoreIndices(is[i], &idx));
46     PetscCall(ISDestroy(&is[i]));
47 
48     k = 0;
49     for (j = 0; j < ov; j++) { /* for each overlap*/
50       n = isz;
51       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row + 1];
55         for (l = start; l < end; l++) {
56           val = aj[l];
57           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58         }
59       }
60     }
61     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
62   }
63   PetscCall(PetscBTDestroy(&table));
64   PetscCall(PetscFree(nidx));
65   PetscFunctionReturn(PETSC_SUCCESS);
66 }
67 
68 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
69 {
70   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
71   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
72   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
73   const PetscInt *irow, *icol;
74   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
75   PetscInt       *aj = a->j, *ai = a->i;
76   MatScalar      *mat_a;
77   Mat             C;
78   PetscBool       flag;
79 
80   PetscFunctionBegin;
81   PetscCall(ISGetIndices(isrow, &irow));
82   PetscCall(ISGetIndices(iscol, &icol));
83   PetscCall(ISGetLocalSize(isrow, &nrows));
84   PetscCall(ISGetLocalSize(iscol, &ncols));
85 
86   PetscCall(PetscCalloc1(1 + oldcols, &smap));
87   ssmap = smap;
88   PetscCall(PetscMalloc1(1 + nrows, &lens));
89   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
90   /* determine lens of each row */
91   for (i = 0; i < nrows; i++) {
92     kstart  = ai[irow[i]];
93     kend    = kstart + a->ilen[irow[i]];
94     lens[i] = 0;
95     for (k = kstart; k < kend; k++) {
96       if (ssmap[aj[k]]) lens[i]++;
97     }
98   }
99   /* Create and fill new matrix */
100   if (scall == MAT_REUSE_MATRIX) {
101     c = (Mat_SeqBAIJ *)((*B)->data);
102 
103     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
104     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
105     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
106     PetscCall(PetscArrayzero(c->ilen, c->mbs));
107     C = *B;
108   } else {
109     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
110     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
111     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
112     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
113   }
114   c = (Mat_SeqBAIJ *)(C->data);
115   for (i = 0; i < nrows; i++) {
116     row      = irow[i];
117     kstart   = ai[row];
118     kend     = kstart + a->ilen[row];
119     mat_i    = c->i[i];
120     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
121     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
122     mat_ilen = c->ilen + i;
123     for (k = kstart; k < kend; k++) {
124       if ((tcol = ssmap[a->j[k]])) {
125         *mat_j++ = tcol - 1;
126         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
127         mat_a += bs2;
128         (*mat_ilen)++;
129       }
130     }
131   }
132   /* sort */
133   if (c->j && c->a) {
134     MatScalar *work;
135     PetscCall(PetscMalloc1(bs2, &work));
136     for (i = 0; i < nrows; i++) {
137       PetscInt ilen;
138       mat_i = c->i[i];
139       mat_j = c->j + mat_i;
140       mat_a = c->a + mat_i * bs2;
141       ilen  = c->ilen[i];
142       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
143     }
144     PetscCall(PetscFree(work));
145   }
146 
147   /* Free work space */
148   PetscCall(ISRestoreIndices(iscol, &icol));
149   PetscCall(PetscFree(smap));
150   PetscCall(PetscFree(lens));
151   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
152   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
153 
154   PetscCall(ISRestoreIndices(isrow, &irow));
155   *B = C;
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
160 {
161   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
162   IS              is1, is2;
163   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
164   const PetscInt *irow, *icol;
165 
166   PetscFunctionBegin;
167   PetscCall(ISGetIndices(isrow, &irow));
168   PetscCall(ISGetIndices(iscol, &icol));
169   PetscCall(ISGetLocalSize(isrow, &nrows));
170   PetscCall(ISGetLocalSize(iscol, &ncols));
171 
172   /* Verify if the indices correspond to each element in a block
173    and form the IS with compressed IS */
174   maxmnbs = PetscMax(a->mbs, a->nbs);
175   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
176   PetscCall(PetscArrayzero(vary, a->mbs));
177   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
178   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
179   count = 0;
180   for (i = 0; i < nrows; i++) {
181     j = irow[i] / bs;
182     if ((vary[j]--) == bs) iary[count++] = j;
183   }
184   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
185 
186   PetscCall(PetscArrayzero(vary, a->nbs));
187   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
188   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
189   count = 0;
190   for (i = 0; i < ncols; i++) {
191     j = icol[i] / bs;
192     if ((vary[j]--) == bs) iary[count++] = j;
193   }
194   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
195   PetscCall(ISRestoreIndices(isrow, &irow));
196   PetscCall(ISRestoreIndices(iscol, &icol));
197   PetscCall(PetscFree2(vary, iary));
198 
199   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
200   PetscCall(ISDestroy(&is1));
201   PetscCall(ISDestroy(&is2));
202   PetscFunctionReturn(PETSC_SUCCESS);
203 }
204 
205 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
206 {
207   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
208   Mat_SubSppt *submatj = c->submatis1;
209 
210   PetscFunctionBegin;
211   PetscCall((*submatj->destroy)(C));
212   PetscCall(MatDestroySubMatrix_Private(submatj));
213   PetscFunctionReturn(PETSC_SUCCESS);
214 }
215 
216 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
217 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
218 {
219   PetscInt     i;
220   Mat          C;
221   Mat_SeqBAIJ *c;
222   Mat_SubSppt *submatj;
223 
224   PetscFunctionBegin;
225   for (i = 0; i < n; i++) {
226     C       = (*mat)[i];
227     c       = (Mat_SeqBAIJ *)C->data;
228     submatj = c->submatis1;
229     if (submatj) {
230       if (--((PetscObject)C)->refct <= 0) {
231         PetscCall(PetscFree(C->factorprefix));
232         PetscCall((*submatj->destroy)(C));
233         PetscCall(MatDestroySubMatrix_Private(submatj));
234         PetscCall(PetscFree(C->defaultvectype));
235         PetscCall(PetscFree(C->defaultrandtype));
236         PetscCall(PetscLayoutDestroy(&C->rmap));
237         PetscCall(PetscLayoutDestroy(&C->cmap));
238         PetscCall(PetscHeaderDestroy(&C));
239       }
240     } else {
241       PetscCall(MatDestroy(&C));
242     }
243   }
244 
245   /* Destroy Dummy submatrices created for reuse */
246   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
247 
248   PetscCall(PetscFree(*mat));
249   PetscFunctionReturn(PETSC_SUCCESS);
250 }
251 
252 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
253 {
254   PetscInt i;
255 
256   PetscFunctionBegin;
257   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
258 
259   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
260   PetscFunctionReturn(PETSC_SUCCESS);
261 }
262 
263 /* Should check that shapes of vectors and matrices match */
264 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
265 {
266   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
267   PetscScalar       *z, sum;
268   const PetscScalar *x;
269   const MatScalar   *v;
270   PetscInt           mbs, i, n;
271   const PetscInt    *idx, *ii, *ridx = NULL;
272   PetscBool          usecprow = a->compressedrow.use;
273 
274   PetscFunctionBegin;
275   PetscCall(VecGetArrayRead(xx, &x));
276   PetscCall(VecGetArrayWrite(zz, &z));
277 
278   if (usecprow) {
279     mbs  = a->compressedrow.nrows;
280     ii   = a->compressedrow.i;
281     ridx = a->compressedrow.rindex;
282     PetscCall(PetscArrayzero(z, a->mbs));
283   } else {
284     mbs = a->mbs;
285     ii  = a->i;
286   }
287 
288   for (i = 0; i < mbs; i++) {
289     n   = ii[1] - ii[0];
290     v   = a->a + ii[0];
291     idx = a->j + ii[0];
292     ii++;
293     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
294     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
295     sum = 0.0;
296     PetscSparseDensePlusDot(sum, x, v, idx, n);
297     if (usecprow) {
298       z[ridx[i]] = sum;
299     } else {
300       z[i] = sum;
301     }
302   }
303   PetscCall(VecRestoreArrayRead(xx, &x));
304   PetscCall(VecRestoreArrayWrite(zz, &z));
305   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
306   PetscFunctionReturn(PETSC_SUCCESS);
307 }
308 
309 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
310 {
311   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
312   PetscScalar       *z = NULL, sum1, sum2, *zarray;
313   const PetscScalar *x, *xb;
314   PetscScalar        x1, x2;
315   const MatScalar   *v;
316   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
317   PetscBool          usecprow = a->compressedrow.use;
318 
319   PetscFunctionBegin;
320   PetscCall(VecGetArrayRead(xx, &x));
321   PetscCall(VecGetArrayWrite(zz, &zarray));
322 
323   idx = a->j;
324   v   = a->a;
325   if (usecprow) {
326     mbs  = a->compressedrow.nrows;
327     ii   = a->compressedrow.i;
328     ridx = a->compressedrow.rindex;
329     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
330   } else {
331     mbs = a->mbs;
332     ii  = a->i;
333     z   = zarray;
334   }
335 
336   for (i = 0; i < mbs; i++) {
337     n = ii[1] - ii[0];
338     ii++;
339     sum1 = 0.0;
340     sum2 = 0.0;
341     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
342     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
343     for (j = 0; j < n; j++) {
344       xb = x + 2 * (*idx++);
345       x1 = xb[0];
346       x2 = xb[1];
347       sum1 += v[0] * x1 + v[2] * x2;
348       sum2 += v[1] * x1 + v[3] * x2;
349       v += 4;
350     }
351     if (usecprow) z = zarray + 2 * ridx[i];
352     z[0] = sum1;
353     z[1] = sum2;
354     if (!usecprow) z += 2;
355   }
356   PetscCall(VecRestoreArrayRead(xx, &x));
357   PetscCall(VecRestoreArrayWrite(zz, &zarray));
358   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
359   PetscFunctionReturn(PETSC_SUCCESS);
360 }
361 
362 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
363 {
364   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
365   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
366   const PetscScalar *x, *xb;
367   const MatScalar   *v;
368   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
369   PetscBool          usecprow = a->compressedrow.use;
370 
371 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
372   #pragma disjoint(*v, *z, *xb)
373 #endif
374 
375   PetscFunctionBegin;
376   PetscCall(VecGetArrayRead(xx, &x));
377   PetscCall(VecGetArrayWrite(zz, &zarray));
378 
379   idx = a->j;
380   v   = a->a;
381   if (usecprow) {
382     mbs  = a->compressedrow.nrows;
383     ii   = a->compressedrow.i;
384     ridx = a->compressedrow.rindex;
385     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
386   } else {
387     mbs = a->mbs;
388     ii  = a->i;
389     z   = zarray;
390   }
391 
392   for (i = 0; i < mbs; i++) {
393     n = ii[1] - ii[0];
394     ii++;
395     sum1 = 0.0;
396     sum2 = 0.0;
397     sum3 = 0.0;
398     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
399     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
400     for (j = 0; j < n; j++) {
401       xb = x + 3 * (*idx++);
402       x1 = xb[0];
403       x2 = xb[1];
404       x3 = xb[2];
405 
406       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
407       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
408       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
409       v += 9;
410     }
411     if (usecprow) z = zarray + 3 * ridx[i];
412     z[0] = sum1;
413     z[1] = sum2;
414     z[2] = sum3;
415     if (!usecprow) z += 3;
416   }
417   PetscCall(VecRestoreArrayRead(xx, &x));
418   PetscCall(VecRestoreArrayWrite(zz, &zarray));
419   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
420   PetscFunctionReturn(PETSC_SUCCESS);
421 }
422 
423 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
424 {
425   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
426   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
427   const PetscScalar *x, *xb;
428   const MatScalar   *v;
429   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
430   PetscBool          usecprow = a->compressedrow.use;
431 
432   PetscFunctionBegin;
433   PetscCall(VecGetArrayRead(xx, &x));
434   PetscCall(VecGetArrayWrite(zz, &zarray));
435 
436   idx = a->j;
437   v   = a->a;
438   if (usecprow) {
439     mbs  = a->compressedrow.nrows;
440     ii   = a->compressedrow.i;
441     ridx = a->compressedrow.rindex;
442     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
443   } else {
444     mbs = a->mbs;
445     ii  = a->i;
446     z   = zarray;
447   }
448 
449   for (i = 0; i < mbs; i++) {
450     n = ii[1] - ii[0];
451     ii++;
452     sum1 = 0.0;
453     sum2 = 0.0;
454     sum3 = 0.0;
455     sum4 = 0.0;
456 
457     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
458     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
459     for (j = 0; j < n; j++) {
460       xb = x + 4 * (*idx++);
461       x1 = xb[0];
462       x2 = xb[1];
463       x3 = xb[2];
464       x4 = xb[3];
465       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
466       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
467       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
468       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
469       v += 16;
470     }
471     if (usecprow) z = zarray + 4 * ridx[i];
472     z[0] = sum1;
473     z[1] = sum2;
474     z[2] = sum3;
475     z[3] = sum4;
476     if (!usecprow) z += 4;
477   }
478   PetscCall(VecRestoreArrayRead(xx, &x));
479   PetscCall(VecRestoreArrayWrite(zz, &zarray));
480   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
481   PetscFunctionReturn(PETSC_SUCCESS);
482 }
483 
484 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
485 {
486   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
487   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
488   const PetscScalar *xb, *x;
489   const MatScalar   *v;
490   const PetscInt    *idx, *ii, *ridx = NULL;
491   PetscInt           mbs, i, j, n;
492   PetscBool          usecprow = a->compressedrow.use;
493 
494   PetscFunctionBegin;
495   PetscCall(VecGetArrayRead(xx, &x));
496   PetscCall(VecGetArrayWrite(zz, &zarray));
497 
498   idx = a->j;
499   v   = a->a;
500   if (usecprow) {
501     mbs  = a->compressedrow.nrows;
502     ii   = a->compressedrow.i;
503     ridx = a->compressedrow.rindex;
504     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
505   } else {
506     mbs = a->mbs;
507     ii  = a->i;
508     z   = zarray;
509   }
510 
511   for (i = 0; i < mbs; i++) {
512     n = ii[1] - ii[0];
513     ii++;
514     sum1 = 0.0;
515     sum2 = 0.0;
516     sum3 = 0.0;
517     sum4 = 0.0;
518     sum5 = 0.0;
519     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
520     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
521     for (j = 0; j < n; j++) {
522       xb = x + 5 * (*idx++);
523       x1 = xb[0];
524       x2 = xb[1];
525       x3 = xb[2];
526       x4 = xb[3];
527       x5 = xb[4];
528       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
529       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
530       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
531       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
532       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
533       v += 25;
534     }
535     if (usecprow) z = zarray + 5 * ridx[i];
536     z[0] = sum1;
537     z[1] = sum2;
538     z[2] = sum3;
539     z[3] = sum4;
540     z[4] = sum5;
541     if (!usecprow) z += 5;
542   }
543   PetscCall(VecRestoreArrayRead(xx, &x));
544   PetscCall(VecRestoreArrayWrite(zz, &zarray));
545   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
546   PetscFunctionReturn(PETSC_SUCCESS);
547 }
548 
549 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
550 {
551   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
552   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
553   const PetscScalar *x, *xb;
554   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
555   const MatScalar   *v;
556   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
557   PetscBool          usecprow = a->compressedrow.use;
558 
559   PetscFunctionBegin;
560   PetscCall(VecGetArrayRead(xx, &x));
561   PetscCall(VecGetArrayWrite(zz, &zarray));
562 
563   idx = a->j;
564   v   = a->a;
565   if (usecprow) {
566     mbs  = a->compressedrow.nrows;
567     ii   = a->compressedrow.i;
568     ridx = a->compressedrow.rindex;
569     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
570   } else {
571     mbs = a->mbs;
572     ii  = a->i;
573     z   = zarray;
574   }
575 
576   for (i = 0; i < mbs; i++) {
577     n = ii[1] - ii[0];
578     ii++;
579     sum1 = 0.0;
580     sum2 = 0.0;
581     sum3 = 0.0;
582     sum4 = 0.0;
583     sum5 = 0.0;
584     sum6 = 0.0;
585 
586     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
587     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
588     for (j = 0; j < n; j++) {
589       xb = x + 6 * (*idx++);
590       x1 = xb[0];
591       x2 = xb[1];
592       x3 = xb[2];
593       x4 = xb[3];
594       x5 = xb[4];
595       x6 = xb[5];
596       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
597       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
598       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
599       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
600       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
601       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
602       v += 36;
603     }
604     if (usecprow) z = zarray + 6 * ridx[i];
605     z[0] = sum1;
606     z[1] = sum2;
607     z[2] = sum3;
608     z[3] = sum4;
609     z[4] = sum5;
610     z[5] = sum6;
611     if (!usecprow) z += 6;
612   }
613 
614   PetscCall(VecRestoreArrayRead(xx, &x));
615   PetscCall(VecRestoreArrayWrite(zz, &zarray));
616   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
617   PetscFunctionReturn(PETSC_SUCCESS);
618 }
619 
620 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
621 {
622   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
623   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
624   const PetscScalar *x, *xb;
625   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
626   const MatScalar   *v;
627   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
628   PetscBool          usecprow = a->compressedrow.use;
629 
630   PetscFunctionBegin;
631   PetscCall(VecGetArrayRead(xx, &x));
632   PetscCall(VecGetArrayWrite(zz, &zarray));
633 
634   idx = a->j;
635   v   = a->a;
636   if (usecprow) {
637     mbs  = a->compressedrow.nrows;
638     ii   = a->compressedrow.i;
639     ridx = a->compressedrow.rindex;
640     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
641   } else {
642     mbs = a->mbs;
643     ii  = a->i;
644     z   = zarray;
645   }
646 
647   for (i = 0; i < mbs; i++) {
648     n = ii[1] - ii[0];
649     ii++;
650     sum1 = 0.0;
651     sum2 = 0.0;
652     sum3 = 0.0;
653     sum4 = 0.0;
654     sum5 = 0.0;
655     sum6 = 0.0;
656     sum7 = 0.0;
657 
658     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
659     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
660     for (j = 0; j < n; j++) {
661       xb = x + 7 * (*idx++);
662       x1 = xb[0];
663       x2 = xb[1];
664       x3 = xb[2];
665       x4 = xb[3];
666       x5 = xb[4];
667       x6 = xb[5];
668       x7 = xb[6];
669       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
670       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
671       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
672       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
673       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
674       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
675       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
676       v += 49;
677     }
678     if (usecprow) z = zarray + 7 * ridx[i];
679     z[0] = sum1;
680     z[1] = sum2;
681     z[2] = sum3;
682     z[3] = sum4;
683     z[4] = sum5;
684     z[5] = sum6;
685     z[6] = sum7;
686     if (!usecprow) z += 7;
687   }
688 
689   PetscCall(VecRestoreArrayRead(xx, &x));
690   PetscCall(VecRestoreArrayWrite(zz, &zarray));
691   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
692   PetscFunctionReturn(PETSC_SUCCESS);
693 }
694 
695 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
696 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
697 {
698   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
699   PetscScalar       *z = NULL, *work, *workt, *zarray;
700   const PetscScalar *x, *xb;
701   const MatScalar   *v;
702   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
703   const PetscInt    *idx, *ii, *ridx = NULL;
704   PetscInt           k;
705   PetscBool          usecprow = a->compressedrow.use;
706 
707   __m256d a0, a1, a2, a3, a4, a5;
708   __m256d w0, w1, w2, w3;
709   __m256d z0, z1, z2;
710   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
711 
712   PetscFunctionBegin;
713   PetscCall(VecGetArrayRead(xx, &x));
714   PetscCall(VecGetArrayWrite(zz, &zarray));
715 
716   idx = a->j;
717   v   = a->a;
718   if (usecprow) {
719     mbs  = a->compressedrow.nrows;
720     ii   = a->compressedrow.i;
721     ridx = a->compressedrow.rindex;
722     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
723   } else {
724     mbs = a->mbs;
725     ii  = a->i;
726     z   = zarray;
727   }
728 
729   if (!a->mult_work) {
730     k = PetscMax(A->rmap->n, A->cmap->n);
731     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
732   }
733 
734   work = a->mult_work;
735   for (i = 0; i < mbs; i++) {
736     n = ii[1] - ii[0];
737     ii++;
738     workt = work;
739     for (j = 0; j < n; j++) {
740       xb = x + bs * (*idx++);
741       for (k = 0; k < bs; k++) workt[k] = xb[k];
742       workt += bs;
743     }
744     if (usecprow) z = zarray + bs * ridx[i];
745 
746     z0 = _mm256_setzero_pd();
747     z1 = _mm256_setzero_pd();
748     z2 = _mm256_setzero_pd();
749 
750     for (j = 0; j < n; j++) {
751       /* first column of a */
752       w0 = _mm256_set1_pd(work[j * 9]);
753       a0 = _mm256_loadu_pd(&v[j * 81]);
754       z0 = _mm256_fmadd_pd(a0, w0, z0);
755       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
756       z1 = _mm256_fmadd_pd(a1, w0, z1);
757       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
758       z2 = _mm256_fmadd_pd(a2, w0, z2);
759 
760       /* second column of a */
761       w1 = _mm256_set1_pd(work[j * 9 + 1]);
762       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
763       z0 = _mm256_fmadd_pd(a0, w1, z0);
764       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
765       z1 = _mm256_fmadd_pd(a1, w1, z1);
766       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
767       z2 = _mm256_fmadd_pd(a2, w1, z2);
768 
769       /* third column of a */
770       w2 = _mm256_set1_pd(work[j * 9 + 2]);
771       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
772       z0 = _mm256_fmadd_pd(a3, w2, z0);
773       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
774       z1 = _mm256_fmadd_pd(a4, w2, z1);
775       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
776       z2 = _mm256_fmadd_pd(a5, w2, z2);
777 
778       /* fourth column of a */
779       w3 = _mm256_set1_pd(work[j * 9 + 3]);
780       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
781       z0 = _mm256_fmadd_pd(a0, w3, z0);
782       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
783       z1 = _mm256_fmadd_pd(a1, w3, z1);
784       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
785       z2 = _mm256_fmadd_pd(a2, w3, z2);
786 
787       /* fifth column of a */
788       w0 = _mm256_set1_pd(work[j * 9 + 4]);
789       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
790       z0 = _mm256_fmadd_pd(a3, w0, z0);
791       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
792       z1 = _mm256_fmadd_pd(a4, w0, z1);
793       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
794       z2 = _mm256_fmadd_pd(a5, w0, z2);
795 
796       /* sixth column of a */
797       w1 = _mm256_set1_pd(work[j * 9 + 5]);
798       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
799       z0 = _mm256_fmadd_pd(a0, w1, z0);
800       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
801       z1 = _mm256_fmadd_pd(a1, w1, z1);
802       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
803       z2 = _mm256_fmadd_pd(a2, w1, z2);
804 
805       /* seventh column of a */
806       w2 = _mm256_set1_pd(work[j * 9 + 6]);
807       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
808       z0 = _mm256_fmadd_pd(a0, w2, z0);
809       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
810       z1 = _mm256_fmadd_pd(a1, w2, z1);
811       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
812       z2 = _mm256_fmadd_pd(a2, w2, z2);
813 
814       /* eighth column of a */
815       w3 = _mm256_set1_pd(work[j * 9 + 7]);
816       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
817       z0 = _mm256_fmadd_pd(a3, w3, z0);
818       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
819       z1 = _mm256_fmadd_pd(a4, w3, z1);
820       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
821       z2 = _mm256_fmadd_pd(a5, w3, z2);
822 
823       /* ninth column of a */
824       w0 = _mm256_set1_pd(work[j * 9 + 8]);
825       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
826       z0 = _mm256_fmadd_pd(a0, w0, z0);
827       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
828       z1 = _mm256_fmadd_pd(a1, w0, z1);
829       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
830       z2 = _mm256_fmadd_pd(a2, w0, z2);
831     }
832 
833     _mm256_storeu_pd(&z[0], z0);
834     _mm256_storeu_pd(&z[4], z1);
835     _mm256_maskstore_pd(&z[8], mask1, z2);
836 
837     v += n * bs2;
838     if (!usecprow) z += bs;
839   }
840   PetscCall(VecRestoreArrayRead(xx, &x));
841   PetscCall(VecRestoreArrayWrite(zz, &zarray));
842   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
843   PetscFunctionReturn(PETSC_SUCCESS);
844 }
845 #endif
846 
847 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
848 {
849   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
850   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
851   const PetscScalar *x, *xb;
852   PetscScalar       *zarray, xv;
853   const MatScalar   *v;
854   const PetscInt    *ii, *ij = a->j, *idx;
855   PetscInt           mbs, i, j, k, n, *ridx = NULL;
856   PetscBool          usecprow = a->compressedrow.use;
857 
858   PetscFunctionBegin;
859   PetscCall(VecGetArrayRead(xx, &x));
860   PetscCall(VecGetArrayWrite(zz, &zarray));
861 
862   v = a->a;
863   if (usecprow) {
864     mbs  = a->compressedrow.nrows;
865     ii   = a->compressedrow.i;
866     ridx = a->compressedrow.rindex;
867     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
868   } else {
869     mbs = a->mbs;
870     ii  = a->i;
871     z   = zarray;
872   }
873 
874   for (i = 0; i < mbs; i++) {
875     n     = ii[i + 1] - ii[i];
876     idx   = ij + ii[i];
877     sum1  = 0.0;
878     sum2  = 0.0;
879     sum3  = 0.0;
880     sum4  = 0.0;
881     sum5  = 0.0;
882     sum6  = 0.0;
883     sum7  = 0.0;
884     sum8  = 0.0;
885     sum9  = 0.0;
886     sum10 = 0.0;
887     sum11 = 0.0;
888 
889     for (j = 0; j < n; j++) {
890       xb = x + 11 * (idx[j]);
891 
892       for (k = 0; k < 11; k++) {
893         xv = xb[k];
894         sum1 += v[0] * xv;
895         sum2 += v[1] * xv;
896         sum3 += v[2] * xv;
897         sum4 += v[3] * xv;
898         sum5 += v[4] * xv;
899         sum6 += v[5] * xv;
900         sum7 += v[6] * xv;
901         sum8 += v[7] * xv;
902         sum9 += v[8] * xv;
903         sum10 += v[9] * xv;
904         sum11 += v[10] * xv;
905         v += 11;
906       }
907     }
908     if (usecprow) z = zarray + 11 * ridx[i];
909     z[0]  = sum1;
910     z[1]  = sum2;
911     z[2]  = sum3;
912     z[3]  = sum4;
913     z[4]  = sum5;
914     z[5]  = sum6;
915     z[6]  = sum7;
916     z[7]  = sum8;
917     z[8]  = sum9;
918     z[9]  = sum10;
919     z[10] = sum11;
920 
921     if (!usecprow) z += 11;
922   }
923 
924   PetscCall(VecRestoreArrayRead(xx, &x));
925   PetscCall(VecRestoreArrayWrite(zz, &zarray));
926   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
927   PetscFunctionReturn(PETSC_SUCCESS);
928 }
929 
930 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
931 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
932 {
933   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
934   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
935   const PetscScalar *x, *xb;
936   PetscScalar       *zarray, xv;
937   const MatScalar   *v;
938   const PetscInt    *ii, *ij = a->j, *idx;
939   PetscInt           mbs, i, j, k, n, *ridx = NULL;
940   PetscBool          usecprow = a->compressedrow.use;
941 
942   PetscFunctionBegin;
943   PetscCall(VecGetArrayRead(xx, &x));
944   PetscCall(VecGetArrayWrite(zz, &zarray));
945 
946   v = a->a;
947   if (usecprow) {
948     mbs  = a->compressedrow.nrows;
949     ii   = a->compressedrow.i;
950     ridx = a->compressedrow.rindex;
951     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
952   } else {
953     mbs = a->mbs;
954     ii  = a->i;
955     z   = zarray;
956   }
957 
958   for (i = 0; i < mbs; i++) {
959     n     = ii[i + 1] - ii[i];
960     idx   = ij + ii[i];
961     sum1  = 0.0;
962     sum2  = 0.0;
963     sum3  = 0.0;
964     sum4  = 0.0;
965     sum5  = 0.0;
966     sum6  = 0.0;
967     sum7  = 0.0;
968     sum8  = 0.0;
969     sum9  = 0.0;
970     sum10 = 0.0;
971     sum11 = 0.0;
972     sum12 = 0.0;
973 
974     for (j = 0; j < n; j++) {
975       xb = x + 12 * (idx[j]);
976 
977       for (k = 0; k < 12; k++) {
978         xv = xb[k];
979         sum1 += v[0] * xv;
980         sum2 += v[1] * xv;
981         sum3 += v[2] * xv;
982         sum4 += v[3] * xv;
983         sum5 += v[4] * xv;
984         sum6 += v[5] * xv;
985         sum7 += v[6] * xv;
986         sum8 += v[7] * xv;
987         sum9 += v[8] * xv;
988         sum10 += v[9] * xv;
989         sum11 += v[10] * xv;
990         sum12 += v[11] * xv;
991         v += 12;
992       }
993     }
994     if (usecprow) z = zarray + 12 * ridx[i];
995     z[0]  = sum1;
996     z[1]  = sum2;
997     z[2]  = sum3;
998     z[3]  = sum4;
999     z[4]  = sum5;
1000     z[5]  = sum6;
1001     z[6]  = sum7;
1002     z[7]  = sum8;
1003     z[8]  = sum9;
1004     z[9]  = sum10;
1005     z[10] = sum11;
1006     z[11] = sum12;
1007     if (!usecprow) z += 12;
1008   }
1009   PetscCall(VecRestoreArrayRead(xx, &x));
1010   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1011   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1012   PetscFunctionReturn(PETSC_SUCCESS);
1013 }
1014 
1015 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
1016 {
1017   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1018   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1019   const PetscScalar *x, *xb;
1020   PetscScalar       *zarray, *yarray, xv;
1021   const MatScalar   *v;
1022   const PetscInt    *ii, *ij = a->j, *idx;
1023   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
1024   PetscBool          usecprow = a->compressedrow.use;
1025 
1026   PetscFunctionBegin;
1027   PetscCall(VecGetArrayRead(xx, &x));
1028   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1029 
1030   v = a->a;
1031   if (usecprow) {
1032     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1033     mbs  = a->compressedrow.nrows;
1034     ii   = a->compressedrow.i;
1035     ridx = a->compressedrow.rindex;
1036   } else {
1037     ii = a->i;
1038     y  = yarray;
1039     z  = zarray;
1040   }
1041 
1042   for (i = 0; i < mbs; i++) {
1043     n   = ii[i + 1] - ii[i];
1044     idx = ij + ii[i];
1045 
1046     if (usecprow) {
1047       y = yarray + 12 * ridx[i];
1048       z = zarray + 12 * ridx[i];
1049     }
1050     sum1  = y[0];
1051     sum2  = y[1];
1052     sum3  = y[2];
1053     sum4  = y[3];
1054     sum5  = y[4];
1055     sum6  = y[5];
1056     sum7  = y[6];
1057     sum8  = y[7];
1058     sum9  = y[8];
1059     sum10 = y[9];
1060     sum11 = y[10];
1061     sum12 = y[11];
1062 
1063     for (j = 0; j < n; j++) {
1064       xb = x + 12 * (idx[j]);
1065 
1066       for (k = 0; k < 12; k++) {
1067         xv = xb[k];
1068         sum1 += v[0] * xv;
1069         sum2 += v[1] * xv;
1070         sum3 += v[2] * xv;
1071         sum4 += v[3] * xv;
1072         sum5 += v[4] * xv;
1073         sum6 += v[5] * xv;
1074         sum7 += v[6] * xv;
1075         sum8 += v[7] * xv;
1076         sum9 += v[8] * xv;
1077         sum10 += v[9] * xv;
1078         sum11 += v[10] * xv;
1079         sum12 += v[11] * xv;
1080         v += 12;
1081       }
1082     }
1083 
1084     z[0]  = sum1;
1085     z[1]  = sum2;
1086     z[2]  = sum3;
1087     z[3]  = sum4;
1088     z[4]  = sum5;
1089     z[5]  = sum6;
1090     z[6]  = sum7;
1091     z[7]  = sum8;
1092     z[8]  = sum9;
1093     z[9]  = sum10;
1094     z[10] = sum11;
1095     z[11] = sum12;
1096     if (!usecprow) {
1097       y += 12;
1098       z += 12;
1099     }
1100   }
1101   PetscCall(VecRestoreArrayRead(xx, &x));
1102   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1103   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1104   PetscFunctionReturn(PETSC_SUCCESS);
1105 }
1106 
1107 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1108 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1109 {
1110   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1111   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1112   const PetscScalar *x, *xb;
1113   PetscScalar        x1, x2, x3, x4, *zarray;
1114   const MatScalar   *v;
1115   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1116   PetscInt           mbs, i, j, n;
1117   PetscBool          usecprow = a->compressedrow.use;
1118 
1119   PetscFunctionBegin;
1120   PetscCall(VecGetArrayRead(xx, &x));
1121   PetscCall(VecGetArrayWrite(zz, &zarray));
1122 
1123   v = a->a;
1124   if (usecprow) {
1125     mbs  = a->compressedrow.nrows;
1126     ii   = a->compressedrow.i;
1127     ridx = a->compressedrow.rindex;
1128     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1129   } else {
1130     mbs = a->mbs;
1131     ii  = a->i;
1132     z   = zarray;
1133   }
1134 
1135   for (i = 0; i < mbs; i++) {
1136     n   = ii[i + 1] - ii[i];
1137     idx = ij + ii[i];
1138 
1139     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1140     for (j = 0; j < n; j++) {
1141       xb = x + 12 * (idx[j]);
1142       x1 = xb[0];
1143       x2 = xb[1];
1144       x3 = xb[2];
1145       x4 = xb[3];
1146 
1147       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1148       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1149       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1150       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1151       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1152       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1153       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1154       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1155       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1156       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1157       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1158       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1159       v += 48;
1160 
1161       x1 = xb[4];
1162       x2 = xb[5];
1163       x3 = xb[6];
1164       x4 = xb[7];
1165 
1166       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1167       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1168       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1169       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1170       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1171       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1172       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1173       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1174       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1175       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1176       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1177       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1178       v += 48;
1179 
1180       x1 = xb[8];
1181       x2 = xb[9];
1182       x3 = xb[10];
1183       x4 = xb[11];
1184       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1185       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1186       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1187       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1188       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1189       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1190       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1191       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1192       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1193       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1194       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1195       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1196       v += 48;
1197     }
1198     if (usecprow) z = zarray + 12 * ridx[i];
1199     z[0]  = sum1;
1200     z[1]  = sum2;
1201     z[2]  = sum3;
1202     z[3]  = sum4;
1203     z[4]  = sum5;
1204     z[5]  = sum6;
1205     z[6]  = sum7;
1206     z[7]  = sum8;
1207     z[8]  = sum9;
1208     z[9]  = sum10;
1209     z[10] = sum11;
1210     z[11] = sum12;
1211     if (!usecprow) z += 12;
1212   }
1213   PetscCall(VecRestoreArrayRead(xx, &x));
1214   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1215   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1216   PetscFunctionReturn(PETSC_SUCCESS);
1217 }
1218 
1219 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1220 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1221 {
1222   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1223   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1224   const PetscScalar *x, *xb;
1225   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1226   const MatScalar   *v;
1227   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1228   PetscInt           mbs      = a->mbs, i, j, n;
1229   PetscBool          usecprow = a->compressedrow.use;
1230 
1231   PetscFunctionBegin;
1232   PetscCall(VecGetArrayRead(xx, &x));
1233   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1234 
1235   v = a->a;
1236   if (usecprow) {
1237     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1238     mbs  = a->compressedrow.nrows;
1239     ii   = a->compressedrow.i;
1240     ridx = a->compressedrow.rindex;
1241   } else {
1242     ii = a->i;
1243     y  = yarray;
1244     z  = zarray;
1245   }
1246 
1247   for (i = 0; i < mbs; i++) {
1248     n   = ii[i + 1] - ii[i];
1249     idx = ij + ii[i];
1250 
1251     if (usecprow) {
1252       y = yarray + 12 * ridx[i];
1253       z = zarray + 12 * ridx[i];
1254     }
1255     sum1  = y[0];
1256     sum2  = y[1];
1257     sum3  = y[2];
1258     sum4  = y[3];
1259     sum5  = y[4];
1260     sum6  = y[5];
1261     sum7  = y[6];
1262     sum8  = y[7];
1263     sum9  = y[8];
1264     sum10 = y[9];
1265     sum11 = y[10];
1266     sum12 = y[11];
1267 
1268     for (j = 0; j < n; j++) {
1269       xb = x + 12 * (idx[j]);
1270       x1 = xb[0];
1271       x2 = xb[1];
1272       x3 = xb[2];
1273       x4 = xb[3];
1274 
1275       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1276       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1277       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1278       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1279       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1280       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1281       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1282       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1283       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1284       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1285       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1286       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1287       v += 48;
1288 
1289       x1 = xb[4];
1290       x2 = xb[5];
1291       x3 = xb[6];
1292       x4 = xb[7];
1293 
1294       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1295       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1296       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1297       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1298       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1299       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1300       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1301       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1302       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1303       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1304       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1305       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1306       v += 48;
1307 
1308       x1 = xb[8];
1309       x2 = xb[9];
1310       x3 = xb[10];
1311       x4 = xb[11];
1312       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1313       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1314       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1315       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1316       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1317       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1318       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1319       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1320       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1321       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1322       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1323       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1324       v += 48;
1325     }
1326     z[0]  = sum1;
1327     z[1]  = sum2;
1328     z[2]  = sum3;
1329     z[3]  = sum4;
1330     z[4]  = sum5;
1331     z[5]  = sum6;
1332     z[6]  = sum7;
1333     z[7]  = sum8;
1334     z[8]  = sum9;
1335     z[9]  = sum10;
1336     z[10] = sum11;
1337     z[11] = sum12;
1338     if (!usecprow) {
1339       y += 12;
1340       z += 12;
1341     }
1342   }
1343   PetscCall(VecRestoreArrayRead(xx, &x));
1344   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1345   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1346   PetscFunctionReturn(PETSC_SUCCESS);
1347 }
1348 
1349 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1350 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1351 {
1352   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1353   PetscScalar       *z = NULL, *zarray;
1354   const PetscScalar *x, *work;
1355   const MatScalar   *v = a->a;
1356   PetscInt           mbs, i, j, n;
1357   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1358   PetscBool          usecprow = a->compressedrow.use;
1359   const PetscInt     bs = 12, bs2 = 144;
1360 
1361   __m256d a0, a1, a2, a3, a4, a5;
1362   __m256d w0, w1, w2, w3;
1363   __m256d z0, z1, z2;
1364 
1365   PetscFunctionBegin;
1366   PetscCall(VecGetArrayRead(xx, &x));
1367   PetscCall(VecGetArrayWrite(zz, &zarray));
1368 
1369   if (usecprow) {
1370     mbs  = a->compressedrow.nrows;
1371     ii   = a->compressedrow.i;
1372     ridx = a->compressedrow.rindex;
1373     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1374   } else {
1375     mbs = a->mbs;
1376     ii  = a->i;
1377     z   = zarray;
1378   }
1379 
1380   for (i = 0; i < mbs; i++) {
1381     z0 = _mm256_setzero_pd();
1382     z1 = _mm256_setzero_pd();
1383     z2 = _mm256_setzero_pd();
1384 
1385     n = ii[1] - ii[0];
1386     ii++;
1387     for (j = 0; j < n; j++) {
1388       work = x + bs * (*idx++);
1389 
1390       /* first column of a */
1391       w0 = _mm256_set1_pd(work[0]);
1392       a0 = _mm256_loadu_pd(v + 0);
1393       z0 = _mm256_fmadd_pd(a0, w0, z0);
1394       a1 = _mm256_loadu_pd(v + 4);
1395       z1 = _mm256_fmadd_pd(a1, w0, z1);
1396       a2 = _mm256_loadu_pd(v + 8);
1397       z2 = _mm256_fmadd_pd(a2, w0, z2);
1398 
1399       /* second column of a */
1400       w1 = _mm256_set1_pd(work[1]);
1401       a3 = _mm256_loadu_pd(v + 12);
1402       z0 = _mm256_fmadd_pd(a3, w1, z0);
1403       a4 = _mm256_loadu_pd(v + 16);
1404       z1 = _mm256_fmadd_pd(a4, w1, z1);
1405       a5 = _mm256_loadu_pd(v + 20);
1406       z2 = _mm256_fmadd_pd(a5, w1, z2);
1407 
1408       /* third column of a */
1409       w2 = _mm256_set1_pd(work[2]);
1410       a0 = _mm256_loadu_pd(v + 24);
1411       z0 = _mm256_fmadd_pd(a0, w2, z0);
1412       a1 = _mm256_loadu_pd(v + 28);
1413       z1 = _mm256_fmadd_pd(a1, w2, z1);
1414       a2 = _mm256_loadu_pd(v + 32);
1415       z2 = _mm256_fmadd_pd(a2, w2, z2);
1416 
1417       /* fourth column of a */
1418       w3 = _mm256_set1_pd(work[3]);
1419       a3 = _mm256_loadu_pd(v + 36);
1420       z0 = _mm256_fmadd_pd(a3, w3, z0);
1421       a4 = _mm256_loadu_pd(v + 40);
1422       z1 = _mm256_fmadd_pd(a4, w3, z1);
1423       a5 = _mm256_loadu_pd(v + 44);
1424       z2 = _mm256_fmadd_pd(a5, w3, z2);
1425 
1426       /* fifth column of a */
1427       w0 = _mm256_set1_pd(work[4]);
1428       a0 = _mm256_loadu_pd(v + 48);
1429       z0 = _mm256_fmadd_pd(a0, w0, z0);
1430       a1 = _mm256_loadu_pd(v + 52);
1431       z1 = _mm256_fmadd_pd(a1, w0, z1);
1432       a2 = _mm256_loadu_pd(v + 56);
1433       z2 = _mm256_fmadd_pd(a2, w0, z2);
1434 
1435       /* sixth column of a */
1436       w1 = _mm256_set1_pd(work[5]);
1437       a3 = _mm256_loadu_pd(v + 60);
1438       z0 = _mm256_fmadd_pd(a3, w1, z0);
1439       a4 = _mm256_loadu_pd(v + 64);
1440       z1 = _mm256_fmadd_pd(a4, w1, z1);
1441       a5 = _mm256_loadu_pd(v + 68);
1442       z2 = _mm256_fmadd_pd(a5, w1, z2);
1443 
1444       /* seventh column of a */
1445       w2 = _mm256_set1_pd(work[6]);
1446       a0 = _mm256_loadu_pd(v + 72);
1447       z0 = _mm256_fmadd_pd(a0, w2, z0);
1448       a1 = _mm256_loadu_pd(v + 76);
1449       z1 = _mm256_fmadd_pd(a1, w2, z1);
1450       a2 = _mm256_loadu_pd(v + 80);
1451       z2 = _mm256_fmadd_pd(a2, w2, z2);
1452 
1453       /* eighth column of a */
1454       w3 = _mm256_set1_pd(work[7]);
1455       a3 = _mm256_loadu_pd(v + 84);
1456       z0 = _mm256_fmadd_pd(a3, w3, z0);
1457       a4 = _mm256_loadu_pd(v + 88);
1458       z1 = _mm256_fmadd_pd(a4, w3, z1);
1459       a5 = _mm256_loadu_pd(v + 92);
1460       z2 = _mm256_fmadd_pd(a5, w3, z2);
1461 
1462       /* ninth column of a */
1463       w0 = _mm256_set1_pd(work[8]);
1464       a0 = _mm256_loadu_pd(v + 96);
1465       z0 = _mm256_fmadd_pd(a0, w0, z0);
1466       a1 = _mm256_loadu_pd(v + 100);
1467       z1 = _mm256_fmadd_pd(a1, w0, z1);
1468       a2 = _mm256_loadu_pd(v + 104);
1469       z2 = _mm256_fmadd_pd(a2, w0, z2);
1470 
1471       /* tenth column of a */
1472       w1 = _mm256_set1_pd(work[9]);
1473       a3 = _mm256_loadu_pd(v + 108);
1474       z0 = _mm256_fmadd_pd(a3, w1, z0);
1475       a4 = _mm256_loadu_pd(v + 112);
1476       z1 = _mm256_fmadd_pd(a4, w1, z1);
1477       a5 = _mm256_loadu_pd(v + 116);
1478       z2 = _mm256_fmadd_pd(a5, w1, z2);
1479 
1480       /* eleventh column of a */
1481       w2 = _mm256_set1_pd(work[10]);
1482       a0 = _mm256_loadu_pd(v + 120);
1483       z0 = _mm256_fmadd_pd(a0, w2, z0);
1484       a1 = _mm256_loadu_pd(v + 124);
1485       z1 = _mm256_fmadd_pd(a1, w2, z1);
1486       a2 = _mm256_loadu_pd(v + 128);
1487       z2 = _mm256_fmadd_pd(a2, w2, z2);
1488 
1489       /* twelveth column of a */
1490       w3 = _mm256_set1_pd(work[11]);
1491       a3 = _mm256_loadu_pd(v + 132);
1492       z0 = _mm256_fmadd_pd(a3, w3, z0);
1493       a4 = _mm256_loadu_pd(v + 136);
1494       z1 = _mm256_fmadd_pd(a4, w3, z1);
1495       a5 = _mm256_loadu_pd(v + 140);
1496       z2 = _mm256_fmadd_pd(a5, w3, z2);
1497 
1498       v += bs2;
1499     }
1500     if (usecprow) z = zarray + bs * ridx[i];
1501     _mm256_storeu_pd(&z[0], z0);
1502     _mm256_storeu_pd(&z[4], z1);
1503     _mm256_storeu_pd(&z[8], z2);
1504     if (!usecprow) z += bs;
1505   }
1506   PetscCall(VecRestoreArrayRead(xx, &x));
1507   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1508   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1509   PetscFunctionReturn(PETSC_SUCCESS);
1510 }
1511 #endif
1512 
1513 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1514 /* Default MatMult for block size 15 */
1515 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1516 {
1517   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1518   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1519   const PetscScalar *x, *xb;
1520   PetscScalar       *zarray, xv;
1521   const MatScalar   *v;
1522   const PetscInt    *ii, *ij = a->j, *idx;
1523   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1524   PetscBool          usecprow = a->compressedrow.use;
1525 
1526   PetscFunctionBegin;
1527   PetscCall(VecGetArrayRead(xx, &x));
1528   PetscCall(VecGetArrayWrite(zz, &zarray));
1529 
1530   v = a->a;
1531   if (usecprow) {
1532     mbs  = a->compressedrow.nrows;
1533     ii   = a->compressedrow.i;
1534     ridx = a->compressedrow.rindex;
1535     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1536   } else {
1537     mbs = a->mbs;
1538     ii  = a->i;
1539     z   = zarray;
1540   }
1541 
1542   for (i = 0; i < mbs; i++) {
1543     n     = ii[i + 1] - ii[i];
1544     idx   = ij + ii[i];
1545     sum1  = 0.0;
1546     sum2  = 0.0;
1547     sum3  = 0.0;
1548     sum4  = 0.0;
1549     sum5  = 0.0;
1550     sum6  = 0.0;
1551     sum7  = 0.0;
1552     sum8  = 0.0;
1553     sum9  = 0.0;
1554     sum10 = 0.0;
1555     sum11 = 0.0;
1556     sum12 = 0.0;
1557     sum13 = 0.0;
1558     sum14 = 0.0;
1559     sum15 = 0.0;
1560 
1561     for (j = 0; j < n; j++) {
1562       xb = x + 15 * (idx[j]);
1563 
1564       for (k = 0; k < 15; k++) {
1565         xv = xb[k];
1566         sum1 += v[0] * xv;
1567         sum2 += v[1] * xv;
1568         sum3 += v[2] * xv;
1569         sum4 += v[3] * xv;
1570         sum5 += v[4] * xv;
1571         sum6 += v[5] * xv;
1572         sum7 += v[6] * xv;
1573         sum8 += v[7] * xv;
1574         sum9 += v[8] * xv;
1575         sum10 += v[9] * xv;
1576         sum11 += v[10] * xv;
1577         sum12 += v[11] * xv;
1578         sum13 += v[12] * xv;
1579         sum14 += v[13] * xv;
1580         sum15 += v[14] * xv;
1581         v += 15;
1582       }
1583     }
1584     if (usecprow) z = zarray + 15 * ridx[i];
1585     z[0]  = sum1;
1586     z[1]  = sum2;
1587     z[2]  = sum3;
1588     z[3]  = sum4;
1589     z[4]  = sum5;
1590     z[5]  = sum6;
1591     z[6]  = sum7;
1592     z[7]  = sum8;
1593     z[8]  = sum9;
1594     z[9]  = sum10;
1595     z[10] = sum11;
1596     z[11] = sum12;
1597     z[12] = sum13;
1598     z[13] = sum14;
1599     z[14] = sum15;
1600 
1601     if (!usecprow) z += 15;
1602   }
1603 
1604   PetscCall(VecRestoreArrayRead(xx, &x));
1605   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1606   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1607   PetscFunctionReturn(PETSC_SUCCESS);
1608 }
1609 
1610 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1611 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1612 {
1613   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1614   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1615   const PetscScalar *x, *xb;
1616   PetscScalar        x1, x2, x3, x4, *zarray;
1617   const MatScalar   *v;
1618   const PetscInt    *ii, *ij = a->j, *idx;
1619   PetscInt           mbs, i, j, n, *ridx = NULL;
1620   PetscBool          usecprow = a->compressedrow.use;
1621 
1622   PetscFunctionBegin;
1623   PetscCall(VecGetArrayRead(xx, &x));
1624   PetscCall(VecGetArrayWrite(zz, &zarray));
1625 
1626   v = a->a;
1627   if (usecprow) {
1628     mbs  = a->compressedrow.nrows;
1629     ii   = a->compressedrow.i;
1630     ridx = a->compressedrow.rindex;
1631     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1632   } else {
1633     mbs = a->mbs;
1634     ii  = a->i;
1635     z   = zarray;
1636   }
1637 
1638   for (i = 0; i < mbs; i++) {
1639     n     = ii[i + 1] - ii[i];
1640     idx   = ij + ii[i];
1641     sum1  = 0.0;
1642     sum2  = 0.0;
1643     sum3  = 0.0;
1644     sum4  = 0.0;
1645     sum5  = 0.0;
1646     sum6  = 0.0;
1647     sum7  = 0.0;
1648     sum8  = 0.0;
1649     sum9  = 0.0;
1650     sum10 = 0.0;
1651     sum11 = 0.0;
1652     sum12 = 0.0;
1653     sum13 = 0.0;
1654     sum14 = 0.0;
1655     sum15 = 0.0;
1656 
1657     for (j = 0; j < n; j++) {
1658       xb = x + 15 * (idx[j]);
1659       x1 = xb[0];
1660       x2 = xb[1];
1661       x3 = xb[2];
1662       x4 = xb[3];
1663 
1664       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1665       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1666       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1667       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1668       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1669       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1670       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1671       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1672       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1673       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1674       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1675       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1676       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1677       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1678       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1679 
1680       v += 60;
1681 
1682       x1 = xb[4];
1683       x2 = xb[5];
1684       x3 = xb[6];
1685       x4 = xb[7];
1686 
1687       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1688       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1689       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1690       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1691       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1692       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1693       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1694       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1695       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1696       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1697       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1698       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1699       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1700       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1701       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1702       v += 60;
1703 
1704       x1 = xb[8];
1705       x2 = xb[9];
1706       x3 = xb[10];
1707       x4 = xb[11];
1708       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1709       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1710       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1711       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1712       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1713       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1714       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1715       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1716       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1717       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1718       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1719       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1720       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1721       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1722       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1723       v += 60;
1724 
1725       x1 = xb[12];
1726       x2 = xb[13];
1727       x3 = xb[14];
1728       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1729       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1730       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1731       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1732       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1733       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1734       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1735       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1736       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1737       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1738       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1739       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1740       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1741       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1742       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1743       v += 45;
1744     }
1745     if (usecprow) z = zarray + 15 * ridx[i];
1746     z[0]  = sum1;
1747     z[1]  = sum2;
1748     z[2]  = sum3;
1749     z[3]  = sum4;
1750     z[4]  = sum5;
1751     z[5]  = sum6;
1752     z[6]  = sum7;
1753     z[7]  = sum8;
1754     z[8]  = sum9;
1755     z[9]  = sum10;
1756     z[10] = sum11;
1757     z[11] = sum12;
1758     z[12] = sum13;
1759     z[13] = sum14;
1760     z[14] = sum15;
1761 
1762     if (!usecprow) z += 15;
1763   }
1764 
1765   PetscCall(VecRestoreArrayRead(xx, &x));
1766   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1767   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1768   PetscFunctionReturn(PETSC_SUCCESS);
1769 }
1770 
1771 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1772 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1773 {
1774   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1775   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1776   const PetscScalar *x, *xb;
1777   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1778   const MatScalar   *v;
1779   const PetscInt    *ii, *ij = a->j, *idx;
1780   PetscInt           mbs, i, j, n, *ridx = NULL;
1781   PetscBool          usecprow = a->compressedrow.use;
1782 
1783   PetscFunctionBegin;
1784   PetscCall(VecGetArrayRead(xx, &x));
1785   PetscCall(VecGetArrayWrite(zz, &zarray));
1786 
1787   v = a->a;
1788   if (usecprow) {
1789     mbs  = a->compressedrow.nrows;
1790     ii   = a->compressedrow.i;
1791     ridx = a->compressedrow.rindex;
1792     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1793   } else {
1794     mbs = a->mbs;
1795     ii  = a->i;
1796     z   = zarray;
1797   }
1798 
1799   for (i = 0; i < mbs; i++) {
1800     n     = ii[i + 1] - ii[i];
1801     idx   = ij + ii[i];
1802     sum1  = 0.0;
1803     sum2  = 0.0;
1804     sum3  = 0.0;
1805     sum4  = 0.0;
1806     sum5  = 0.0;
1807     sum6  = 0.0;
1808     sum7  = 0.0;
1809     sum8  = 0.0;
1810     sum9  = 0.0;
1811     sum10 = 0.0;
1812     sum11 = 0.0;
1813     sum12 = 0.0;
1814     sum13 = 0.0;
1815     sum14 = 0.0;
1816     sum15 = 0.0;
1817 
1818     for (j = 0; j < n; j++) {
1819       xb = x + 15 * (idx[j]);
1820       x1 = xb[0];
1821       x2 = xb[1];
1822       x3 = xb[2];
1823       x4 = xb[3];
1824       x5 = xb[4];
1825       x6 = xb[5];
1826       x7 = xb[6];
1827       x8 = xb[7];
1828 
1829       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1830       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1831       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1832       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1833       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1834       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1835       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1836       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1837       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1838       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1839       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1840       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1841       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1842       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1843       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1844       v += 120;
1845 
1846       x1 = xb[8];
1847       x2 = xb[9];
1848       x3 = xb[10];
1849       x4 = xb[11];
1850       x5 = xb[12];
1851       x6 = xb[13];
1852       x7 = xb[14];
1853 
1854       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1855       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1856       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1857       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1858       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1859       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1860       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1861       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1862       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1863       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1864       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1865       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1866       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1867       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1868       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1869       v += 105;
1870     }
1871     if (usecprow) z = zarray + 15 * ridx[i];
1872     z[0]  = sum1;
1873     z[1]  = sum2;
1874     z[2]  = sum3;
1875     z[3]  = sum4;
1876     z[4]  = sum5;
1877     z[5]  = sum6;
1878     z[6]  = sum7;
1879     z[7]  = sum8;
1880     z[8]  = sum9;
1881     z[9]  = sum10;
1882     z[10] = sum11;
1883     z[11] = sum12;
1884     z[12] = sum13;
1885     z[13] = sum14;
1886     z[14] = sum15;
1887 
1888     if (!usecprow) z += 15;
1889   }
1890 
1891   PetscCall(VecRestoreArrayRead(xx, &x));
1892   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1893   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1894   PetscFunctionReturn(PETSC_SUCCESS);
1895 }
1896 
1897 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1898 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1899 {
1900   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1901   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1902   const PetscScalar *x, *xb;
1903   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1904   const MatScalar   *v;
1905   const PetscInt    *ii, *ij = a->j, *idx;
1906   PetscInt           mbs, i, j, n, *ridx = NULL;
1907   PetscBool          usecprow = a->compressedrow.use;
1908 
1909   PetscFunctionBegin;
1910   PetscCall(VecGetArrayRead(xx, &x));
1911   PetscCall(VecGetArrayWrite(zz, &zarray));
1912 
1913   v = a->a;
1914   if (usecprow) {
1915     mbs  = a->compressedrow.nrows;
1916     ii   = a->compressedrow.i;
1917     ridx = a->compressedrow.rindex;
1918     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1919   } else {
1920     mbs = a->mbs;
1921     ii  = a->i;
1922     z   = zarray;
1923   }
1924 
1925   for (i = 0; i < mbs; i++) {
1926     n     = ii[i + 1] - ii[i];
1927     idx   = ij + ii[i];
1928     sum1  = 0.0;
1929     sum2  = 0.0;
1930     sum3  = 0.0;
1931     sum4  = 0.0;
1932     sum5  = 0.0;
1933     sum6  = 0.0;
1934     sum7  = 0.0;
1935     sum8  = 0.0;
1936     sum9  = 0.0;
1937     sum10 = 0.0;
1938     sum11 = 0.0;
1939     sum12 = 0.0;
1940     sum13 = 0.0;
1941     sum14 = 0.0;
1942     sum15 = 0.0;
1943 
1944     for (j = 0; j < n; j++) {
1945       xb  = x + 15 * (idx[j]);
1946       x1  = xb[0];
1947       x2  = xb[1];
1948       x3  = xb[2];
1949       x4  = xb[3];
1950       x5  = xb[4];
1951       x6  = xb[5];
1952       x7  = xb[6];
1953       x8  = xb[7];
1954       x9  = xb[8];
1955       x10 = xb[9];
1956       x11 = xb[10];
1957       x12 = xb[11];
1958       x13 = xb[12];
1959       x14 = xb[13];
1960       x15 = xb[14];
1961 
1962       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1963       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1964       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1965       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1966       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1967       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1968       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1969       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1970       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1971       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1972       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1973       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1974       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1975       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1976       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1977       v += 225;
1978     }
1979     if (usecprow) z = zarray + 15 * ridx[i];
1980     z[0]  = sum1;
1981     z[1]  = sum2;
1982     z[2]  = sum3;
1983     z[3]  = sum4;
1984     z[4]  = sum5;
1985     z[5]  = sum6;
1986     z[6]  = sum7;
1987     z[7]  = sum8;
1988     z[8]  = sum9;
1989     z[9]  = sum10;
1990     z[10] = sum11;
1991     z[11] = sum12;
1992     z[12] = sum13;
1993     z[13] = sum14;
1994     z[14] = sum15;
1995 
1996     if (!usecprow) z += 15;
1997   }
1998 
1999   PetscCall(VecRestoreArrayRead(xx, &x));
2000   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2001   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
2002   PetscFunctionReturn(PETSC_SUCCESS);
2003 }
2004 
2005 /*
2006     This will not work with MatScalar == float because it calls the BLAS
2007 */
2008 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
2009 {
2010   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2011   PetscScalar       *z = NULL, *work, *workt, *zarray;
2012   const PetscScalar *x, *xb;
2013   const MatScalar   *v;
2014   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2015   const PetscInt    *idx, *ii, *ridx = NULL;
2016   PetscInt           ncols, k;
2017   PetscBool          usecprow = a->compressedrow.use;
2018 
2019   PetscFunctionBegin;
2020   PetscCall(VecGetArrayRead(xx, &x));
2021   PetscCall(VecGetArrayWrite(zz, &zarray));
2022 
2023   idx = a->j;
2024   v   = a->a;
2025   if (usecprow) {
2026     mbs  = a->compressedrow.nrows;
2027     ii   = a->compressedrow.i;
2028     ridx = a->compressedrow.rindex;
2029     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2030   } else {
2031     mbs = a->mbs;
2032     ii  = a->i;
2033     z   = zarray;
2034   }
2035 
2036   if (!a->mult_work) {
2037     k = PetscMax(A->rmap->n, A->cmap->n);
2038     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2039   }
2040   work = a->mult_work;
2041   for (i = 0; i < mbs; i++) {
2042     n = ii[1] - ii[0];
2043     ii++;
2044     ncols = n * bs;
2045     workt = work;
2046     for (j = 0; j < n; j++) {
2047       xb = x + bs * (*idx++);
2048       for (k = 0; k < bs; k++) workt[k] = xb[k];
2049       workt += bs;
2050     }
2051     if (usecprow) z = zarray + bs * ridx[i];
2052     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2053     v += n * bs2;
2054     if (!usecprow) z += bs;
2055   }
2056   PetscCall(VecRestoreArrayRead(xx, &x));
2057   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2058   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2059   PetscFunctionReturn(PETSC_SUCCESS);
2060 }
2061 
2062 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2063 {
2064   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2065   const PetscScalar *x;
2066   PetscScalar       *y, *z, sum;
2067   const MatScalar   *v;
2068   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2069   const PetscInt    *idx, *ii;
2070   PetscBool          usecprow = a->compressedrow.use;
2071 
2072   PetscFunctionBegin;
2073   PetscCall(VecGetArrayRead(xx, &x));
2074   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
2075 
2076   idx = a->j;
2077   v   = a->a;
2078   if (usecprow) {
2079     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2080     mbs  = a->compressedrow.nrows;
2081     ii   = a->compressedrow.i;
2082     ridx = a->compressedrow.rindex;
2083   } else {
2084     ii = a->i;
2085   }
2086 
2087   for (i = 0; i < mbs; i++) {
2088     n = ii[1] - ii[0];
2089     ii++;
2090     if (!usecprow) {
2091       sum = y[i];
2092     } else {
2093       sum = y[ridx[i]];
2094     }
2095     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2096     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2097     PetscSparseDensePlusDot(sum, x, v, idx, n);
2098     v += n;
2099     idx += n;
2100     if (usecprow) {
2101       z[ridx[i]] = sum;
2102     } else {
2103       z[i] = sum;
2104     }
2105   }
2106   PetscCall(VecRestoreArrayRead(xx, &x));
2107   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2108   PetscCall(PetscLogFlops(2.0 * a->nz));
2109   PetscFunctionReturn(PETSC_SUCCESS);
2110 }
2111 
2112 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2113 {
2114   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2115   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2116   const PetscScalar *x, *xb;
2117   PetscScalar        x1, x2, *yarray, *zarray;
2118   const MatScalar   *v;
2119   PetscInt           mbs = a->mbs, i, n, j;
2120   const PetscInt    *idx, *ii, *ridx = NULL;
2121   PetscBool          usecprow = a->compressedrow.use;
2122 
2123   PetscFunctionBegin;
2124   PetscCall(VecGetArrayRead(xx, &x));
2125   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2126 
2127   idx = a->j;
2128   v   = a->a;
2129   if (usecprow) {
2130     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2131     mbs  = a->compressedrow.nrows;
2132     ii   = a->compressedrow.i;
2133     ridx = a->compressedrow.rindex;
2134   } else {
2135     ii = a->i;
2136     y  = yarray;
2137     z  = zarray;
2138   }
2139 
2140   for (i = 0; i < mbs; i++) {
2141     n = ii[1] - ii[0];
2142     ii++;
2143     if (usecprow) {
2144       z = zarray + 2 * ridx[i];
2145       y = yarray + 2 * ridx[i];
2146     }
2147     sum1 = y[0];
2148     sum2 = y[1];
2149     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2150     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2151     for (j = 0; j < n; j++) {
2152       xb = x + 2 * (*idx++);
2153       x1 = xb[0];
2154       x2 = xb[1];
2155 
2156       sum1 += v[0] * x1 + v[2] * x2;
2157       sum2 += v[1] * x1 + v[3] * x2;
2158       v += 4;
2159     }
2160     z[0] = sum1;
2161     z[1] = sum2;
2162     if (!usecprow) {
2163       z += 2;
2164       y += 2;
2165     }
2166   }
2167   PetscCall(VecRestoreArrayRead(xx, &x));
2168   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2169   PetscCall(PetscLogFlops(4.0 * a->nz));
2170   PetscFunctionReturn(PETSC_SUCCESS);
2171 }
2172 
2173 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2174 {
2175   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2176   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2177   const PetscScalar *x, *xb;
2178   const MatScalar   *v;
2179   PetscInt           mbs = a->mbs, i, j, n;
2180   const PetscInt    *idx, *ii, *ridx = NULL;
2181   PetscBool          usecprow = a->compressedrow.use;
2182 
2183   PetscFunctionBegin;
2184   PetscCall(VecGetArrayRead(xx, &x));
2185   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2186 
2187   idx = a->j;
2188   v   = a->a;
2189   if (usecprow) {
2190     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2191     mbs  = a->compressedrow.nrows;
2192     ii   = a->compressedrow.i;
2193     ridx = a->compressedrow.rindex;
2194   } else {
2195     ii = a->i;
2196     y  = yarray;
2197     z  = zarray;
2198   }
2199 
2200   for (i = 0; i < mbs; i++) {
2201     n = ii[1] - ii[0];
2202     ii++;
2203     if (usecprow) {
2204       z = zarray + 3 * ridx[i];
2205       y = yarray + 3 * ridx[i];
2206     }
2207     sum1 = y[0];
2208     sum2 = y[1];
2209     sum3 = y[2];
2210     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2211     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2212     for (j = 0; j < n; j++) {
2213       xb = x + 3 * (*idx++);
2214       x1 = xb[0];
2215       x2 = xb[1];
2216       x3 = xb[2];
2217       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2218       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2219       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2220       v += 9;
2221     }
2222     z[0] = sum1;
2223     z[1] = sum2;
2224     z[2] = sum3;
2225     if (!usecprow) {
2226       z += 3;
2227       y += 3;
2228     }
2229   }
2230   PetscCall(VecRestoreArrayRead(xx, &x));
2231   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2232   PetscCall(PetscLogFlops(18.0 * a->nz));
2233   PetscFunctionReturn(PETSC_SUCCESS);
2234 }
2235 
2236 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2237 {
2238   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2239   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2240   const PetscScalar *x, *xb;
2241   const MatScalar   *v;
2242   PetscInt           mbs = a->mbs, i, j, n;
2243   const PetscInt    *idx, *ii, *ridx = NULL;
2244   PetscBool          usecprow = a->compressedrow.use;
2245 
2246   PetscFunctionBegin;
2247   PetscCall(VecGetArrayRead(xx, &x));
2248   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2249 
2250   idx = a->j;
2251   v   = a->a;
2252   if (usecprow) {
2253     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2254     mbs  = a->compressedrow.nrows;
2255     ii   = a->compressedrow.i;
2256     ridx = a->compressedrow.rindex;
2257   } else {
2258     ii = a->i;
2259     y  = yarray;
2260     z  = zarray;
2261   }
2262 
2263   for (i = 0; i < mbs; i++) {
2264     n = ii[1] - ii[0];
2265     ii++;
2266     if (usecprow) {
2267       z = zarray + 4 * ridx[i];
2268       y = yarray + 4 * ridx[i];
2269     }
2270     sum1 = y[0];
2271     sum2 = y[1];
2272     sum3 = y[2];
2273     sum4 = y[3];
2274     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2275     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2276     for (j = 0; j < n; j++) {
2277       xb = x + 4 * (*idx++);
2278       x1 = xb[0];
2279       x2 = xb[1];
2280       x3 = xb[2];
2281       x4 = xb[3];
2282       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2283       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2284       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2285       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2286       v += 16;
2287     }
2288     z[0] = sum1;
2289     z[1] = sum2;
2290     z[2] = sum3;
2291     z[3] = sum4;
2292     if (!usecprow) {
2293       z += 4;
2294       y += 4;
2295     }
2296   }
2297   PetscCall(VecRestoreArrayRead(xx, &x));
2298   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2299   PetscCall(PetscLogFlops(32.0 * a->nz));
2300   PetscFunctionReturn(PETSC_SUCCESS);
2301 }
2302 
2303 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2304 {
2305   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2306   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2307   const PetscScalar *x, *xb;
2308   PetscScalar       *yarray, *zarray;
2309   const MatScalar   *v;
2310   PetscInt           mbs = a->mbs, i, j, n;
2311   const PetscInt    *idx, *ii, *ridx = NULL;
2312   PetscBool          usecprow = a->compressedrow.use;
2313 
2314   PetscFunctionBegin;
2315   PetscCall(VecGetArrayRead(xx, &x));
2316   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2317 
2318   idx = a->j;
2319   v   = a->a;
2320   if (usecprow) {
2321     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2322     mbs  = a->compressedrow.nrows;
2323     ii   = a->compressedrow.i;
2324     ridx = a->compressedrow.rindex;
2325   } else {
2326     ii = a->i;
2327     y  = yarray;
2328     z  = zarray;
2329   }
2330 
2331   for (i = 0; i < mbs; i++) {
2332     n = ii[1] - ii[0];
2333     ii++;
2334     if (usecprow) {
2335       z = zarray + 5 * ridx[i];
2336       y = yarray + 5 * ridx[i];
2337     }
2338     sum1 = y[0];
2339     sum2 = y[1];
2340     sum3 = y[2];
2341     sum4 = y[3];
2342     sum5 = y[4];
2343     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2344     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2345     for (j = 0; j < n; j++) {
2346       xb = x + 5 * (*idx++);
2347       x1 = xb[0];
2348       x2 = xb[1];
2349       x3 = xb[2];
2350       x4 = xb[3];
2351       x5 = xb[4];
2352       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2353       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2354       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2355       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2356       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2357       v += 25;
2358     }
2359     z[0] = sum1;
2360     z[1] = sum2;
2361     z[2] = sum3;
2362     z[3] = sum4;
2363     z[4] = sum5;
2364     if (!usecprow) {
2365       z += 5;
2366       y += 5;
2367     }
2368   }
2369   PetscCall(VecRestoreArrayRead(xx, &x));
2370   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2371   PetscCall(PetscLogFlops(50.0 * a->nz));
2372   PetscFunctionReturn(PETSC_SUCCESS);
2373 }
2374 
2375 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2376 {
2377   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2378   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2379   const PetscScalar *x, *xb;
2380   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2381   const MatScalar   *v;
2382   PetscInt           mbs = a->mbs, i, j, n;
2383   const PetscInt    *idx, *ii, *ridx = NULL;
2384   PetscBool          usecprow = a->compressedrow.use;
2385 
2386   PetscFunctionBegin;
2387   PetscCall(VecGetArrayRead(xx, &x));
2388   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2389 
2390   idx = a->j;
2391   v   = a->a;
2392   if (usecprow) {
2393     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2394     mbs  = a->compressedrow.nrows;
2395     ii   = a->compressedrow.i;
2396     ridx = a->compressedrow.rindex;
2397   } else {
2398     ii = a->i;
2399     y  = yarray;
2400     z  = zarray;
2401   }
2402 
2403   for (i = 0; i < mbs; i++) {
2404     n = ii[1] - ii[0];
2405     ii++;
2406     if (usecprow) {
2407       z = zarray + 6 * ridx[i];
2408       y = yarray + 6 * ridx[i];
2409     }
2410     sum1 = y[0];
2411     sum2 = y[1];
2412     sum3 = y[2];
2413     sum4 = y[3];
2414     sum5 = y[4];
2415     sum6 = y[5];
2416     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2417     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2418     for (j = 0; j < n; j++) {
2419       xb = x + 6 * (*idx++);
2420       x1 = xb[0];
2421       x2 = xb[1];
2422       x3 = xb[2];
2423       x4 = xb[3];
2424       x5 = xb[4];
2425       x6 = xb[5];
2426       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2427       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2428       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2429       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2430       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2431       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2432       v += 36;
2433     }
2434     z[0] = sum1;
2435     z[1] = sum2;
2436     z[2] = sum3;
2437     z[3] = sum4;
2438     z[4] = sum5;
2439     z[5] = sum6;
2440     if (!usecprow) {
2441       z += 6;
2442       y += 6;
2443     }
2444   }
2445   PetscCall(VecRestoreArrayRead(xx, &x));
2446   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2447   PetscCall(PetscLogFlops(72.0 * a->nz));
2448   PetscFunctionReturn(PETSC_SUCCESS);
2449 }
2450 
2451 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2452 {
2453   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2454   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2455   const PetscScalar *x, *xb;
2456   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2457   const MatScalar   *v;
2458   PetscInt           mbs = a->mbs, i, j, n;
2459   const PetscInt    *idx, *ii, *ridx = NULL;
2460   PetscBool          usecprow = a->compressedrow.use;
2461 
2462   PetscFunctionBegin;
2463   PetscCall(VecGetArrayRead(xx, &x));
2464   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2465 
2466   idx = a->j;
2467   v   = a->a;
2468   if (usecprow) {
2469     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2470     mbs  = a->compressedrow.nrows;
2471     ii   = a->compressedrow.i;
2472     ridx = a->compressedrow.rindex;
2473   } else {
2474     ii = a->i;
2475     y  = yarray;
2476     z  = zarray;
2477   }
2478 
2479   for (i = 0; i < mbs; i++) {
2480     n = ii[1] - ii[0];
2481     ii++;
2482     if (usecprow) {
2483       z = zarray + 7 * ridx[i];
2484       y = yarray + 7 * ridx[i];
2485     }
2486     sum1 = y[0];
2487     sum2 = y[1];
2488     sum3 = y[2];
2489     sum4 = y[3];
2490     sum5 = y[4];
2491     sum6 = y[5];
2492     sum7 = y[6];
2493     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2494     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2495     for (j = 0; j < n; j++) {
2496       xb = x + 7 * (*idx++);
2497       x1 = xb[0];
2498       x2 = xb[1];
2499       x3 = xb[2];
2500       x4 = xb[3];
2501       x5 = xb[4];
2502       x6 = xb[5];
2503       x7 = xb[6];
2504       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2505       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2506       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2507       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2508       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2509       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2510       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2511       v += 49;
2512     }
2513     z[0] = sum1;
2514     z[1] = sum2;
2515     z[2] = sum3;
2516     z[3] = sum4;
2517     z[4] = sum5;
2518     z[5] = sum6;
2519     z[6] = sum7;
2520     if (!usecprow) {
2521       z += 7;
2522       y += 7;
2523     }
2524   }
2525   PetscCall(VecRestoreArrayRead(xx, &x));
2526   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2527   PetscCall(PetscLogFlops(98.0 * a->nz));
2528   PetscFunctionReturn(PETSC_SUCCESS);
2529 }
2530 
2531 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2532 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2533 {
2534   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2535   PetscScalar       *z = NULL, *work, *workt, *zarray;
2536   const PetscScalar *x, *xb;
2537   const MatScalar   *v;
2538   PetscInt           mbs, i, j, n;
2539   PetscInt           k;
2540   PetscBool          usecprow = a->compressedrow.use;
2541   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2542 
2543   __m256d a0, a1, a2, a3, a4, a5;
2544   __m256d w0, w1, w2, w3;
2545   __m256d z0, z1, z2;
2546   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2547 
2548   PetscFunctionBegin;
2549   PetscCall(VecCopy(yy, zz));
2550   PetscCall(VecGetArrayRead(xx, &x));
2551   PetscCall(VecGetArray(zz, &zarray));
2552 
2553   idx = a->j;
2554   v   = a->a;
2555   if (usecprow) {
2556     mbs  = a->compressedrow.nrows;
2557     ii   = a->compressedrow.i;
2558     ridx = a->compressedrow.rindex;
2559   } else {
2560     mbs = a->mbs;
2561     ii  = a->i;
2562     z   = zarray;
2563   }
2564 
2565   if (!a->mult_work) {
2566     k = PetscMax(A->rmap->n, A->cmap->n);
2567     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2568   }
2569 
2570   work = a->mult_work;
2571   for (i = 0; i < mbs; i++) {
2572     n = ii[1] - ii[0];
2573     ii++;
2574     workt = work;
2575     for (j = 0; j < n; j++) {
2576       xb = x + bs * (*idx++);
2577       for (k = 0; k < bs; k++) workt[k] = xb[k];
2578       workt += bs;
2579     }
2580     if (usecprow) z = zarray + bs * ridx[i];
2581 
2582     z0 = _mm256_loadu_pd(&z[0]);
2583     z1 = _mm256_loadu_pd(&z[4]);
2584     z2 = _mm256_set1_pd(z[8]);
2585 
2586     for (j = 0; j < n; j++) {
2587       /* first column of a */
2588       w0 = _mm256_set1_pd(work[j * 9]);
2589       a0 = _mm256_loadu_pd(&v[j * 81]);
2590       z0 = _mm256_fmadd_pd(a0, w0, z0);
2591       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2592       z1 = _mm256_fmadd_pd(a1, w0, z1);
2593       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2594       z2 = _mm256_fmadd_pd(a2, w0, z2);
2595 
2596       /* second column of a */
2597       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2598       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2599       z0 = _mm256_fmadd_pd(a0, w1, z0);
2600       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2601       z1 = _mm256_fmadd_pd(a1, w1, z1);
2602       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2603       z2 = _mm256_fmadd_pd(a2, w1, z2);
2604 
2605       /* third column of a */
2606       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2607       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2608       z0 = _mm256_fmadd_pd(a3, w2, z0);
2609       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2610       z1 = _mm256_fmadd_pd(a4, w2, z1);
2611       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2612       z2 = _mm256_fmadd_pd(a5, w2, z2);
2613 
2614       /* fourth column of a */
2615       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2616       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2617       z0 = _mm256_fmadd_pd(a0, w3, z0);
2618       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2619       z1 = _mm256_fmadd_pd(a1, w3, z1);
2620       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2621       z2 = _mm256_fmadd_pd(a2, w3, z2);
2622 
2623       /* fifth column of a */
2624       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2625       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2626       z0 = _mm256_fmadd_pd(a3, w0, z0);
2627       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2628       z1 = _mm256_fmadd_pd(a4, w0, z1);
2629       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2630       z2 = _mm256_fmadd_pd(a5, w0, z2);
2631 
2632       /* sixth column of a */
2633       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2634       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2635       z0 = _mm256_fmadd_pd(a0, w1, z0);
2636       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2637       z1 = _mm256_fmadd_pd(a1, w1, z1);
2638       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2639       z2 = _mm256_fmadd_pd(a2, w1, z2);
2640 
2641       /* seventh column of a */
2642       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2643       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2644       z0 = _mm256_fmadd_pd(a0, w2, z0);
2645       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2646       z1 = _mm256_fmadd_pd(a1, w2, z1);
2647       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2648       z2 = _mm256_fmadd_pd(a2, w2, z2);
2649 
2650       /* eighth column of a */
2651       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2652       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2653       z0 = _mm256_fmadd_pd(a3, w3, z0);
2654       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2655       z1 = _mm256_fmadd_pd(a4, w3, z1);
2656       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2657       z2 = _mm256_fmadd_pd(a5, w3, z2);
2658 
2659       /* ninth column of a */
2660       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2661       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2662       z0 = _mm256_fmadd_pd(a0, w0, z0);
2663       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2664       z1 = _mm256_fmadd_pd(a1, w0, z1);
2665       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2666       z2 = _mm256_fmadd_pd(a2, w0, z2);
2667     }
2668 
2669     _mm256_storeu_pd(&z[0], z0);
2670     _mm256_storeu_pd(&z[4], z1);
2671     _mm256_maskstore_pd(&z[8], mask1, z2);
2672 
2673     v += n * bs2;
2674     if (!usecprow) z += bs;
2675   }
2676   PetscCall(VecRestoreArrayRead(xx, &x));
2677   PetscCall(VecRestoreArray(zz, &zarray));
2678   PetscCall(PetscLogFlops(162.0 * a->nz));
2679   PetscFunctionReturn(PETSC_SUCCESS);
2680 }
2681 #endif
2682 
2683 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2684 {
2685   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2686   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2687   const PetscScalar *x, *xb;
2688   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2689   const MatScalar   *v;
2690   PetscInt           mbs = a->mbs, i, j, n;
2691   const PetscInt    *idx, *ii, *ridx = NULL;
2692   PetscBool          usecprow = a->compressedrow.use;
2693 
2694   PetscFunctionBegin;
2695   PetscCall(VecGetArrayRead(xx, &x));
2696   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2697 
2698   idx = a->j;
2699   v   = a->a;
2700   if (usecprow) {
2701     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2702     mbs  = a->compressedrow.nrows;
2703     ii   = a->compressedrow.i;
2704     ridx = a->compressedrow.rindex;
2705   } else {
2706     ii = a->i;
2707     y  = yarray;
2708     z  = zarray;
2709   }
2710 
2711   for (i = 0; i < mbs; i++) {
2712     n = ii[1] - ii[0];
2713     ii++;
2714     if (usecprow) {
2715       z = zarray + 11 * ridx[i];
2716       y = yarray + 11 * ridx[i];
2717     }
2718     sum1  = y[0];
2719     sum2  = y[1];
2720     sum3  = y[2];
2721     sum4  = y[3];
2722     sum5  = y[4];
2723     sum6  = y[5];
2724     sum7  = y[6];
2725     sum8  = y[7];
2726     sum9  = y[8];
2727     sum10 = y[9];
2728     sum11 = y[10];
2729     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2730     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2731     for (j = 0; j < n; j++) {
2732       xb  = x + 11 * (*idx++);
2733       x1  = xb[0];
2734       x2  = xb[1];
2735       x3  = xb[2];
2736       x4  = xb[3];
2737       x5  = xb[4];
2738       x6  = xb[5];
2739       x7  = xb[6];
2740       x8  = xb[7];
2741       x9  = xb[8];
2742       x10 = xb[9];
2743       x11 = xb[10];
2744       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2745       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2746       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2747       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2748       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2749       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2750       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2751       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2752       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2753       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2754       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2755       v += 121;
2756     }
2757     z[0]  = sum1;
2758     z[1]  = sum2;
2759     z[2]  = sum3;
2760     z[3]  = sum4;
2761     z[4]  = sum5;
2762     z[5]  = sum6;
2763     z[6]  = sum7;
2764     z[7]  = sum8;
2765     z[8]  = sum9;
2766     z[9]  = sum10;
2767     z[10] = sum11;
2768     if (!usecprow) {
2769       z += 11;
2770       y += 11;
2771     }
2772   }
2773   PetscCall(VecRestoreArrayRead(xx, &x));
2774   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2775   PetscCall(PetscLogFlops(242.0 * a->nz));
2776   PetscFunctionReturn(PETSC_SUCCESS);
2777 }
2778 
2779 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2780 {
2781   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2782   PetscScalar       *z = NULL, *work, *workt, *zarray;
2783   const PetscScalar *x, *xb;
2784   const MatScalar   *v;
2785   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2786   PetscInt           ncols, k;
2787   const PetscInt    *ridx     = NULL, *idx, *ii;
2788   PetscBool          usecprow = a->compressedrow.use;
2789 
2790   PetscFunctionBegin;
2791   PetscCall(VecCopy(yy, zz));
2792   PetscCall(VecGetArrayRead(xx, &x));
2793   PetscCall(VecGetArray(zz, &zarray));
2794 
2795   idx = a->j;
2796   v   = a->a;
2797   if (usecprow) {
2798     mbs  = a->compressedrow.nrows;
2799     ii   = a->compressedrow.i;
2800     ridx = a->compressedrow.rindex;
2801   } else {
2802     mbs = a->mbs;
2803     ii  = a->i;
2804     z   = zarray;
2805   }
2806 
2807   if (!a->mult_work) {
2808     k = PetscMax(A->rmap->n, A->cmap->n);
2809     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2810   }
2811   work = a->mult_work;
2812   for (i = 0; i < mbs; i++) {
2813     n = ii[1] - ii[0];
2814     ii++;
2815     ncols = n * bs;
2816     workt = work;
2817     for (j = 0; j < n; j++) {
2818       xb = x + bs * (*idx++);
2819       for (k = 0; k < bs; k++) workt[k] = xb[k];
2820       workt += bs;
2821     }
2822     if (usecprow) z = zarray + bs * ridx[i];
2823     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2824     v += n * bs2;
2825     if (!usecprow) z += bs;
2826   }
2827   PetscCall(VecRestoreArrayRead(xx, &x));
2828   PetscCall(VecRestoreArray(zz, &zarray));
2829   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2830   PetscFunctionReturn(PETSC_SUCCESS);
2831 }
2832 
2833 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2834 {
2835   PetscScalar zero = 0.0;
2836 
2837   PetscFunctionBegin;
2838   PetscCall(VecSet(zz, zero));
2839   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2840   PetscFunctionReturn(PETSC_SUCCESS);
2841 }
2842 
2843 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2844 {
2845   PetscScalar zero = 0.0;
2846 
2847   PetscFunctionBegin;
2848   PetscCall(VecSet(zz, zero));
2849   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2850   PetscFunctionReturn(PETSC_SUCCESS);
2851 }
2852 
2853 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2854 {
2855   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2856   PetscScalar       *z, x1, x2, x3, x4, x5;
2857   const PetscScalar *x, *xb = NULL;
2858   const MatScalar   *v;
2859   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2860   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2861   Mat_CompressedRow  cprow    = a->compressedrow;
2862   PetscBool          usecprow = cprow.use;
2863 
2864   PetscFunctionBegin;
2865   if (yy != zz) PetscCall(VecCopy(yy, zz));
2866   PetscCall(VecGetArrayRead(xx, &x));
2867   PetscCall(VecGetArray(zz, &z));
2868 
2869   idx = a->j;
2870   v   = a->a;
2871   if (usecprow) {
2872     mbs  = cprow.nrows;
2873     ii   = cprow.i;
2874     ridx = cprow.rindex;
2875   } else {
2876     mbs = a->mbs;
2877     ii  = a->i;
2878     xb  = x;
2879   }
2880 
2881   switch (bs) {
2882   case 1:
2883     for (i = 0; i < mbs; i++) {
2884       if (usecprow) xb = x + ridx[i];
2885       x1 = xb[0];
2886       ib = idx + ii[0];
2887       n  = ii[1] - ii[0];
2888       ii++;
2889       for (j = 0; j < n; j++) {
2890         rval = ib[j];
2891         z[rval] += PetscConj(*v) * x1;
2892         v++;
2893       }
2894       if (!usecprow) xb++;
2895     }
2896     break;
2897   case 2:
2898     for (i = 0; i < mbs; i++) {
2899       if (usecprow) xb = x + 2 * ridx[i];
2900       x1 = xb[0];
2901       x2 = xb[1];
2902       ib = idx + ii[0];
2903       n  = ii[1] - ii[0];
2904       ii++;
2905       for (j = 0; j < n; j++) {
2906         rval = ib[j] * 2;
2907         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2908         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2909         v += 4;
2910       }
2911       if (!usecprow) xb += 2;
2912     }
2913     break;
2914   case 3:
2915     for (i = 0; i < mbs; i++) {
2916       if (usecprow) xb = x + 3 * ridx[i];
2917       x1 = xb[0];
2918       x2 = xb[1];
2919       x3 = xb[2];
2920       ib = idx + ii[0];
2921       n  = ii[1] - ii[0];
2922       ii++;
2923       for (j = 0; j < n; j++) {
2924         rval = ib[j] * 3;
2925         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2926         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2927         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2928         v += 9;
2929       }
2930       if (!usecprow) xb += 3;
2931     }
2932     break;
2933   case 4:
2934     for (i = 0; i < mbs; i++) {
2935       if (usecprow) xb = x + 4 * ridx[i];
2936       x1 = xb[0];
2937       x2 = xb[1];
2938       x3 = xb[2];
2939       x4 = xb[3];
2940       ib = idx + ii[0];
2941       n  = ii[1] - ii[0];
2942       ii++;
2943       for (j = 0; j < n; j++) {
2944         rval = ib[j] * 4;
2945         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2946         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2947         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2948         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2949         v += 16;
2950       }
2951       if (!usecprow) xb += 4;
2952     }
2953     break;
2954   case 5:
2955     for (i = 0; i < mbs; i++) {
2956       if (usecprow) xb = x + 5 * ridx[i];
2957       x1 = xb[0];
2958       x2 = xb[1];
2959       x3 = xb[2];
2960       x4 = xb[3];
2961       x5 = xb[4];
2962       ib = idx + ii[0];
2963       n  = ii[1] - ii[0];
2964       ii++;
2965       for (j = 0; j < n; j++) {
2966         rval = ib[j] * 5;
2967         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2968         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2969         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2970         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2971         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2972         v += 25;
2973       }
2974       if (!usecprow) xb += 5;
2975     }
2976     break;
2977   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2978     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2979 #if 0
2980     {
2981       PetscInt          ncols,k,bs2=a->bs2;
2982       PetscScalar       *work,*workt,zb;
2983       const PetscScalar *xtmp;
2984       if (!a->mult_work) {
2985         k    = PetscMax(A->rmap->n,A->cmap->n);
2986         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2987       }
2988       work = a->mult_work;
2989       xtmp = x;
2990       for (i=0; i<mbs; i++) {
2991         n     = ii[1] - ii[0]; ii++;
2992         ncols = n*bs;
2993         PetscCall(PetscArrayzero(work,ncols));
2994         if (usecprow) xtmp = x + bs*ridx[i];
2995         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2996         v += n*bs2;
2997         if (!usecprow) xtmp += bs;
2998         workt = work;
2999         for (j=0; j<n; j++) {
3000           zb = z + bs*(*idx++);
3001           for (k=0; k<bs; k++) zb[k] += workt[k] ;
3002           workt += bs;
3003         }
3004       }
3005     }
3006 #endif
3007   }
3008   PetscCall(VecRestoreArrayRead(xx, &x));
3009   PetscCall(VecRestoreArray(zz, &z));
3010   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3011   PetscFunctionReturn(PETSC_SUCCESS);
3012 }
3013 
3014 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
3015 {
3016   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3017   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
3018   const PetscScalar *x, *xb = NULL;
3019   const MatScalar   *v;
3020   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3021   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
3022   Mat_CompressedRow  cprow    = a->compressedrow;
3023   PetscBool          usecprow = cprow.use;
3024 
3025   PetscFunctionBegin;
3026   if (yy != zz) PetscCall(VecCopy(yy, zz));
3027   PetscCall(VecGetArrayRead(xx, &x));
3028   PetscCall(VecGetArray(zz, &z));
3029 
3030   idx = a->j;
3031   v   = a->a;
3032   if (usecprow) {
3033     mbs  = cprow.nrows;
3034     ii   = cprow.i;
3035     ridx = cprow.rindex;
3036   } else {
3037     mbs = a->mbs;
3038     ii  = a->i;
3039     xb  = x;
3040   }
3041 
3042   switch (bs) {
3043   case 1:
3044     for (i = 0; i < mbs; i++) {
3045       if (usecprow) xb = x + ridx[i];
3046       x1 = xb[0];
3047       ib = idx + ii[0];
3048       n  = ii[1] - ii[0];
3049       ii++;
3050       for (j = 0; j < n; j++) {
3051         rval = ib[j];
3052         z[rval] += *v * x1;
3053         v++;
3054       }
3055       if (!usecprow) xb++;
3056     }
3057     break;
3058   case 2:
3059     for (i = 0; i < mbs; i++) {
3060       if (usecprow) xb = x + 2 * ridx[i];
3061       x1 = xb[0];
3062       x2 = xb[1];
3063       ib = idx + ii[0];
3064       n  = ii[1] - ii[0];
3065       ii++;
3066       for (j = 0; j < n; j++) {
3067         rval = ib[j] * 2;
3068         z[rval++] += v[0] * x1 + v[1] * x2;
3069         z[rval++] += v[2] * x1 + v[3] * x2;
3070         v += 4;
3071       }
3072       if (!usecprow) xb += 2;
3073     }
3074     break;
3075   case 3:
3076     for (i = 0; i < mbs; i++) {
3077       if (usecprow) xb = x + 3 * ridx[i];
3078       x1 = xb[0];
3079       x2 = xb[1];
3080       x3 = xb[2];
3081       ib = idx + ii[0];
3082       n  = ii[1] - ii[0];
3083       ii++;
3084       for (j = 0; j < n; j++) {
3085         rval = ib[j] * 3;
3086         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3087         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3088         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3089         v += 9;
3090       }
3091       if (!usecprow) xb += 3;
3092     }
3093     break;
3094   case 4:
3095     for (i = 0; i < mbs; i++) {
3096       if (usecprow) xb = x + 4 * ridx[i];
3097       x1 = xb[0];
3098       x2 = xb[1];
3099       x3 = xb[2];
3100       x4 = xb[3];
3101       ib = idx + ii[0];
3102       n  = ii[1] - ii[0];
3103       ii++;
3104       for (j = 0; j < n; j++) {
3105         rval = ib[j] * 4;
3106         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3107         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3108         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3109         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3110         v += 16;
3111       }
3112       if (!usecprow) xb += 4;
3113     }
3114     break;
3115   case 5:
3116     for (i = 0; i < mbs; i++) {
3117       if (usecprow) xb = x + 5 * ridx[i];
3118       x1 = xb[0];
3119       x2 = xb[1];
3120       x3 = xb[2];
3121       x4 = xb[3];
3122       x5 = xb[4];
3123       ib = idx + ii[0];
3124       n  = ii[1] - ii[0];
3125       ii++;
3126       for (j = 0; j < n; j++) {
3127         rval = ib[j] * 5;
3128         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3129         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3130         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3131         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3132         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3133         v += 25;
3134       }
3135       if (!usecprow) xb += 5;
3136     }
3137     break;
3138   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3139     PetscInt           ncols, k;
3140     PetscScalar       *work, *workt;
3141     const PetscScalar *xtmp;
3142     if (!a->mult_work) {
3143       k = PetscMax(A->rmap->n, A->cmap->n);
3144       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3145     }
3146     work = a->mult_work;
3147     xtmp = x;
3148     for (i = 0; i < mbs; i++) {
3149       n = ii[1] - ii[0];
3150       ii++;
3151       ncols = n * bs;
3152       PetscCall(PetscArrayzero(work, ncols));
3153       if (usecprow) xtmp = x + bs * ridx[i];
3154       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3155       v += n * bs2;
3156       if (!usecprow) xtmp += bs;
3157       workt = work;
3158       for (j = 0; j < n; j++) {
3159         zb = z + bs * (*idx++);
3160         for (k = 0; k < bs; k++) zb[k] += workt[k];
3161         workt += bs;
3162       }
3163     }
3164   }
3165   }
3166   PetscCall(VecRestoreArrayRead(xx, &x));
3167   PetscCall(VecRestoreArray(zz, &z));
3168   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3169   PetscFunctionReturn(PETSC_SUCCESS);
3170 }
3171 
3172 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3173 {
3174   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3175   PetscInt     totalnz = a->bs2 * a->nz;
3176   PetscScalar  oalpha  = alpha;
3177   PetscBLASInt one     = 1, tnz;
3178 
3179   PetscFunctionBegin;
3180   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3181   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3182   PetscCall(PetscLogFlops(totalnz));
3183   PetscFunctionReturn(PETSC_SUCCESS);
3184 }
3185 
3186 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3187 {
3188   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3189   MatScalar   *v   = a->a;
3190   PetscReal    sum = 0.0;
3191   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3192 
3193   PetscFunctionBegin;
3194   if (type == NORM_FROBENIUS) {
3195 #if defined(PETSC_USE_REAL___FP16)
3196     PetscBLASInt one = 1, cnt = bs2 * nz;
3197     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3198 #else
3199     for (i = 0; i < bs2 * nz; i++) {
3200       sum += PetscRealPart(PetscConj(*v) * (*v));
3201       v++;
3202     }
3203 #endif
3204     *norm = PetscSqrtReal(sum);
3205     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3206   } else if (type == NORM_1) { /* maximum column sum */
3207     PetscReal *tmp;
3208     PetscInt  *bcol = a->j;
3209     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3210     for (i = 0; i < nz; i++) {
3211       for (j = 0; j < bs; j++) {
3212         k1 = bs * (*bcol) + j; /* column index */
3213         for (k = 0; k < bs; k++) {
3214           tmp[k1] += PetscAbsScalar(*v);
3215           v++;
3216         }
3217       }
3218       bcol++;
3219     }
3220     *norm = 0.0;
3221     for (j = 0; j < A->cmap->n; j++) {
3222       if (tmp[j] > *norm) *norm = tmp[j];
3223     }
3224     PetscCall(PetscFree(tmp));
3225     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3226   } else if (type == NORM_INFINITY) { /* maximum row sum */
3227     *norm = 0.0;
3228     for (k = 0; k < bs; k++) {
3229       for (j = 0; j < a->mbs; j++) {
3230         v   = a->a + bs2 * a->i[j] + k;
3231         sum = 0.0;
3232         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3233           for (k1 = 0; k1 < bs; k1++) {
3234             sum += PetscAbsScalar(*v);
3235             v += bs;
3236           }
3237         }
3238         if (sum > *norm) *norm = sum;
3239       }
3240     }
3241     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3242   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3243   PetscFunctionReturn(PETSC_SUCCESS);
3244 }
3245 
3246 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3247 {
3248   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
3249 
3250   PetscFunctionBegin;
3251   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3252   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3253     *flg = PETSC_FALSE;
3254     PetscFunctionReturn(PETSC_SUCCESS);
3255   }
3256 
3257   /* if the a->i are the same */
3258   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3259   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
3260 
3261   /* if a->j are the same */
3262   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3263   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
3264 
3265   /* if a->a are the same */
3266   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3267   PetscFunctionReturn(PETSC_SUCCESS);
3268 }
3269 
3270 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3271 {
3272   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3273   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3274   PetscScalar *x, zero = 0.0;
3275   MatScalar   *aa, *aa_j;
3276 
3277   PetscFunctionBegin;
3278   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3279   bs   = A->rmap->bs;
3280   aa   = a->a;
3281   ai   = a->i;
3282   aj   = a->j;
3283   ambs = a->mbs;
3284   bs2  = a->bs2;
3285 
3286   PetscCall(VecSet(v, zero));
3287   PetscCall(VecGetArray(v, &x));
3288   PetscCall(VecGetLocalSize(v, &n));
3289   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3290   for (i = 0; i < ambs; i++) {
3291     for (j = ai[i]; j < ai[i + 1]; j++) {
3292       if (aj[j] == i) {
3293         row  = i * bs;
3294         aa_j = aa + j * bs2;
3295         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3296         break;
3297       }
3298     }
3299   }
3300   PetscCall(VecRestoreArray(v, &x));
3301   PetscFunctionReturn(PETSC_SUCCESS);
3302 }
3303 
3304 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3305 {
3306   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3307   const PetscScalar *l, *r, *li, *ri;
3308   PetscScalar        x;
3309   MatScalar         *aa, *v;
3310   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3311   const PetscInt    *ai, *aj;
3312 
3313   PetscFunctionBegin;
3314   ai  = a->i;
3315   aj  = a->j;
3316   aa  = a->a;
3317   m   = A->rmap->n;
3318   n   = A->cmap->n;
3319   bs  = A->rmap->bs;
3320   mbs = a->mbs;
3321   bs2 = a->bs2;
3322   if (ll) {
3323     PetscCall(VecGetArrayRead(ll, &l));
3324     PetscCall(VecGetLocalSize(ll, &lm));
3325     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3326     for (i = 0; i < mbs; i++) { /* for each block row */
3327       M  = ai[i + 1] - ai[i];
3328       li = l + i * bs;
3329       v  = aa + bs2 * ai[i];
3330       for (j = 0; j < M; j++) { /* for each block */
3331         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3332       }
3333     }
3334     PetscCall(VecRestoreArrayRead(ll, &l));
3335     PetscCall(PetscLogFlops(a->nz));
3336   }
3337 
3338   if (rr) {
3339     PetscCall(VecGetArrayRead(rr, &r));
3340     PetscCall(VecGetLocalSize(rr, &rn));
3341     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3342     for (i = 0; i < mbs; i++) { /* for each block row */
3343       iai = ai[i];
3344       M   = ai[i + 1] - iai;
3345       v   = aa + bs2 * iai;
3346       for (j = 0; j < M; j++) { /* for each block */
3347         ri = r + bs * aj[iai + j];
3348         for (k = 0; k < bs; k++) {
3349           x = ri[k];
3350           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3351           v += bs;
3352         }
3353       }
3354     }
3355     PetscCall(VecRestoreArrayRead(rr, &r));
3356     PetscCall(PetscLogFlops(a->nz));
3357   }
3358   PetscFunctionReturn(PETSC_SUCCESS);
3359 }
3360 
3361 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3362 {
3363   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3364 
3365   PetscFunctionBegin;
3366   info->block_size   = a->bs2;
3367   info->nz_allocated = a->bs2 * a->maxnz;
3368   info->nz_used      = a->bs2 * a->nz;
3369   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3370   info->assemblies   = A->num_ass;
3371   info->mallocs      = A->info.mallocs;
3372   info->memory       = 0; /* REVIEW ME */
3373   if (A->factortype) {
3374     info->fill_ratio_given  = A->info.fill_ratio_given;
3375     info->fill_ratio_needed = A->info.fill_ratio_needed;
3376     info->factor_mallocs    = A->info.factor_mallocs;
3377   } else {
3378     info->fill_ratio_given  = 0;
3379     info->fill_ratio_needed = 0;
3380     info->factor_mallocs    = 0;
3381   }
3382   PetscFunctionReturn(PETSC_SUCCESS);
3383 }
3384 
3385 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3386 {
3387   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3388 
3389   PetscFunctionBegin;
3390   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3391   PetscFunctionReturn(PETSC_SUCCESS);
3392 }
3393 
3394 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3395 {
3396   PetscFunctionBegin;
3397   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3398   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3399   PetscFunctionReturn(PETSC_SUCCESS);
3400 }
3401 
3402 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3403 {
3404   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3405   PetscScalar       *z = NULL, sum1;
3406   const PetscScalar *xb;
3407   PetscScalar        x1;
3408   const MatScalar   *v, *vv;
3409   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3410   PetscBool          usecprow = a->compressedrow.use;
3411 
3412   PetscFunctionBegin;
3413   idx = a->j;
3414   v   = a->a;
3415   if (usecprow) {
3416     mbs  = a->compressedrow.nrows;
3417     ii   = a->compressedrow.i;
3418     ridx = a->compressedrow.rindex;
3419   } else {
3420     mbs = a->mbs;
3421     ii  = a->i;
3422     z   = c;
3423   }
3424 
3425   for (i = 0; i < mbs; i++) {
3426     n = ii[1] - ii[0];
3427     ii++;
3428     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3429     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3430     if (usecprow) z = c + ridx[i];
3431     jj = idx;
3432     vv = v;
3433     for (k = 0; k < cn; k++) {
3434       idx  = jj;
3435       v    = vv;
3436       sum1 = 0.0;
3437       for (j = 0; j < n; j++) {
3438         xb = b + (*idx++);
3439         x1 = xb[0 + k * bm];
3440         sum1 += v[0] * x1;
3441         v += 1;
3442       }
3443       z[0 + k * cm] = sum1;
3444     }
3445     if (!usecprow) z += 1;
3446   }
3447   PetscFunctionReturn(PETSC_SUCCESS);
3448 }
3449 
3450 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3451 {
3452   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3453   PetscScalar       *z = NULL, sum1, sum2;
3454   const PetscScalar *xb;
3455   PetscScalar        x1, x2;
3456   const MatScalar   *v, *vv;
3457   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3458   PetscBool          usecprow = a->compressedrow.use;
3459 
3460   PetscFunctionBegin;
3461   idx = a->j;
3462   v   = a->a;
3463   if (usecprow) {
3464     mbs  = a->compressedrow.nrows;
3465     ii   = a->compressedrow.i;
3466     ridx = a->compressedrow.rindex;
3467   } else {
3468     mbs = a->mbs;
3469     ii  = a->i;
3470     z   = c;
3471   }
3472 
3473   for (i = 0; i < mbs; i++) {
3474     n = ii[1] - ii[0];
3475     ii++;
3476     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3477     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3478     if (usecprow) z = c + 2 * ridx[i];
3479     jj = idx;
3480     vv = v;
3481     for (k = 0; k < cn; k++) {
3482       idx  = jj;
3483       v    = vv;
3484       sum1 = 0.0;
3485       sum2 = 0.0;
3486       for (j = 0; j < n; j++) {
3487         xb = b + 2 * (*idx++);
3488         x1 = xb[0 + k * bm];
3489         x2 = xb[1 + k * bm];
3490         sum1 += v[0] * x1 + v[2] * x2;
3491         sum2 += v[1] * x1 + v[3] * x2;
3492         v += 4;
3493       }
3494       z[0 + k * cm] = sum1;
3495       z[1 + k * cm] = sum2;
3496     }
3497     if (!usecprow) z += 2;
3498   }
3499   PetscFunctionReturn(PETSC_SUCCESS);
3500 }
3501 
3502 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3503 {
3504   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3505   PetscScalar       *z = NULL, sum1, sum2, sum3;
3506   const PetscScalar *xb;
3507   PetscScalar        x1, x2, x3;
3508   const MatScalar   *v, *vv;
3509   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3510   PetscBool          usecprow = a->compressedrow.use;
3511 
3512   PetscFunctionBegin;
3513   idx = a->j;
3514   v   = a->a;
3515   if (usecprow) {
3516     mbs  = a->compressedrow.nrows;
3517     ii   = a->compressedrow.i;
3518     ridx = a->compressedrow.rindex;
3519   } else {
3520     mbs = a->mbs;
3521     ii  = a->i;
3522     z   = c;
3523   }
3524 
3525   for (i = 0; i < mbs; i++) {
3526     n = ii[1] - ii[0];
3527     ii++;
3528     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3529     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3530     if (usecprow) z = c + 3 * ridx[i];
3531     jj = idx;
3532     vv = v;
3533     for (k = 0; k < cn; k++) {
3534       idx  = jj;
3535       v    = vv;
3536       sum1 = 0.0;
3537       sum2 = 0.0;
3538       sum3 = 0.0;
3539       for (j = 0; j < n; j++) {
3540         xb = b + 3 * (*idx++);
3541         x1 = xb[0 + k * bm];
3542         x2 = xb[1 + k * bm];
3543         x3 = xb[2 + k * bm];
3544         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3545         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3546         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3547         v += 9;
3548       }
3549       z[0 + k * cm] = sum1;
3550       z[1 + k * cm] = sum2;
3551       z[2 + k * cm] = sum3;
3552     }
3553     if (!usecprow) z += 3;
3554   }
3555   PetscFunctionReturn(PETSC_SUCCESS);
3556 }
3557 
3558 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3559 {
3560   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3561   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3562   const PetscScalar *xb;
3563   PetscScalar        x1, x2, x3, x4;
3564   const MatScalar   *v, *vv;
3565   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3566   PetscBool          usecprow = a->compressedrow.use;
3567 
3568   PetscFunctionBegin;
3569   idx = a->j;
3570   v   = a->a;
3571   if (usecprow) {
3572     mbs  = a->compressedrow.nrows;
3573     ii   = a->compressedrow.i;
3574     ridx = a->compressedrow.rindex;
3575   } else {
3576     mbs = a->mbs;
3577     ii  = a->i;
3578     z   = c;
3579   }
3580 
3581   for (i = 0; i < mbs; i++) {
3582     n = ii[1] - ii[0];
3583     ii++;
3584     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3585     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3586     if (usecprow) z = c + 4 * ridx[i];
3587     jj = idx;
3588     vv = v;
3589     for (k = 0; k < cn; k++) {
3590       idx  = jj;
3591       v    = vv;
3592       sum1 = 0.0;
3593       sum2 = 0.0;
3594       sum3 = 0.0;
3595       sum4 = 0.0;
3596       for (j = 0; j < n; j++) {
3597         xb = b + 4 * (*idx++);
3598         x1 = xb[0 + k * bm];
3599         x2 = xb[1 + k * bm];
3600         x3 = xb[2 + k * bm];
3601         x4 = xb[3 + k * bm];
3602         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3603         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3604         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3605         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3606         v += 16;
3607       }
3608       z[0 + k * cm] = sum1;
3609       z[1 + k * cm] = sum2;
3610       z[2 + k * cm] = sum3;
3611       z[3 + k * cm] = sum4;
3612     }
3613     if (!usecprow) z += 4;
3614   }
3615   PetscFunctionReturn(PETSC_SUCCESS);
3616 }
3617 
3618 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3619 {
3620   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3621   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3622   const PetscScalar *xb;
3623   PetscScalar        x1, x2, x3, x4, x5;
3624   const MatScalar   *v, *vv;
3625   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3626   PetscBool          usecprow = a->compressedrow.use;
3627 
3628   PetscFunctionBegin;
3629   idx = a->j;
3630   v   = a->a;
3631   if (usecprow) {
3632     mbs  = a->compressedrow.nrows;
3633     ii   = a->compressedrow.i;
3634     ridx = a->compressedrow.rindex;
3635   } else {
3636     mbs = a->mbs;
3637     ii  = a->i;
3638     z   = c;
3639   }
3640 
3641   for (i = 0; i < mbs; i++) {
3642     n = ii[1] - ii[0];
3643     ii++;
3644     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3645     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3646     if (usecprow) z = c + 5 * ridx[i];
3647     jj = idx;
3648     vv = v;
3649     for (k = 0; k < cn; k++) {
3650       idx  = jj;
3651       v    = vv;
3652       sum1 = 0.0;
3653       sum2 = 0.0;
3654       sum3 = 0.0;
3655       sum4 = 0.0;
3656       sum5 = 0.0;
3657       for (j = 0; j < n; j++) {
3658         xb = b + 5 * (*idx++);
3659         x1 = xb[0 + k * bm];
3660         x2 = xb[1 + k * bm];
3661         x3 = xb[2 + k * bm];
3662         x4 = xb[3 + k * bm];
3663         x5 = xb[4 + k * bm];
3664         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3665         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3666         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3667         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3668         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3669         v += 25;
3670       }
3671       z[0 + k * cm] = sum1;
3672       z[1 + k * cm] = sum2;
3673       z[2 + k * cm] = sum3;
3674       z[3 + k * cm] = sum4;
3675       z[4 + k * cm] = sum5;
3676     }
3677     if (!usecprow) z += 5;
3678   }
3679   PetscFunctionReturn(PETSC_SUCCESS);
3680 }
3681 
3682 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3683 {
3684   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3685   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3686   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3687   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3688   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3689   PetscBLASInt     bbs, bcn, bbm, bcm;
3690   PetscScalar     *z = NULL;
3691   PetscScalar     *c, *b;
3692   const MatScalar *v;
3693   const PetscInt  *idx, *ii, *ridx = NULL;
3694   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3695   PetscBool        usecprow = a->compressedrow.use;
3696 
3697   PetscFunctionBegin;
3698   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3699   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3700   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3701   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3702   b = bd->v;
3703   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3704   PetscCall(MatDenseGetArray(C, &c));
3705   switch (bs) {
3706   case 1:
3707     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3708     break;
3709   case 2:
3710     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3711     break;
3712   case 3:
3713     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3714     break;
3715   case 4:
3716     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3717     break;
3718   case 5:
3719     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3720     break;
3721   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3722     PetscCall(PetscBLASIntCast(bs, &bbs));
3723     PetscCall(PetscBLASIntCast(cn, &bcn));
3724     PetscCall(PetscBLASIntCast(bm, &bbm));
3725     PetscCall(PetscBLASIntCast(cm, &bcm));
3726     idx = a->j;
3727     v   = a->a;
3728     if (usecprow) {
3729       mbs  = a->compressedrow.nrows;
3730       ii   = a->compressedrow.i;
3731       ridx = a->compressedrow.rindex;
3732     } else {
3733       mbs = a->mbs;
3734       ii  = a->i;
3735       z   = c;
3736     }
3737     for (i = 0; i < mbs; i++) {
3738       n = ii[1] - ii[0];
3739       ii++;
3740       if (usecprow) z = c + bs * ridx[i];
3741       if (n) {
3742         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3743         v += bs2;
3744       }
3745       for (j = 1; j < n; j++) {
3746         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3747         v += bs2;
3748       }
3749       if (!usecprow) z += bs;
3750     }
3751   }
3752   PetscCall(MatDenseRestoreArray(C, &c));
3753   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3754   PetscFunctionReturn(PETSC_SUCCESS);
3755 }
3756