xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 36c8fdc5cf2ffa9413161a64c8ffb18fe1bfee3f)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8   #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
12 {
13   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
14   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
15   const PetscInt *idx;
16   PetscInt        start, end, *ai, *aj, bs;
17   PetscBT         table;
18 
19   PetscFunctionBegin;
20   m  = a->mbs;
21   ai = a->i;
22   aj = a->j;
23   bs = A->rmap->bs;
24 
25   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
26 
27   PetscCall(PetscBTCreate(m, &table));
28   PetscCall(PetscMalloc1(m + 1, &nidx));
29 
30   for (i = 0; i < is_max; i++) {
31     /* Initialise the two local arrays */
32     isz = 0;
33     PetscCall(PetscBTMemzero(m, table));
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     PetscCall(ISGetIndices(is[i], &idx));
37     PetscCall(ISGetLocalSize(is[i], &n));
38 
39     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40     for (j = 0; j < n; ++j) {
41       ival = idx[j] / bs; /* convert the indices into block indices */
42       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
43       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44     }
45     PetscCall(ISRestoreIndices(is[i], &idx));
46     PetscCall(ISDestroy(&is[i]));
47 
48     k = 0;
49     for (j = 0; j < ov; j++) { /* for each overlap*/
50       n = isz;
51       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52         row   = nidx[k];
53         start = ai[row];
54         end   = ai[row + 1];
55         for (l = start; l < end; l++) {
56           val = aj[l];
57           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58         }
59       }
60     }
61     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
62   }
63   PetscCall(PetscBTDestroy(&table));
64   PetscCall(PetscFree(nidx));
65   PetscFunctionReturn(PETSC_SUCCESS);
66 }
67 
68 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
69 {
70   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
71   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
72   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
73   const PetscInt *irow, *icol;
74   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
75   PetscInt       *aj = a->j, *ai = a->i;
76   MatScalar      *mat_a;
77   Mat             C;
78   PetscBool       flag;
79 
80   PetscFunctionBegin;
81   PetscCall(ISGetIndices(isrow, &irow));
82   PetscCall(ISGetIndices(iscol, &icol));
83   PetscCall(ISGetLocalSize(isrow, &nrows));
84   PetscCall(ISGetLocalSize(iscol, &ncols));
85 
86   PetscCall(PetscCalloc1(1 + oldcols, &smap));
87   ssmap = smap;
88   PetscCall(PetscMalloc1(1 + nrows, &lens));
89   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
90   /* determine lens of each row */
91   for (i = 0; i < nrows; i++) {
92     kstart  = ai[irow[i]];
93     kend    = kstart + a->ilen[irow[i]];
94     lens[i] = 0;
95     for (k = kstart; k < kend; k++) {
96       if (ssmap[aj[k]]) lens[i]++;
97     }
98   }
99   /* Create and fill new matrix */
100   if (scall == MAT_REUSE_MATRIX) {
101     c = (Mat_SeqBAIJ *)((*B)->data);
102 
103     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
104     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
105     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
106     PetscCall(PetscArrayzero(c->ilen, c->mbs));
107     C = *B;
108   } else {
109     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
110     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
111     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
112     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
113   }
114   c = (Mat_SeqBAIJ *)(C->data);
115   for (i = 0; i < nrows; i++) {
116     row      = irow[i];
117     kstart   = ai[row];
118     kend     = kstart + a->ilen[row];
119     mat_i    = c->i[i];
120     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
121     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
122     mat_ilen = c->ilen + i;
123     for (k = kstart; k < kend; k++) {
124       if ((tcol = ssmap[a->j[k]])) {
125         *mat_j++ = tcol - 1;
126         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
127         mat_a += bs2;
128         (*mat_ilen)++;
129       }
130     }
131   }
132   /* sort */
133   if (c->j && c->a) {
134     MatScalar *work;
135     PetscCall(PetscMalloc1(bs2, &work));
136     for (i = 0; i < nrows; i++) {
137       PetscInt ilen;
138       mat_i = c->i[i];
139       mat_j = c->j + mat_i;
140       mat_a = c->a + mat_i * bs2;
141       ilen  = c->ilen[i];
142       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
143     }
144     PetscCall(PetscFree(work));
145   }
146 
147   /* Free work space */
148   PetscCall(ISRestoreIndices(iscol, &icol));
149   PetscCall(PetscFree(smap));
150   PetscCall(PetscFree(lens));
151   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
152   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
153 
154   PetscCall(ISRestoreIndices(isrow, &irow));
155   *B = C;
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
160 {
161   IS is1, is2;
162 
163   PetscFunctionBegin;
164   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
165   if (isrow == iscol) {
166     is2 = is1;
167     PetscCall(PetscObjectReference((PetscObject)is2));
168   } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
169   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
170   PetscCall(ISDestroy(&is1));
171   PetscCall(ISDestroy(&is2));
172   PetscFunctionReturn(PETSC_SUCCESS);
173 }
174 
175 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
176 {
177   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
178   Mat_SubSppt *submatj = c->submatis1;
179 
180   PetscFunctionBegin;
181   PetscCall((*submatj->destroy)(C));
182   PetscCall(MatDestroySubMatrix_Private(submatj));
183   PetscFunctionReturn(PETSC_SUCCESS);
184 }
185 
186 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
187 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
188 {
189   PetscInt     i;
190   Mat          C;
191   Mat_SeqBAIJ *c;
192   Mat_SubSppt *submatj;
193 
194   PetscFunctionBegin;
195   for (i = 0; i < n; i++) {
196     C       = (*mat)[i];
197     c       = (Mat_SeqBAIJ *)C->data;
198     submatj = c->submatis1;
199     if (submatj) {
200       if (--((PetscObject)C)->refct <= 0) {
201         PetscCall(PetscFree(C->factorprefix));
202         PetscCall((*submatj->destroy)(C));
203         PetscCall(MatDestroySubMatrix_Private(submatj));
204         PetscCall(PetscFree(C->defaultvectype));
205         PetscCall(PetscFree(C->defaultrandtype));
206         PetscCall(PetscLayoutDestroy(&C->rmap));
207         PetscCall(PetscLayoutDestroy(&C->cmap));
208         PetscCall(PetscHeaderDestroy(&C));
209       }
210     } else {
211       PetscCall(MatDestroy(&C));
212     }
213   }
214 
215   /* Destroy Dummy submatrices created for reuse */
216   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
217 
218   PetscCall(PetscFree(*mat));
219   PetscFunctionReturn(PETSC_SUCCESS);
220 }
221 
222 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
223 {
224   PetscInt i;
225 
226   PetscFunctionBegin;
227   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
228 
229   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
230   PetscFunctionReturn(PETSC_SUCCESS);
231 }
232 
233 /* Should check that shapes of vectors and matrices match */
234 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
235 {
236   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
237   PetscScalar       *z, sum;
238   const PetscScalar *x;
239   const MatScalar   *v;
240   PetscInt           mbs, i, n;
241   const PetscInt    *idx, *ii, *ridx = NULL;
242   PetscBool          usecprow = a->compressedrow.use;
243 
244   PetscFunctionBegin;
245   PetscCall(VecGetArrayRead(xx, &x));
246   PetscCall(VecGetArrayWrite(zz, &z));
247 
248   if (usecprow) {
249     mbs  = a->compressedrow.nrows;
250     ii   = a->compressedrow.i;
251     ridx = a->compressedrow.rindex;
252     PetscCall(PetscArrayzero(z, a->mbs));
253   } else {
254     mbs = a->mbs;
255     ii  = a->i;
256   }
257 
258   for (i = 0; i < mbs; i++) {
259     n   = ii[1] - ii[0];
260     v   = a->a + ii[0];
261     idx = a->j + ii[0];
262     ii++;
263     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
264     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
265     sum = 0.0;
266     PetscSparseDensePlusDot(sum, x, v, idx, n);
267     if (usecprow) {
268       z[ridx[i]] = sum;
269     } else {
270       z[i] = sum;
271     }
272   }
273   PetscCall(VecRestoreArrayRead(xx, &x));
274   PetscCall(VecRestoreArrayWrite(zz, &z));
275   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
276   PetscFunctionReturn(PETSC_SUCCESS);
277 }
278 
279 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
280 {
281   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
282   PetscScalar       *z = NULL, sum1, sum2, *zarray;
283   const PetscScalar *x, *xb;
284   PetscScalar        x1, x2;
285   const MatScalar   *v;
286   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
287   PetscBool          usecprow = a->compressedrow.use;
288 
289   PetscFunctionBegin;
290   PetscCall(VecGetArrayRead(xx, &x));
291   PetscCall(VecGetArrayWrite(zz, &zarray));
292 
293   idx = a->j;
294   v   = a->a;
295   if (usecprow) {
296     mbs  = a->compressedrow.nrows;
297     ii   = a->compressedrow.i;
298     ridx = a->compressedrow.rindex;
299     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
300   } else {
301     mbs = a->mbs;
302     ii  = a->i;
303     z   = zarray;
304   }
305 
306   for (i = 0; i < mbs; i++) {
307     n = ii[1] - ii[0];
308     ii++;
309     sum1 = 0.0;
310     sum2 = 0.0;
311     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
312     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
313     for (j = 0; j < n; j++) {
314       xb = x + 2 * (*idx++);
315       x1 = xb[0];
316       x2 = xb[1];
317       sum1 += v[0] * x1 + v[2] * x2;
318       sum2 += v[1] * x1 + v[3] * x2;
319       v += 4;
320     }
321     if (usecprow) z = zarray + 2 * ridx[i];
322     z[0] = sum1;
323     z[1] = sum2;
324     if (!usecprow) z += 2;
325   }
326   PetscCall(VecRestoreArrayRead(xx, &x));
327   PetscCall(VecRestoreArrayWrite(zz, &zarray));
328   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
329   PetscFunctionReturn(PETSC_SUCCESS);
330 }
331 
332 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
333 {
334   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
335   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
336   const PetscScalar *x, *xb;
337   const MatScalar   *v;
338   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
339   PetscBool          usecprow = a->compressedrow.use;
340 
341 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
342   #pragma disjoint(*v, *z, *xb)
343 #endif
344 
345   PetscFunctionBegin;
346   PetscCall(VecGetArrayRead(xx, &x));
347   PetscCall(VecGetArrayWrite(zz, &zarray));
348 
349   idx = a->j;
350   v   = a->a;
351   if (usecprow) {
352     mbs  = a->compressedrow.nrows;
353     ii   = a->compressedrow.i;
354     ridx = a->compressedrow.rindex;
355     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
356   } else {
357     mbs = a->mbs;
358     ii  = a->i;
359     z   = zarray;
360   }
361 
362   for (i = 0; i < mbs; i++) {
363     n = ii[1] - ii[0];
364     ii++;
365     sum1 = 0.0;
366     sum2 = 0.0;
367     sum3 = 0.0;
368     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
369     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
370     for (j = 0; j < n; j++) {
371       xb = x + 3 * (*idx++);
372       x1 = xb[0];
373       x2 = xb[1];
374       x3 = xb[2];
375 
376       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
377       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
378       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
379       v += 9;
380     }
381     if (usecprow) z = zarray + 3 * ridx[i];
382     z[0] = sum1;
383     z[1] = sum2;
384     z[2] = sum3;
385     if (!usecprow) z += 3;
386   }
387   PetscCall(VecRestoreArrayRead(xx, &x));
388   PetscCall(VecRestoreArrayWrite(zz, &zarray));
389   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
390   PetscFunctionReturn(PETSC_SUCCESS);
391 }
392 
393 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
394 {
395   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
396   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
397   const PetscScalar *x, *xb;
398   const MatScalar   *v;
399   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
400   PetscBool          usecprow = a->compressedrow.use;
401 
402   PetscFunctionBegin;
403   PetscCall(VecGetArrayRead(xx, &x));
404   PetscCall(VecGetArrayWrite(zz, &zarray));
405 
406   idx = a->j;
407   v   = a->a;
408   if (usecprow) {
409     mbs  = a->compressedrow.nrows;
410     ii   = a->compressedrow.i;
411     ridx = a->compressedrow.rindex;
412     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
413   } else {
414     mbs = a->mbs;
415     ii  = a->i;
416     z   = zarray;
417   }
418 
419   for (i = 0; i < mbs; i++) {
420     n = ii[1] - ii[0];
421     ii++;
422     sum1 = 0.0;
423     sum2 = 0.0;
424     sum3 = 0.0;
425     sum4 = 0.0;
426 
427     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
428     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
429     for (j = 0; j < n; j++) {
430       xb = x + 4 * (*idx++);
431       x1 = xb[0];
432       x2 = xb[1];
433       x3 = xb[2];
434       x4 = xb[3];
435       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
436       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
437       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
438       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
439       v += 16;
440     }
441     if (usecprow) z = zarray + 4 * ridx[i];
442     z[0] = sum1;
443     z[1] = sum2;
444     z[2] = sum3;
445     z[3] = sum4;
446     if (!usecprow) z += 4;
447   }
448   PetscCall(VecRestoreArrayRead(xx, &x));
449   PetscCall(VecRestoreArrayWrite(zz, &zarray));
450   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
451   PetscFunctionReturn(PETSC_SUCCESS);
452 }
453 
454 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
455 {
456   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
457   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
458   const PetscScalar *xb, *x;
459   const MatScalar   *v;
460   const PetscInt    *idx, *ii, *ridx = NULL;
461   PetscInt           mbs, i, j, n;
462   PetscBool          usecprow = a->compressedrow.use;
463 
464   PetscFunctionBegin;
465   PetscCall(VecGetArrayRead(xx, &x));
466   PetscCall(VecGetArrayWrite(zz, &zarray));
467 
468   idx = a->j;
469   v   = a->a;
470   if (usecprow) {
471     mbs  = a->compressedrow.nrows;
472     ii   = a->compressedrow.i;
473     ridx = a->compressedrow.rindex;
474     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
475   } else {
476     mbs = a->mbs;
477     ii  = a->i;
478     z   = zarray;
479   }
480 
481   for (i = 0; i < mbs; i++) {
482     n = ii[1] - ii[0];
483     ii++;
484     sum1 = 0.0;
485     sum2 = 0.0;
486     sum3 = 0.0;
487     sum4 = 0.0;
488     sum5 = 0.0;
489     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
490     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
491     for (j = 0; j < n; j++) {
492       xb = x + 5 * (*idx++);
493       x1 = xb[0];
494       x2 = xb[1];
495       x3 = xb[2];
496       x4 = xb[3];
497       x5 = xb[4];
498       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
499       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
500       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
501       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
502       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
503       v += 25;
504     }
505     if (usecprow) z = zarray + 5 * ridx[i];
506     z[0] = sum1;
507     z[1] = sum2;
508     z[2] = sum3;
509     z[3] = sum4;
510     z[4] = sum5;
511     if (!usecprow) z += 5;
512   }
513   PetscCall(VecRestoreArrayRead(xx, &x));
514   PetscCall(VecRestoreArrayWrite(zz, &zarray));
515   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
516   PetscFunctionReturn(PETSC_SUCCESS);
517 }
518 
519 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
520 {
521   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
522   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
523   const PetscScalar *x, *xb;
524   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
525   const MatScalar   *v;
526   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
527   PetscBool          usecprow = a->compressedrow.use;
528 
529   PetscFunctionBegin;
530   PetscCall(VecGetArrayRead(xx, &x));
531   PetscCall(VecGetArrayWrite(zz, &zarray));
532 
533   idx = a->j;
534   v   = a->a;
535   if (usecprow) {
536     mbs  = a->compressedrow.nrows;
537     ii   = a->compressedrow.i;
538     ridx = a->compressedrow.rindex;
539     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
540   } else {
541     mbs = a->mbs;
542     ii  = a->i;
543     z   = zarray;
544   }
545 
546   for (i = 0; i < mbs; i++) {
547     n = ii[1] - ii[0];
548     ii++;
549     sum1 = 0.0;
550     sum2 = 0.0;
551     sum3 = 0.0;
552     sum4 = 0.0;
553     sum5 = 0.0;
554     sum6 = 0.0;
555 
556     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
557     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
558     for (j = 0; j < n; j++) {
559       xb = x + 6 * (*idx++);
560       x1 = xb[0];
561       x2 = xb[1];
562       x3 = xb[2];
563       x4 = xb[3];
564       x5 = xb[4];
565       x6 = xb[5];
566       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
567       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
568       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
569       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
570       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
571       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
572       v += 36;
573     }
574     if (usecprow) z = zarray + 6 * ridx[i];
575     z[0] = sum1;
576     z[1] = sum2;
577     z[2] = sum3;
578     z[3] = sum4;
579     z[4] = sum5;
580     z[5] = sum6;
581     if (!usecprow) z += 6;
582   }
583 
584   PetscCall(VecRestoreArrayRead(xx, &x));
585   PetscCall(VecRestoreArrayWrite(zz, &zarray));
586   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
587   PetscFunctionReturn(PETSC_SUCCESS);
588 }
589 
590 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
591 {
592   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
593   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
594   const PetscScalar *x, *xb;
595   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
596   const MatScalar   *v;
597   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
598   PetscBool          usecprow = a->compressedrow.use;
599 
600   PetscFunctionBegin;
601   PetscCall(VecGetArrayRead(xx, &x));
602   PetscCall(VecGetArrayWrite(zz, &zarray));
603 
604   idx = a->j;
605   v   = a->a;
606   if (usecprow) {
607     mbs  = a->compressedrow.nrows;
608     ii   = a->compressedrow.i;
609     ridx = a->compressedrow.rindex;
610     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
611   } else {
612     mbs = a->mbs;
613     ii  = a->i;
614     z   = zarray;
615   }
616 
617   for (i = 0; i < mbs; i++) {
618     n = ii[1] - ii[0];
619     ii++;
620     sum1 = 0.0;
621     sum2 = 0.0;
622     sum3 = 0.0;
623     sum4 = 0.0;
624     sum5 = 0.0;
625     sum6 = 0.0;
626     sum7 = 0.0;
627 
628     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
629     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
630     for (j = 0; j < n; j++) {
631       xb = x + 7 * (*idx++);
632       x1 = xb[0];
633       x2 = xb[1];
634       x3 = xb[2];
635       x4 = xb[3];
636       x5 = xb[4];
637       x6 = xb[5];
638       x7 = xb[6];
639       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
640       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
641       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
642       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
643       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
644       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
645       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
646       v += 49;
647     }
648     if (usecprow) z = zarray + 7 * ridx[i];
649     z[0] = sum1;
650     z[1] = sum2;
651     z[2] = sum3;
652     z[3] = sum4;
653     z[4] = sum5;
654     z[5] = sum6;
655     z[6] = sum7;
656     if (!usecprow) z += 7;
657   }
658 
659   PetscCall(VecRestoreArrayRead(xx, &x));
660   PetscCall(VecRestoreArrayWrite(zz, &zarray));
661   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
662   PetscFunctionReturn(PETSC_SUCCESS);
663 }
664 
665 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
666 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
667 {
668   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
669   PetscScalar       *z = NULL, *work, *workt, *zarray;
670   const PetscScalar *x, *xb;
671   const MatScalar   *v;
672   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
673   const PetscInt    *idx, *ii, *ridx = NULL;
674   PetscInt           k;
675   PetscBool          usecprow = a->compressedrow.use;
676 
677   __m256d a0, a1, a2, a3, a4, a5;
678   __m256d w0, w1, w2, w3;
679   __m256d z0, z1, z2;
680   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
681 
682   PetscFunctionBegin;
683   PetscCall(VecGetArrayRead(xx, &x));
684   PetscCall(VecGetArrayWrite(zz, &zarray));
685 
686   idx = a->j;
687   v   = a->a;
688   if (usecprow) {
689     mbs  = a->compressedrow.nrows;
690     ii   = a->compressedrow.i;
691     ridx = a->compressedrow.rindex;
692     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
693   } else {
694     mbs = a->mbs;
695     ii  = a->i;
696     z   = zarray;
697   }
698 
699   if (!a->mult_work) {
700     k = PetscMax(A->rmap->n, A->cmap->n);
701     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
702   }
703 
704   work = a->mult_work;
705   for (i = 0; i < mbs; i++) {
706     n = ii[1] - ii[0];
707     ii++;
708     workt = work;
709     for (j = 0; j < n; j++) {
710       xb = x + bs * (*idx++);
711       for (k = 0; k < bs; k++) workt[k] = xb[k];
712       workt += bs;
713     }
714     if (usecprow) z = zarray + bs * ridx[i];
715 
716     z0 = _mm256_setzero_pd();
717     z1 = _mm256_setzero_pd();
718     z2 = _mm256_setzero_pd();
719 
720     for (j = 0; j < n; j++) {
721       /* first column of a */
722       w0 = _mm256_set1_pd(work[j * 9]);
723       a0 = _mm256_loadu_pd(&v[j * 81]);
724       z0 = _mm256_fmadd_pd(a0, w0, z0);
725       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
726       z1 = _mm256_fmadd_pd(a1, w0, z1);
727       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
728       z2 = _mm256_fmadd_pd(a2, w0, z2);
729 
730       /* second column of a */
731       w1 = _mm256_set1_pd(work[j * 9 + 1]);
732       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
733       z0 = _mm256_fmadd_pd(a0, w1, z0);
734       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
735       z1 = _mm256_fmadd_pd(a1, w1, z1);
736       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
737       z2 = _mm256_fmadd_pd(a2, w1, z2);
738 
739       /* third column of a */
740       w2 = _mm256_set1_pd(work[j * 9 + 2]);
741       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
742       z0 = _mm256_fmadd_pd(a3, w2, z0);
743       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
744       z1 = _mm256_fmadd_pd(a4, w2, z1);
745       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
746       z2 = _mm256_fmadd_pd(a5, w2, z2);
747 
748       /* fourth column of a */
749       w3 = _mm256_set1_pd(work[j * 9 + 3]);
750       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
751       z0 = _mm256_fmadd_pd(a0, w3, z0);
752       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
753       z1 = _mm256_fmadd_pd(a1, w3, z1);
754       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
755       z2 = _mm256_fmadd_pd(a2, w3, z2);
756 
757       /* fifth column of a */
758       w0 = _mm256_set1_pd(work[j * 9 + 4]);
759       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
760       z0 = _mm256_fmadd_pd(a3, w0, z0);
761       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
762       z1 = _mm256_fmadd_pd(a4, w0, z1);
763       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
764       z2 = _mm256_fmadd_pd(a5, w0, z2);
765 
766       /* sixth column of a */
767       w1 = _mm256_set1_pd(work[j * 9 + 5]);
768       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
769       z0 = _mm256_fmadd_pd(a0, w1, z0);
770       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
771       z1 = _mm256_fmadd_pd(a1, w1, z1);
772       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
773       z2 = _mm256_fmadd_pd(a2, w1, z2);
774 
775       /* seventh column of a */
776       w2 = _mm256_set1_pd(work[j * 9 + 6]);
777       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
778       z0 = _mm256_fmadd_pd(a0, w2, z0);
779       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
780       z1 = _mm256_fmadd_pd(a1, w2, z1);
781       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
782       z2 = _mm256_fmadd_pd(a2, w2, z2);
783 
784       /* eighth column of a */
785       w3 = _mm256_set1_pd(work[j * 9 + 7]);
786       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
787       z0 = _mm256_fmadd_pd(a3, w3, z0);
788       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
789       z1 = _mm256_fmadd_pd(a4, w3, z1);
790       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
791       z2 = _mm256_fmadd_pd(a5, w3, z2);
792 
793       /* ninth column of a */
794       w0 = _mm256_set1_pd(work[j * 9 + 8]);
795       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
796       z0 = _mm256_fmadd_pd(a0, w0, z0);
797       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
798       z1 = _mm256_fmadd_pd(a1, w0, z1);
799       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
800       z2 = _mm256_fmadd_pd(a2, w0, z2);
801     }
802 
803     _mm256_storeu_pd(&z[0], z0);
804     _mm256_storeu_pd(&z[4], z1);
805     _mm256_maskstore_pd(&z[8], mask1, z2);
806 
807     v += n * bs2;
808     if (!usecprow) z += bs;
809   }
810   PetscCall(VecRestoreArrayRead(xx, &x));
811   PetscCall(VecRestoreArrayWrite(zz, &zarray));
812   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
813   PetscFunctionReturn(PETSC_SUCCESS);
814 }
815 #endif
816 
817 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
818 {
819   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
820   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
821   const PetscScalar *x, *xb;
822   PetscScalar       *zarray, xv;
823   const MatScalar   *v;
824   const PetscInt    *ii, *ij = a->j, *idx;
825   PetscInt           mbs, i, j, k, n, *ridx = NULL;
826   PetscBool          usecprow = a->compressedrow.use;
827 
828   PetscFunctionBegin;
829   PetscCall(VecGetArrayRead(xx, &x));
830   PetscCall(VecGetArrayWrite(zz, &zarray));
831 
832   v = a->a;
833   if (usecprow) {
834     mbs  = a->compressedrow.nrows;
835     ii   = a->compressedrow.i;
836     ridx = a->compressedrow.rindex;
837     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
838   } else {
839     mbs = a->mbs;
840     ii  = a->i;
841     z   = zarray;
842   }
843 
844   for (i = 0; i < mbs; i++) {
845     n     = ii[i + 1] - ii[i];
846     idx   = ij + ii[i];
847     sum1  = 0.0;
848     sum2  = 0.0;
849     sum3  = 0.0;
850     sum4  = 0.0;
851     sum5  = 0.0;
852     sum6  = 0.0;
853     sum7  = 0.0;
854     sum8  = 0.0;
855     sum9  = 0.0;
856     sum10 = 0.0;
857     sum11 = 0.0;
858 
859     for (j = 0; j < n; j++) {
860       xb = x + 11 * (idx[j]);
861 
862       for (k = 0; k < 11; k++) {
863         xv = xb[k];
864         sum1 += v[0] * xv;
865         sum2 += v[1] * xv;
866         sum3 += v[2] * xv;
867         sum4 += v[3] * xv;
868         sum5 += v[4] * xv;
869         sum6 += v[5] * xv;
870         sum7 += v[6] * xv;
871         sum8 += v[7] * xv;
872         sum9 += v[8] * xv;
873         sum10 += v[9] * xv;
874         sum11 += v[10] * xv;
875         v += 11;
876       }
877     }
878     if (usecprow) z = zarray + 11 * ridx[i];
879     z[0]  = sum1;
880     z[1]  = sum2;
881     z[2]  = sum3;
882     z[3]  = sum4;
883     z[4]  = sum5;
884     z[5]  = sum6;
885     z[6]  = sum7;
886     z[7]  = sum8;
887     z[8]  = sum9;
888     z[9]  = sum10;
889     z[10] = sum11;
890 
891     if (!usecprow) z += 11;
892   }
893 
894   PetscCall(VecRestoreArrayRead(xx, &x));
895   PetscCall(VecRestoreArrayWrite(zz, &zarray));
896   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
897   PetscFunctionReturn(PETSC_SUCCESS);
898 }
899 
900 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
901 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
902 {
903   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
904   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
905   const PetscScalar *x, *xb;
906   PetscScalar       *zarray, xv;
907   const MatScalar   *v;
908   const PetscInt    *ii, *ij = a->j, *idx;
909   PetscInt           mbs, i, j, k, n, *ridx = NULL;
910   PetscBool          usecprow = a->compressedrow.use;
911 
912   PetscFunctionBegin;
913   PetscCall(VecGetArrayRead(xx, &x));
914   PetscCall(VecGetArrayWrite(zz, &zarray));
915 
916   v = a->a;
917   if (usecprow) {
918     mbs  = a->compressedrow.nrows;
919     ii   = a->compressedrow.i;
920     ridx = a->compressedrow.rindex;
921     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
922   } else {
923     mbs = a->mbs;
924     ii  = a->i;
925     z   = zarray;
926   }
927 
928   for (i = 0; i < mbs; i++) {
929     n     = ii[i + 1] - ii[i];
930     idx   = ij + ii[i];
931     sum1  = 0.0;
932     sum2  = 0.0;
933     sum3  = 0.0;
934     sum4  = 0.0;
935     sum5  = 0.0;
936     sum6  = 0.0;
937     sum7  = 0.0;
938     sum8  = 0.0;
939     sum9  = 0.0;
940     sum10 = 0.0;
941     sum11 = 0.0;
942     sum12 = 0.0;
943 
944     for (j = 0; j < n; j++) {
945       xb = x + 12 * (idx[j]);
946 
947       for (k = 0; k < 12; k++) {
948         xv = xb[k];
949         sum1 += v[0] * xv;
950         sum2 += v[1] * xv;
951         sum3 += v[2] * xv;
952         sum4 += v[3] * xv;
953         sum5 += v[4] * xv;
954         sum6 += v[5] * xv;
955         sum7 += v[6] * xv;
956         sum8 += v[7] * xv;
957         sum9 += v[8] * xv;
958         sum10 += v[9] * xv;
959         sum11 += v[10] * xv;
960         sum12 += v[11] * xv;
961         v += 12;
962       }
963     }
964     if (usecprow) z = zarray + 12 * ridx[i];
965     z[0]  = sum1;
966     z[1]  = sum2;
967     z[2]  = sum3;
968     z[3]  = sum4;
969     z[4]  = sum5;
970     z[5]  = sum6;
971     z[6]  = sum7;
972     z[7]  = sum8;
973     z[8]  = sum9;
974     z[9]  = sum10;
975     z[10] = sum11;
976     z[11] = sum12;
977     if (!usecprow) z += 12;
978   }
979   PetscCall(VecRestoreArrayRead(xx, &x));
980   PetscCall(VecRestoreArrayWrite(zz, &zarray));
981   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
982   PetscFunctionReturn(PETSC_SUCCESS);
983 }
984 
985 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
986 {
987   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
988   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
989   const PetscScalar *x, *xb;
990   PetscScalar       *zarray, *yarray, xv;
991   const MatScalar   *v;
992   const PetscInt    *ii, *ij = a->j, *idx;
993   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
994   PetscBool          usecprow = a->compressedrow.use;
995 
996   PetscFunctionBegin;
997   PetscCall(VecGetArrayRead(xx, &x));
998   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
999 
1000   v = a->a;
1001   if (usecprow) {
1002     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1003     mbs  = a->compressedrow.nrows;
1004     ii   = a->compressedrow.i;
1005     ridx = a->compressedrow.rindex;
1006   } else {
1007     ii = a->i;
1008     y  = yarray;
1009     z  = zarray;
1010   }
1011 
1012   for (i = 0; i < mbs; i++) {
1013     n   = ii[i + 1] - ii[i];
1014     idx = ij + ii[i];
1015 
1016     if (usecprow) {
1017       y = yarray + 12 * ridx[i];
1018       z = zarray + 12 * ridx[i];
1019     }
1020     sum1  = y[0];
1021     sum2  = y[1];
1022     sum3  = y[2];
1023     sum4  = y[3];
1024     sum5  = y[4];
1025     sum6  = y[5];
1026     sum7  = y[6];
1027     sum8  = y[7];
1028     sum9  = y[8];
1029     sum10 = y[9];
1030     sum11 = y[10];
1031     sum12 = y[11];
1032 
1033     for (j = 0; j < n; j++) {
1034       xb = x + 12 * (idx[j]);
1035 
1036       for (k = 0; k < 12; k++) {
1037         xv = xb[k];
1038         sum1 += v[0] * xv;
1039         sum2 += v[1] * xv;
1040         sum3 += v[2] * xv;
1041         sum4 += v[3] * xv;
1042         sum5 += v[4] * xv;
1043         sum6 += v[5] * xv;
1044         sum7 += v[6] * xv;
1045         sum8 += v[7] * xv;
1046         sum9 += v[8] * xv;
1047         sum10 += v[9] * xv;
1048         sum11 += v[10] * xv;
1049         sum12 += v[11] * xv;
1050         v += 12;
1051       }
1052     }
1053 
1054     z[0]  = sum1;
1055     z[1]  = sum2;
1056     z[2]  = sum3;
1057     z[3]  = sum4;
1058     z[4]  = sum5;
1059     z[5]  = sum6;
1060     z[6]  = sum7;
1061     z[7]  = sum8;
1062     z[8]  = sum9;
1063     z[9]  = sum10;
1064     z[10] = sum11;
1065     z[11] = sum12;
1066     if (!usecprow) {
1067       y += 12;
1068       z += 12;
1069     }
1070   }
1071   PetscCall(VecRestoreArrayRead(xx, &x));
1072   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1073   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1074   PetscFunctionReturn(PETSC_SUCCESS);
1075 }
1076 
1077 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1078 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1079 {
1080   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1081   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1082   const PetscScalar *x, *xb;
1083   PetscScalar        x1, x2, x3, x4, *zarray;
1084   const MatScalar   *v;
1085   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1086   PetscInt           mbs, i, j, n;
1087   PetscBool          usecprow = a->compressedrow.use;
1088 
1089   PetscFunctionBegin;
1090   PetscCall(VecGetArrayRead(xx, &x));
1091   PetscCall(VecGetArrayWrite(zz, &zarray));
1092 
1093   v = a->a;
1094   if (usecprow) {
1095     mbs  = a->compressedrow.nrows;
1096     ii   = a->compressedrow.i;
1097     ridx = a->compressedrow.rindex;
1098     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1099   } else {
1100     mbs = a->mbs;
1101     ii  = a->i;
1102     z   = zarray;
1103   }
1104 
1105   for (i = 0; i < mbs; i++) {
1106     n   = ii[i + 1] - ii[i];
1107     idx = ij + ii[i];
1108 
1109     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1110     for (j = 0; j < n; j++) {
1111       xb = x + 12 * (idx[j]);
1112       x1 = xb[0];
1113       x2 = xb[1];
1114       x3 = xb[2];
1115       x4 = xb[3];
1116 
1117       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1118       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1119       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1120       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1121       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1122       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1123       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1124       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1125       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1126       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1127       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1128       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1129       v += 48;
1130 
1131       x1 = xb[4];
1132       x2 = xb[5];
1133       x3 = xb[6];
1134       x4 = xb[7];
1135 
1136       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1137       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1138       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1139       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1140       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1141       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1142       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1143       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1144       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1145       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1146       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1147       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1148       v += 48;
1149 
1150       x1 = xb[8];
1151       x2 = xb[9];
1152       x3 = xb[10];
1153       x4 = xb[11];
1154       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1155       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1156       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1157       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1158       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1159       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1160       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1161       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1162       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1163       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1164       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1165       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1166       v += 48;
1167     }
1168     if (usecprow) z = zarray + 12 * ridx[i];
1169     z[0]  = sum1;
1170     z[1]  = sum2;
1171     z[2]  = sum3;
1172     z[3]  = sum4;
1173     z[4]  = sum5;
1174     z[5]  = sum6;
1175     z[6]  = sum7;
1176     z[7]  = sum8;
1177     z[8]  = sum9;
1178     z[9]  = sum10;
1179     z[10] = sum11;
1180     z[11] = sum12;
1181     if (!usecprow) z += 12;
1182   }
1183   PetscCall(VecRestoreArrayRead(xx, &x));
1184   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1185   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1186   PetscFunctionReturn(PETSC_SUCCESS);
1187 }
1188 
1189 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1190 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1191 {
1192   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1193   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1194   const PetscScalar *x, *xb;
1195   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1196   const MatScalar   *v;
1197   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1198   PetscInt           mbs      = a->mbs, i, j, n;
1199   PetscBool          usecprow = a->compressedrow.use;
1200 
1201   PetscFunctionBegin;
1202   PetscCall(VecGetArrayRead(xx, &x));
1203   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1204 
1205   v = a->a;
1206   if (usecprow) {
1207     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1208     mbs  = a->compressedrow.nrows;
1209     ii   = a->compressedrow.i;
1210     ridx = a->compressedrow.rindex;
1211   } else {
1212     ii = a->i;
1213     y  = yarray;
1214     z  = zarray;
1215   }
1216 
1217   for (i = 0; i < mbs; i++) {
1218     n   = ii[i + 1] - ii[i];
1219     idx = ij + ii[i];
1220 
1221     if (usecprow) {
1222       y = yarray + 12 * ridx[i];
1223       z = zarray + 12 * ridx[i];
1224     }
1225     sum1  = y[0];
1226     sum2  = y[1];
1227     sum3  = y[2];
1228     sum4  = y[3];
1229     sum5  = y[4];
1230     sum6  = y[5];
1231     sum7  = y[6];
1232     sum8  = y[7];
1233     sum9  = y[8];
1234     sum10 = y[9];
1235     sum11 = y[10];
1236     sum12 = y[11];
1237 
1238     for (j = 0; j < n; j++) {
1239       xb = x + 12 * (idx[j]);
1240       x1 = xb[0];
1241       x2 = xb[1];
1242       x3 = xb[2];
1243       x4 = xb[3];
1244 
1245       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1246       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1247       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1248       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1249       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1250       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1251       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1252       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1253       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1254       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1255       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1256       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1257       v += 48;
1258 
1259       x1 = xb[4];
1260       x2 = xb[5];
1261       x3 = xb[6];
1262       x4 = xb[7];
1263 
1264       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1265       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1266       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1267       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1268       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1269       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1270       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1271       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1272       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1273       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1274       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1275       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1276       v += 48;
1277 
1278       x1 = xb[8];
1279       x2 = xb[9];
1280       x3 = xb[10];
1281       x4 = xb[11];
1282       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1283       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1284       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1285       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1286       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1287       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1288       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1289       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1290       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1291       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1292       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1293       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1294       v += 48;
1295     }
1296     z[0]  = sum1;
1297     z[1]  = sum2;
1298     z[2]  = sum3;
1299     z[3]  = sum4;
1300     z[4]  = sum5;
1301     z[5]  = sum6;
1302     z[6]  = sum7;
1303     z[7]  = sum8;
1304     z[8]  = sum9;
1305     z[9]  = sum10;
1306     z[10] = sum11;
1307     z[11] = sum12;
1308     if (!usecprow) {
1309       y += 12;
1310       z += 12;
1311     }
1312   }
1313   PetscCall(VecRestoreArrayRead(xx, &x));
1314   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1315   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1316   PetscFunctionReturn(PETSC_SUCCESS);
1317 }
1318 
1319 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1320 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1321 {
1322   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1323   PetscScalar       *z = NULL, *zarray;
1324   const PetscScalar *x, *work;
1325   const MatScalar   *v = a->a;
1326   PetscInt           mbs, i, j, n;
1327   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1328   PetscBool          usecprow = a->compressedrow.use;
1329   const PetscInt     bs = 12, bs2 = 144;
1330 
1331   __m256d a0, a1, a2, a3, a4, a5;
1332   __m256d w0, w1, w2, w3;
1333   __m256d z0, z1, z2;
1334 
1335   PetscFunctionBegin;
1336   PetscCall(VecGetArrayRead(xx, &x));
1337   PetscCall(VecGetArrayWrite(zz, &zarray));
1338 
1339   if (usecprow) {
1340     mbs  = a->compressedrow.nrows;
1341     ii   = a->compressedrow.i;
1342     ridx = a->compressedrow.rindex;
1343     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1344   } else {
1345     mbs = a->mbs;
1346     ii  = a->i;
1347     z   = zarray;
1348   }
1349 
1350   for (i = 0; i < mbs; i++) {
1351     z0 = _mm256_setzero_pd();
1352     z1 = _mm256_setzero_pd();
1353     z2 = _mm256_setzero_pd();
1354 
1355     n = ii[1] - ii[0];
1356     ii++;
1357     for (j = 0; j < n; j++) {
1358       work = x + bs * (*idx++);
1359 
1360       /* first column of a */
1361       w0 = _mm256_set1_pd(work[0]);
1362       a0 = _mm256_loadu_pd(v + 0);
1363       z0 = _mm256_fmadd_pd(a0, w0, z0);
1364       a1 = _mm256_loadu_pd(v + 4);
1365       z1 = _mm256_fmadd_pd(a1, w0, z1);
1366       a2 = _mm256_loadu_pd(v + 8);
1367       z2 = _mm256_fmadd_pd(a2, w0, z2);
1368 
1369       /* second column of a */
1370       w1 = _mm256_set1_pd(work[1]);
1371       a3 = _mm256_loadu_pd(v + 12);
1372       z0 = _mm256_fmadd_pd(a3, w1, z0);
1373       a4 = _mm256_loadu_pd(v + 16);
1374       z1 = _mm256_fmadd_pd(a4, w1, z1);
1375       a5 = _mm256_loadu_pd(v + 20);
1376       z2 = _mm256_fmadd_pd(a5, w1, z2);
1377 
1378       /* third column of a */
1379       w2 = _mm256_set1_pd(work[2]);
1380       a0 = _mm256_loadu_pd(v + 24);
1381       z0 = _mm256_fmadd_pd(a0, w2, z0);
1382       a1 = _mm256_loadu_pd(v + 28);
1383       z1 = _mm256_fmadd_pd(a1, w2, z1);
1384       a2 = _mm256_loadu_pd(v + 32);
1385       z2 = _mm256_fmadd_pd(a2, w2, z2);
1386 
1387       /* fourth column of a */
1388       w3 = _mm256_set1_pd(work[3]);
1389       a3 = _mm256_loadu_pd(v + 36);
1390       z0 = _mm256_fmadd_pd(a3, w3, z0);
1391       a4 = _mm256_loadu_pd(v + 40);
1392       z1 = _mm256_fmadd_pd(a4, w3, z1);
1393       a5 = _mm256_loadu_pd(v + 44);
1394       z2 = _mm256_fmadd_pd(a5, w3, z2);
1395 
1396       /* fifth column of a */
1397       w0 = _mm256_set1_pd(work[4]);
1398       a0 = _mm256_loadu_pd(v + 48);
1399       z0 = _mm256_fmadd_pd(a0, w0, z0);
1400       a1 = _mm256_loadu_pd(v + 52);
1401       z1 = _mm256_fmadd_pd(a1, w0, z1);
1402       a2 = _mm256_loadu_pd(v + 56);
1403       z2 = _mm256_fmadd_pd(a2, w0, z2);
1404 
1405       /* sixth column of a */
1406       w1 = _mm256_set1_pd(work[5]);
1407       a3 = _mm256_loadu_pd(v + 60);
1408       z0 = _mm256_fmadd_pd(a3, w1, z0);
1409       a4 = _mm256_loadu_pd(v + 64);
1410       z1 = _mm256_fmadd_pd(a4, w1, z1);
1411       a5 = _mm256_loadu_pd(v + 68);
1412       z2 = _mm256_fmadd_pd(a5, w1, z2);
1413 
1414       /* seventh column of a */
1415       w2 = _mm256_set1_pd(work[6]);
1416       a0 = _mm256_loadu_pd(v + 72);
1417       z0 = _mm256_fmadd_pd(a0, w2, z0);
1418       a1 = _mm256_loadu_pd(v + 76);
1419       z1 = _mm256_fmadd_pd(a1, w2, z1);
1420       a2 = _mm256_loadu_pd(v + 80);
1421       z2 = _mm256_fmadd_pd(a2, w2, z2);
1422 
1423       /* eighth column of a */
1424       w3 = _mm256_set1_pd(work[7]);
1425       a3 = _mm256_loadu_pd(v + 84);
1426       z0 = _mm256_fmadd_pd(a3, w3, z0);
1427       a4 = _mm256_loadu_pd(v + 88);
1428       z1 = _mm256_fmadd_pd(a4, w3, z1);
1429       a5 = _mm256_loadu_pd(v + 92);
1430       z2 = _mm256_fmadd_pd(a5, w3, z2);
1431 
1432       /* ninth column of a */
1433       w0 = _mm256_set1_pd(work[8]);
1434       a0 = _mm256_loadu_pd(v + 96);
1435       z0 = _mm256_fmadd_pd(a0, w0, z0);
1436       a1 = _mm256_loadu_pd(v + 100);
1437       z1 = _mm256_fmadd_pd(a1, w0, z1);
1438       a2 = _mm256_loadu_pd(v + 104);
1439       z2 = _mm256_fmadd_pd(a2, w0, z2);
1440 
1441       /* tenth column of a */
1442       w1 = _mm256_set1_pd(work[9]);
1443       a3 = _mm256_loadu_pd(v + 108);
1444       z0 = _mm256_fmadd_pd(a3, w1, z0);
1445       a4 = _mm256_loadu_pd(v + 112);
1446       z1 = _mm256_fmadd_pd(a4, w1, z1);
1447       a5 = _mm256_loadu_pd(v + 116);
1448       z2 = _mm256_fmadd_pd(a5, w1, z2);
1449 
1450       /* eleventh column of a */
1451       w2 = _mm256_set1_pd(work[10]);
1452       a0 = _mm256_loadu_pd(v + 120);
1453       z0 = _mm256_fmadd_pd(a0, w2, z0);
1454       a1 = _mm256_loadu_pd(v + 124);
1455       z1 = _mm256_fmadd_pd(a1, w2, z1);
1456       a2 = _mm256_loadu_pd(v + 128);
1457       z2 = _mm256_fmadd_pd(a2, w2, z2);
1458 
1459       /* twelveth column of a */
1460       w3 = _mm256_set1_pd(work[11]);
1461       a3 = _mm256_loadu_pd(v + 132);
1462       z0 = _mm256_fmadd_pd(a3, w3, z0);
1463       a4 = _mm256_loadu_pd(v + 136);
1464       z1 = _mm256_fmadd_pd(a4, w3, z1);
1465       a5 = _mm256_loadu_pd(v + 140);
1466       z2 = _mm256_fmadd_pd(a5, w3, z2);
1467 
1468       v += bs2;
1469     }
1470     if (usecprow) z = zarray + bs * ridx[i];
1471     _mm256_storeu_pd(&z[0], z0);
1472     _mm256_storeu_pd(&z[4], z1);
1473     _mm256_storeu_pd(&z[8], z2);
1474     if (!usecprow) z += bs;
1475   }
1476   PetscCall(VecRestoreArrayRead(xx, &x));
1477   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1478   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1479   PetscFunctionReturn(PETSC_SUCCESS);
1480 }
1481 #endif
1482 
1483 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1484 /* Default MatMult for block size 15 */
1485 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1486 {
1487   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1488   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1489   const PetscScalar *x, *xb;
1490   PetscScalar       *zarray, xv;
1491   const MatScalar   *v;
1492   const PetscInt    *ii, *ij = a->j, *idx;
1493   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1494   PetscBool          usecprow = a->compressedrow.use;
1495 
1496   PetscFunctionBegin;
1497   PetscCall(VecGetArrayRead(xx, &x));
1498   PetscCall(VecGetArrayWrite(zz, &zarray));
1499 
1500   v = a->a;
1501   if (usecprow) {
1502     mbs  = a->compressedrow.nrows;
1503     ii   = a->compressedrow.i;
1504     ridx = a->compressedrow.rindex;
1505     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1506   } else {
1507     mbs = a->mbs;
1508     ii  = a->i;
1509     z   = zarray;
1510   }
1511 
1512   for (i = 0; i < mbs; i++) {
1513     n     = ii[i + 1] - ii[i];
1514     idx   = ij + ii[i];
1515     sum1  = 0.0;
1516     sum2  = 0.0;
1517     sum3  = 0.0;
1518     sum4  = 0.0;
1519     sum5  = 0.0;
1520     sum6  = 0.0;
1521     sum7  = 0.0;
1522     sum8  = 0.0;
1523     sum9  = 0.0;
1524     sum10 = 0.0;
1525     sum11 = 0.0;
1526     sum12 = 0.0;
1527     sum13 = 0.0;
1528     sum14 = 0.0;
1529     sum15 = 0.0;
1530 
1531     for (j = 0; j < n; j++) {
1532       xb = x + 15 * (idx[j]);
1533 
1534       for (k = 0; k < 15; k++) {
1535         xv = xb[k];
1536         sum1 += v[0] * xv;
1537         sum2 += v[1] * xv;
1538         sum3 += v[2] * xv;
1539         sum4 += v[3] * xv;
1540         sum5 += v[4] * xv;
1541         sum6 += v[5] * xv;
1542         sum7 += v[6] * xv;
1543         sum8 += v[7] * xv;
1544         sum9 += v[8] * xv;
1545         sum10 += v[9] * xv;
1546         sum11 += v[10] * xv;
1547         sum12 += v[11] * xv;
1548         sum13 += v[12] * xv;
1549         sum14 += v[13] * xv;
1550         sum15 += v[14] * xv;
1551         v += 15;
1552       }
1553     }
1554     if (usecprow) z = zarray + 15 * ridx[i];
1555     z[0]  = sum1;
1556     z[1]  = sum2;
1557     z[2]  = sum3;
1558     z[3]  = sum4;
1559     z[4]  = sum5;
1560     z[5]  = sum6;
1561     z[6]  = sum7;
1562     z[7]  = sum8;
1563     z[8]  = sum9;
1564     z[9]  = sum10;
1565     z[10] = sum11;
1566     z[11] = sum12;
1567     z[12] = sum13;
1568     z[13] = sum14;
1569     z[14] = sum15;
1570 
1571     if (!usecprow) z += 15;
1572   }
1573 
1574   PetscCall(VecRestoreArrayRead(xx, &x));
1575   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1576   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1577   PetscFunctionReturn(PETSC_SUCCESS);
1578 }
1579 
1580 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1581 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1582 {
1583   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1584   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1585   const PetscScalar *x, *xb;
1586   PetscScalar        x1, x2, x3, x4, *zarray;
1587   const MatScalar   *v;
1588   const PetscInt    *ii, *ij = a->j, *idx;
1589   PetscInt           mbs, i, j, n, *ridx = NULL;
1590   PetscBool          usecprow = a->compressedrow.use;
1591 
1592   PetscFunctionBegin;
1593   PetscCall(VecGetArrayRead(xx, &x));
1594   PetscCall(VecGetArrayWrite(zz, &zarray));
1595 
1596   v = a->a;
1597   if (usecprow) {
1598     mbs  = a->compressedrow.nrows;
1599     ii   = a->compressedrow.i;
1600     ridx = a->compressedrow.rindex;
1601     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1602   } else {
1603     mbs = a->mbs;
1604     ii  = a->i;
1605     z   = zarray;
1606   }
1607 
1608   for (i = 0; i < mbs; i++) {
1609     n     = ii[i + 1] - ii[i];
1610     idx   = ij + ii[i];
1611     sum1  = 0.0;
1612     sum2  = 0.0;
1613     sum3  = 0.0;
1614     sum4  = 0.0;
1615     sum5  = 0.0;
1616     sum6  = 0.0;
1617     sum7  = 0.0;
1618     sum8  = 0.0;
1619     sum9  = 0.0;
1620     sum10 = 0.0;
1621     sum11 = 0.0;
1622     sum12 = 0.0;
1623     sum13 = 0.0;
1624     sum14 = 0.0;
1625     sum15 = 0.0;
1626 
1627     for (j = 0; j < n; j++) {
1628       xb = x + 15 * (idx[j]);
1629       x1 = xb[0];
1630       x2 = xb[1];
1631       x3 = xb[2];
1632       x4 = xb[3];
1633 
1634       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1635       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1636       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1637       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1638       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1639       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1640       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1641       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1642       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1643       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1644       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1645       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1646       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1647       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1648       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1649 
1650       v += 60;
1651 
1652       x1 = xb[4];
1653       x2 = xb[5];
1654       x3 = xb[6];
1655       x4 = xb[7];
1656 
1657       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1658       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1659       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1660       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1661       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1662       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1663       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1664       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1665       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1666       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1667       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1668       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1669       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1670       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1671       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1672       v += 60;
1673 
1674       x1 = xb[8];
1675       x2 = xb[9];
1676       x3 = xb[10];
1677       x4 = xb[11];
1678       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1679       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1680       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1681       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1682       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1683       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1684       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1685       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1686       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1687       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1688       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1689       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1690       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1691       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1692       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1693       v += 60;
1694 
1695       x1 = xb[12];
1696       x2 = xb[13];
1697       x3 = xb[14];
1698       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1699       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1700       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1701       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1702       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1703       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1704       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1705       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1706       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1707       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1708       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1709       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1710       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1711       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1712       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1713       v += 45;
1714     }
1715     if (usecprow) z = zarray + 15 * ridx[i];
1716     z[0]  = sum1;
1717     z[1]  = sum2;
1718     z[2]  = sum3;
1719     z[3]  = sum4;
1720     z[4]  = sum5;
1721     z[5]  = sum6;
1722     z[6]  = sum7;
1723     z[7]  = sum8;
1724     z[8]  = sum9;
1725     z[9]  = sum10;
1726     z[10] = sum11;
1727     z[11] = sum12;
1728     z[12] = sum13;
1729     z[13] = sum14;
1730     z[14] = sum15;
1731 
1732     if (!usecprow) z += 15;
1733   }
1734 
1735   PetscCall(VecRestoreArrayRead(xx, &x));
1736   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1737   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1738   PetscFunctionReturn(PETSC_SUCCESS);
1739 }
1740 
1741 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1742 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1743 {
1744   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1745   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1746   const PetscScalar *x, *xb;
1747   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1748   const MatScalar   *v;
1749   const PetscInt    *ii, *ij = a->j, *idx;
1750   PetscInt           mbs, i, j, n, *ridx = NULL;
1751   PetscBool          usecprow = a->compressedrow.use;
1752 
1753   PetscFunctionBegin;
1754   PetscCall(VecGetArrayRead(xx, &x));
1755   PetscCall(VecGetArrayWrite(zz, &zarray));
1756 
1757   v = a->a;
1758   if (usecprow) {
1759     mbs  = a->compressedrow.nrows;
1760     ii   = a->compressedrow.i;
1761     ridx = a->compressedrow.rindex;
1762     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1763   } else {
1764     mbs = a->mbs;
1765     ii  = a->i;
1766     z   = zarray;
1767   }
1768 
1769   for (i = 0; i < mbs; i++) {
1770     n     = ii[i + 1] - ii[i];
1771     idx   = ij + ii[i];
1772     sum1  = 0.0;
1773     sum2  = 0.0;
1774     sum3  = 0.0;
1775     sum4  = 0.0;
1776     sum5  = 0.0;
1777     sum6  = 0.0;
1778     sum7  = 0.0;
1779     sum8  = 0.0;
1780     sum9  = 0.0;
1781     sum10 = 0.0;
1782     sum11 = 0.0;
1783     sum12 = 0.0;
1784     sum13 = 0.0;
1785     sum14 = 0.0;
1786     sum15 = 0.0;
1787 
1788     for (j = 0; j < n; j++) {
1789       xb = x + 15 * (idx[j]);
1790       x1 = xb[0];
1791       x2 = xb[1];
1792       x3 = xb[2];
1793       x4 = xb[3];
1794       x5 = xb[4];
1795       x6 = xb[5];
1796       x7 = xb[6];
1797       x8 = xb[7];
1798 
1799       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1800       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1801       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1802       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1803       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1804       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1805       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1806       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1807       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1808       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1809       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1810       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1811       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1812       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1813       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1814       v += 120;
1815 
1816       x1 = xb[8];
1817       x2 = xb[9];
1818       x3 = xb[10];
1819       x4 = xb[11];
1820       x5 = xb[12];
1821       x6 = xb[13];
1822       x7 = xb[14];
1823 
1824       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1825       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1826       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1827       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1828       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1829       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1830       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1831       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1832       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1833       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1834       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1835       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1836       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1837       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1838       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1839       v += 105;
1840     }
1841     if (usecprow) z = zarray + 15 * ridx[i];
1842     z[0]  = sum1;
1843     z[1]  = sum2;
1844     z[2]  = sum3;
1845     z[3]  = sum4;
1846     z[4]  = sum5;
1847     z[5]  = sum6;
1848     z[6]  = sum7;
1849     z[7]  = sum8;
1850     z[8]  = sum9;
1851     z[9]  = sum10;
1852     z[10] = sum11;
1853     z[11] = sum12;
1854     z[12] = sum13;
1855     z[13] = sum14;
1856     z[14] = sum15;
1857 
1858     if (!usecprow) z += 15;
1859   }
1860 
1861   PetscCall(VecRestoreArrayRead(xx, &x));
1862   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1863   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1864   PetscFunctionReturn(PETSC_SUCCESS);
1865 }
1866 
1867 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1868 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1869 {
1870   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1871   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1872   const PetscScalar *x, *xb;
1873   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1874   const MatScalar   *v;
1875   const PetscInt    *ii, *ij = a->j, *idx;
1876   PetscInt           mbs, i, j, n, *ridx = NULL;
1877   PetscBool          usecprow = a->compressedrow.use;
1878 
1879   PetscFunctionBegin;
1880   PetscCall(VecGetArrayRead(xx, &x));
1881   PetscCall(VecGetArrayWrite(zz, &zarray));
1882 
1883   v = a->a;
1884   if (usecprow) {
1885     mbs  = a->compressedrow.nrows;
1886     ii   = a->compressedrow.i;
1887     ridx = a->compressedrow.rindex;
1888     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1889   } else {
1890     mbs = a->mbs;
1891     ii  = a->i;
1892     z   = zarray;
1893   }
1894 
1895   for (i = 0; i < mbs; i++) {
1896     n     = ii[i + 1] - ii[i];
1897     idx   = ij + ii[i];
1898     sum1  = 0.0;
1899     sum2  = 0.0;
1900     sum3  = 0.0;
1901     sum4  = 0.0;
1902     sum5  = 0.0;
1903     sum6  = 0.0;
1904     sum7  = 0.0;
1905     sum8  = 0.0;
1906     sum9  = 0.0;
1907     sum10 = 0.0;
1908     sum11 = 0.0;
1909     sum12 = 0.0;
1910     sum13 = 0.0;
1911     sum14 = 0.0;
1912     sum15 = 0.0;
1913 
1914     for (j = 0; j < n; j++) {
1915       xb  = x + 15 * (idx[j]);
1916       x1  = xb[0];
1917       x2  = xb[1];
1918       x3  = xb[2];
1919       x4  = xb[3];
1920       x5  = xb[4];
1921       x6  = xb[5];
1922       x7  = xb[6];
1923       x8  = xb[7];
1924       x9  = xb[8];
1925       x10 = xb[9];
1926       x11 = xb[10];
1927       x12 = xb[11];
1928       x13 = xb[12];
1929       x14 = xb[13];
1930       x15 = xb[14];
1931 
1932       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1933       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1934       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1935       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1936       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1937       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1938       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1939       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1940       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1941       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1942       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1943       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1944       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1945       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1946       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1947       v += 225;
1948     }
1949     if (usecprow) z = zarray + 15 * ridx[i];
1950     z[0]  = sum1;
1951     z[1]  = sum2;
1952     z[2]  = sum3;
1953     z[3]  = sum4;
1954     z[4]  = sum5;
1955     z[5]  = sum6;
1956     z[6]  = sum7;
1957     z[7]  = sum8;
1958     z[8]  = sum9;
1959     z[9]  = sum10;
1960     z[10] = sum11;
1961     z[11] = sum12;
1962     z[12] = sum13;
1963     z[13] = sum14;
1964     z[14] = sum15;
1965 
1966     if (!usecprow) z += 15;
1967   }
1968 
1969   PetscCall(VecRestoreArrayRead(xx, &x));
1970   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1971   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1972   PetscFunctionReturn(PETSC_SUCCESS);
1973 }
1974 
1975 /*
1976     This will not work with MatScalar == float because it calls the BLAS
1977 */
1978 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
1979 {
1980   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1981   PetscScalar       *z = NULL, *work, *workt, *zarray;
1982   const PetscScalar *x, *xb;
1983   const MatScalar   *v;
1984   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1985   const PetscInt    *idx, *ii, *ridx = NULL;
1986   PetscInt           ncols, k;
1987   PetscBool          usecprow = a->compressedrow.use;
1988 
1989   PetscFunctionBegin;
1990   PetscCall(VecGetArrayRead(xx, &x));
1991   PetscCall(VecGetArrayWrite(zz, &zarray));
1992 
1993   idx = a->j;
1994   v   = a->a;
1995   if (usecprow) {
1996     mbs  = a->compressedrow.nrows;
1997     ii   = a->compressedrow.i;
1998     ridx = a->compressedrow.rindex;
1999     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2000   } else {
2001     mbs = a->mbs;
2002     ii  = a->i;
2003     z   = zarray;
2004   }
2005 
2006   if (!a->mult_work) {
2007     k = PetscMax(A->rmap->n, A->cmap->n);
2008     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2009   }
2010   work = a->mult_work;
2011   for (i = 0; i < mbs; i++) {
2012     n = ii[1] - ii[0];
2013     ii++;
2014     ncols = n * bs;
2015     workt = work;
2016     for (j = 0; j < n; j++) {
2017       xb = x + bs * (*idx++);
2018       for (k = 0; k < bs; k++) workt[k] = xb[k];
2019       workt += bs;
2020     }
2021     if (usecprow) z = zarray + bs * ridx[i];
2022     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2023     v += n * bs2;
2024     if (!usecprow) z += bs;
2025   }
2026   PetscCall(VecRestoreArrayRead(xx, &x));
2027   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2028   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2029   PetscFunctionReturn(PETSC_SUCCESS);
2030 }
2031 
2032 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2033 {
2034   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2035   const PetscScalar *x;
2036   PetscScalar       *y, *z, sum;
2037   const MatScalar   *v;
2038   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2039   const PetscInt    *idx, *ii;
2040   PetscBool          usecprow = a->compressedrow.use;
2041 
2042   PetscFunctionBegin;
2043   PetscCall(VecGetArrayRead(xx, &x));
2044   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
2045 
2046   idx = a->j;
2047   v   = a->a;
2048   if (usecprow) {
2049     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2050     mbs  = a->compressedrow.nrows;
2051     ii   = a->compressedrow.i;
2052     ridx = a->compressedrow.rindex;
2053   } else {
2054     ii = a->i;
2055   }
2056 
2057   for (i = 0; i < mbs; i++) {
2058     n = ii[1] - ii[0];
2059     ii++;
2060     if (!usecprow) {
2061       sum = y[i];
2062     } else {
2063       sum = y[ridx[i]];
2064     }
2065     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2066     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2067     PetscSparseDensePlusDot(sum, x, v, idx, n);
2068     v += n;
2069     idx += n;
2070     if (usecprow) {
2071       z[ridx[i]] = sum;
2072     } else {
2073       z[i] = sum;
2074     }
2075   }
2076   PetscCall(VecRestoreArrayRead(xx, &x));
2077   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2078   PetscCall(PetscLogFlops(2.0 * a->nz));
2079   PetscFunctionReturn(PETSC_SUCCESS);
2080 }
2081 
2082 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2083 {
2084   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2085   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2086   const PetscScalar *x, *xb;
2087   PetscScalar        x1, x2, *yarray, *zarray;
2088   const MatScalar   *v;
2089   PetscInt           mbs = a->mbs, i, n, j;
2090   const PetscInt    *idx, *ii, *ridx = NULL;
2091   PetscBool          usecprow = a->compressedrow.use;
2092 
2093   PetscFunctionBegin;
2094   PetscCall(VecGetArrayRead(xx, &x));
2095   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2096 
2097   idx = a->j;
2098   v   = a->a;
2099   if (usecprow) {
2100     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2101     mbs  = a->compressedrow.nrows;
2102     ii   = a->compressedrow.i;
2103     ridx = a->compressedrow.rindex;
2104   } else {
2105     ii = a->i;
2106     y  = yarray;
2107     z  = zarray;
2108   }
2109 
2110   for (i = 0; i < mbs; i++) {
2111     n = ii[1] - ii[0];
2112     ii++;
2113     if (usecprow) {
2114       z = zarray + 2 * ridx[i];
2115       y = yarray + 2 * ridx[i];
2116     }
2117     sum1 = y[0];
2118     sum2 = y[1];
2119     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2120     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2121     for (j = 0; j < n; j++) {
2122       xb = x + 2 * (*idx++);
2123       x1 = xb[0];
2124       x2 = xb[1];
2125 
2126       sum1 += v[0] * x1 + v[2] * x2;
2127       sum2 += v[1] * x1 + v[3] * x2;
2128       v += 4;
2129     }
2130     z[0] = sum1;
2131     z[1] = sum2;
2132     if (!usecprow) {
2133       z += 2;
2134       y += 2;
2135     }
2136   }
2137   PetscCall(VecRestoreArrayRead(xx, &x));
2138   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2139   PetscCall(PetscLogFlops(4.0 * a->nz));
2140   PetscFunctionReturn(PETSC_SUCCESS);
2141 }
2142 
2143 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2144 {
2145   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2146   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2147   const PetscScalar *x, *xb;
2148   const MatScalar   *v;
2149   PetscInt           mbs = a->mbs, i, j, n;
2150   const PetscInt    *idx, *ii, *ridx = NULL;
2151   PetscBool          usecprow = a->compressedrow.use;
2152 
2153   PetscFunctionBegin;
2154   PetscCall(VecGetArrayRead(xx, &x));
2155   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2156 
2157   idx = a->j;
2158   v   = a->a;
2159   if (usecprow) {
2160     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2161     mbs  = a->compressedrow.nrows;
2162     ii   = a->compressedrow.i;
2163     ridx = a->compressedrow.rindex;
2164   } else {
2165     ii = a->i;
2166     y  = yarray;
2167     z  = zarray;
2168   }
2169 
2170   for (i = 0; i < mbs; i++) {
2171     n = ii[1] - ii[0];
2172     ii++;
2173     if (usecprow) {
2174       z = zarray + 3 * ridx[i];
2175       y = yarray + 3 * ridx[i];
2176     }
2177     sum1 = y[0];
2178     sum2 = y[1];
2179     sum3 = y[2];
2180     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2181     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2182     for (j = 0; j < n; j++) {
2183       xb = x + 3 * (*idx++);
2184       x1 = xb[0];
2185       x2 = xb[1];
2186       x3 = xb[2];
2187       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2188       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2189       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2190       v += 9;
2191     }
2192     z[0] = sum1;
2193     z[1] = sum2;
2194     z[2] = sum3;
2195     if (!usecprow) {
2196       z += 3;
2197       y += 3;
2198     }
2199   }
2200   PetscCall(VecRestoreArrayRead(xx, &x));
2201   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2202   PetscCall(PetscLogFlops(18.0 * a->nz));
2203   PetscFunctionReturn(PETSC_SUCCESS);
2204 }
2205 
2206 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2207 {
2208   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2209   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2210   const PetscScalar *x, *xb;
2211   const MatScalar   *v;
2212   PetscInt           mbs = a->mbs, i, j, n;
2213   const PetscInt    *idx, *ii, *ridx = NULL;
2214   PetscBool          usecprow = a->compressedrow.use;
2215 
2216   PetscFunctionBegin;
2217   PetscCall(VecGetArrayRead(xx, &x));
2218   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2219 
2220   idx = a->j;
2221   v   = a->a;
2222   if (usecprow) {
2223     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2224     mbs  = a->compressedrow.nrows;
2225     ii   = a->compressedrow.i;
2226     ridx = a->compressedrow.rindex;
2227   } else {
2228     ii = a->i;
2229     y  = yarray;
2230     z  = zarray;
2231   }
2232 
2233   for (i = 0; i < mbs; i++) {
2234     n = ii[1] - ii[0];
2235     ii++;
2236     if (usecprow) {
2237       z = zarray + 4 * ridx[i];
2238       y = yarray + 4 * ridx[i];
2239     }
2240     sum1 = y[0];
2241     sum2 = y[1];
2242     sum3 = y[2];
2243     sum4 = y[3];
2244     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2245     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2246     for (j = 0; j < n; j++) {
2247       xb = x + 4 * (*idx++);
2248       x1 = xb[0];
2249       x2 = xb[1];
2250       x3 = xb[2];
2251       x4 = xb[3];
2252       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2253       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2254       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2255       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2256       v += 16;
2257     }
2258     z[0] = sum1;
2259     z[1] = sum2;
2260     z[2] = sum3;
2261     z[3] = sum4;
2262     if (!usecprow) {
2263       z += 4;
2264       y += 4;
2265     }
2266   }
2267   PetscCall(VecRestoreArrayRead(xx, &x));
2268   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2269   PetscCall(PetscLogFlops(32.0 * a->nz));
2270   PetscFunctionReturn(PETSC_SUCCESS);
2271 }
2272 
2273 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2274 {
2275   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2276   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2277   const PetscScalar *x, *xb;
2278   PetscScalar       *yarray, *zarray;
2279   const MatScalar   *v;
2280   PetscInt           mbs = a->mbs, i, j, n;
2281   const PetscInt    *idx, *ii, *ridx = NULL;
2282   PetscBool          usecprow = a->compressedrow.use;
2283 
2284   PetscFunctionBegin;
2285   PetscCall(VecGetArrayRead(xx, &x));
2286   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2287 
2288   idx = a->j;
2289   v   = a->a;
2290   if (usecprow) {
2291     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2292     mbs  = a->compressedrow.nrows;
2293     ii   = a->compressedrow.i;
2294     ridx = a->compressedrow.rindex;
2295   } else {
2296     ii = a->i;
2297     y  = yarray;
2298     z  = zarray;
2299   }
2300 
2301   for (i = 0; i < mbs; i++) {
2302     n = ii[1] - ii[0];
2303     ii++;
2304     if (usecprow) {
2305       z = zarray + 5 * ridx[i];
2306       y = yarray + 5 * ridx[i];
2307     }
2308     sum1 = y[0];
2309     sum2 = y[1];
2310     sum3 = y[2];
2311     sum4 = y[3];
2312     sum5 = y[4];
2313     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2314     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2315     for (j = 0; j < n; j++) {
2316       xb = x + 5 * (*idx++);
2317       x1 = xb[0];
2318       x2 = xb[1];
2319       x3 = xb[2];
2320       x4 = xb[3];
2321       x5 = xb[4];
2322       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2323       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2324       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2325       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2326       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2327       v += 25;
2328     }
2329     z[0] = sum1;
2330     z[1] = sum2;
2331     z[2] = sum3;
2332     z[3] = sum4;
2333     z[4] = sum5;
2334     if (!usecprow) {
2335       z += 5;
2336       y += 5;
2337     }
2338   }
2339   PetscCall(VecRestoreArrayRead(xx, &x));
2340   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2341   PetscCall(PetscLogFlops(50.0 * a->nz));
2342   PetscFunctionReturn(PETSC_SUCCESS);
2343 }
2344 
2345 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2346 {
2347   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2348   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2349   const PetscScalar *x, *xb;
2350   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2351   const MatScalar   *v;
2352   PetscInt           mbs = a->mbs, i, j, n;
2353   const PetscInt    *idx, *ii, *ridx = NULL;
2354   PetscBool          usecprow = a->compressedrow.use;
2355 
2356   PetscFunctionBegin;
2357   PetscCall(VecGetArrayRead(xx, &x));
2358   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2359 
2360   idx = a->j;
2361   v   = a->a;
2362   if (usecprow) {
2363     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2364     mbs  = a->compressedrow.nrows;
2365     ii   = a->compressedrow.i;
2366     ridx = a->compressedrow.rindex;
2367   } else {
2368     ii = a->i;
2369     y  = yarray;
2370     z  = zarray;
2371   }
2372 
2373   for (i = 0; i < mbs; i++) {
2374     n = ii[1] - ii[0];
2375     ii++;
2376     if (usecprow) {
2377       z = zarray + 6 * ridx[i];
2378       y = yarray + 6 * ridx[i];
2379     }
2380     sum1 = y[0];
2381     sum2 = y[1];
2382     sum3 = y[2];
2383     sum4 = y[3];
2384     sum5 = y[4];
2385     sum6 = y[5];
2386     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2387     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2388     for (j = 0; j < n; j++) {
2389       xb = x + 6 * (*idx++);
2390       x1 = xb[0];
2391       x2 = xb[1];
2392       x3 = xb[2];
2393       x4 = xb[3];
2394       x5 = xb[4];
2395       x6 = xb[5];
2396       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2397       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2398       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2399       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2400       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2401       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2402       v += 36;
2403     }
2404     z[0] = sum1;
2405     z[1] = sum2;
2406     z[2] = sum3;
2407     z[3] = sum4;
2408     z[4] = sum5;
2409     z[5] = sum6;
2410     if (!usecprow) {
2411       z += 6;
2412       y += 6;
2413     }
2414   }
2415   PetscCall(VecRestoreArrayRead(xx, &x));
2416   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2417   PetscCall(PetscLogFlops(72.0 * a->nz));
2418   PetscFunctionReturn(PETSC_SUCCESS);
2419 }
2420 
2421 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2422 {
2423   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2424   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2425   const PetscScalar *x, *xb;
2426   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2427   const MatScalar   *v;
2428   PetscInt           mbs = a->mbs, i, j, n;
2429   const PetscInt    *idx, *ii, *ridx = NULL;
2430   PetscBool          usecprow = a->compressedrow.use;
2431 
2432   PetscFunctionBegin;
2433   PetscCall(VecGetArrayRead(xx, &x));
2434   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2435 
2436   idx = a->j;
2437   v   = a->a;
2438   if (usecprow) {
2439     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2440     mbs  = a->compressedrow.nrows;
2441     ii   = a->compressedrow.i;
2442     ridx = a->compressedrow.rindex;
2443   } else {
2444     ii = a->i;
2445     y  = yarray;
2446     z  = zarray;
2447   }
2448 
2449   for (i = 0; i < mbs; i++) {
2450     n = ii[1] - ii[0];
2451     ii++;
2452     if (usecprow) {
2453       z = zarray + 7 * ridx[i];
2454       y = yarray + 7 * ridx[i];
2455     }
2456     sum1 = y[0];
2457     sum2 = y[1];
2458     sum3 = y[2];
2459     sum4 = y[3];
2460     sum5 = y[4];
2461     sum6 = y[5];
2462     sum7 = y[6];
2463     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2464     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2465     for (j = 0; j < n; j++) {
2466       xb = x + 7 * (*idx++);
2467       x1 = xb[0];
2468       x2 = xb[1];
2469       x3 = xb[2];
2470       x4 = xb[3];
2471       x5 = xb[4];
2472       x6 = xb[5];
2473       x7 = xb[6];
2474       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2475       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2476       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2477       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2478       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2479       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2480       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2481       v += 49;
2482     }
2483     z[0] = sum1;
2484     z[1] = sum2;
2485     z[2] = sum3;
2486     z[3] = sum4;
2487     z[4] = sum5;
2488     z[5] = sum6;
2489     z[6] = sum7;
2490     if (!usecprow) {
2491       z += 7;
2492       y += 7;
2493     }
2494   }
2495   PetscCall(VecRestoreArrayRead(xx, &x));
2496   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2497   PetscCall(PetscLogFlops(98.0 * a->nz));
2498   PetscFunctionReturn(PETSC_SUCCESS);
2499 }
2500 
2501 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2502 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2503 {
2504   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2505   PetscScalar       *z = NULL, *work, *workt, *zarray;
2506   const PetscScalar *x, *xb;
2507   const MatScalar   *v;
2508   PetscInt           mbs, i, j, n;
2509   PetscInt           k;
2510   PetscBool          usecprow = a->compressedrow.use;
2511   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2512 
2513   __m256d a0, a1, a2, a3, a4, a5;
2514   __m256d w0, w1, w2, w3;
2515   __m256d z0, z1, z2;
2516   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2517 
2518   PetscFunctionBegin;
2519   PetscCall(VecCopy(yy, zz));
2520   PetscCall(VecGetArrayRead(xx, &x));
2521   PetscCall(VecGetArray(zz, &zarray));
2522 
2523   idx = a->j;
2524   v   = a->a;
2525   if (usecprow) {
2526     mbs  = a->compressedrow.nrows;
2527     ii   = a->compressedrow.i;
2528     ridx = a->compressedrow.rindex;
2529   } else {
2530     mbs = a->mbs;
2531     ii  = a->i;
2532     z   = zarray;
2533   }
2534 
2535   if (!a->mult_work) {
2536     k = PetscMax(A->rmap->n, A->cmap->n);
2537     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2538   }
2539 
2540   work = a->mult_work;
2541   for (i = 0; i < mbs; i++) {
2542     n = ii[1] - ii[0];
2543     ii++;
2544     workt = work;
2545     for (j = 0; j < n; j++) {
2546       xb = x + bs * (*idx++);
2547       for (k = 0; k < bs; k++) workt[k] = xb[k];
2548       workt += bs;
2549     }
2550     if (usecprow) z = zarray + bs * ridx[i];
2551 
2552     z0 = _mm256_loadu_pd(&z[0]);
2553     z1 = _mm256_loadu_pd(&z[4]);
2554     z2 = _mm256_set1_pd(z[8]);
2555 
2556     for (j = 0; j < n; j++) {
2557       /* first column of a */
2558       w0 = _mm256_set1_pd(work[j * 9]);
2559       a0 = _mm256_loadu_pd(&v[j * 81]);
2560       z0 = _mm256_fmadd_pd(a0, w0, z0);
2561       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2562       z1 = _mm256_fmadd_pd(a1, w0, z1);
2563       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2564       z2 = _mm256_fmadd_pd(a2, w0, z2);
2565 
2566       /* second column of a */
2567       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2568       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2569       z0 = _mm256_fmadd_pd(a0, w1, z0);
2570       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2571       z1 = _mm256_fmadd_pd(a1, w1, z1);
2572       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2573       z2 = _mm256_fmadd_pd(a2, w1, z2);
2574 
2575       /* third column of a */
2576       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2577       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2578       z0 = _mm256_fmadd_pd(a3, w2, z0);
2579       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2580       z1 = _mm256_fmadd_pd(a4, w2, z1);
2581       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2582       z2 = _mm256_fmadd_pd(a5, w2, z2);
2583 
2584       /* fourth column of a */
2585       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2586       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2587       z0 = _mm256_fmadd_pd(a0, w3, z0);
2588       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2589       z1 = _mm256_fmadd_pd(a1, w3, z1);
2590       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2591       z2 = _mm256_fmadd_pd(a2, w3, z2);
2592 
2593       /* fifth column of a */
2594       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2595       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2596       z0 = _mm256_fmadd_pd(a3, w0, z0);
2597       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2598       z1 = _mm256_fmadd_pd(a4, w0, z1);
2599       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2600       z2 = _mm256_fmadd_pd(a5, w0, z2);
2601 
2602       /* sixth column of a */
2603       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2604       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2605       z0 = _mm256_fmadd_pd(a0, w1, z0);
2606       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2607       z1 = _mm256_fmadd_pd(a1, w1, z1);
2608       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2609       z2 = _mm256_fmadd_pd(a2, w1, z2);
2610 
2611       /* seventh column of a */
2612       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2613       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2614       z0 = _mm256_fmadd_pd(a0, w2, z0);
2615       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2616       z1 = _mm256_fmadd_pd(a1, w2, z1);
2617       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2618       z2 = _mm256_fmadd_pd(a2, w2, z2);
2619 
2620       /* eighth column of a */
2621       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2622       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2623       z0 = _mm256_fmadd_pd(a3, w3, z0);
2624       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2625       z1 = _mm256_fmadd_pd(a4, w3, z1);
2626       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2627       z2 = _mm256_fmadd_pd(a5, w3, z2);
2628 
2629       /* ninth column of a */
2630       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2631       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2632       z0 = _mm256_fmadd_pd(a0, w0, z0);
2633       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2634       z1 = _mm256_fmadd_pd(a1, w0, z1);
2635       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2636       z2 = _mm256_fmadd_pd(a2, w0, z2);
2637     }
2638 
2639     _mm256_storeu_pd(&z[0], z0);
2640     _mm256_storeu_pd(&z[4], z1);
2641     _mm256_maskstore_pd(&z[8], mask1, z2);
2642 
2643     v += n * bs2;
2644     if (!usecprow) z += bs;
2645   }
2646   PetscCall(VecRestoreArrayRead(xx, &x));
2647   PetscCall(VecRestoreArray(zz, &zarray));
2648   PetscCall(PetscLogFlops(162.0 * a->nz));
2649   PetscFunctionReturn(PETSC_SUCCESS);
2650 }
2651 #endif
2652 
2653 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2654 {
2655   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2656   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2657   const PetscScalar *x, *xb;
2658   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2659   const MatScalar   *v;
2660   PetscInt           mbs = a->mbs, i, j, n;
2661   const PetscInt    *idx, *ii, *ridx = NULL;
2662   PetscBool          usecprow = a->compressedrow.use;
2663 
2664   PetscFunctionBegin;
2665   PetscCall(VecGetArrayRead(xx, &x));
2666   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2667 
2668   idx = a->j;
2669   v   = a->a;
2670   if (usecprow) {
2671     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2672     mbs  = a->compressedrow.nrows;
2673     ii   = a->compressedrow.i;
2674     ridx = a->compressedrow.rindex;
2675   } else {
2676     ii = a->i;
2677     y  = yarray;
2678     z  = zarray;
2679   }
2680 
2681   for (i = 0; i < mbs; i++) {
2682     n = ii[1] - ii[0];
2683     ii++;
2684     if (usecprow) {
2685       z = zarray + 11 * ridx[i];
2686       y = yarray + 11 * ridx[i];
2687     }
2688     sum1  = y[0];
2689     sum2  = y[1];
2690     sum3  = y[2];
2691     sum4  = y[3];
2692     sum5  = y[4];
2693     sum6  = y[5];
2694     sum7  = y[6];
2695     sum8  = y[7];
2696     sum9  = y[8];
2697     sum10 = y[9];
2698     sum11 = y[10];
2699     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2700     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2701     for (j = 0; j < n; j++) {
2702       xb  = x + 11 * (*idx++);
2703       x1  = xb[0];
2704       x2  = xb[1];
2705       x3  = xb[2];
2706       x4  = xb[3];
2707       x5  = xb[4];
2708       x6  = xb[5];
2709       x7  = xb[6];
2710       x8  = xb[7];
2711       x9  = xb[8];
2712       x10 = xb[9];
2713       x11 = xb[10];
2714       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2715       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2716       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2717       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2718       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2719       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2720       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2721       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2722       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2723       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2724       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2725       v += 121;
2726     }
2727     z[0]  = sum1;
2728     z[1]  = sum2;
2729     z[2]  = sum3;
2730     z[3]  = sum4;
2731     z[4]  = sum5;
2732     z[5]  = sum6;
2733     z[6]  = sum7;
2734     z[7]  = sum8;
2735     z[8]  = sum9;
2736     z[9]  = sum10;
2737     z[10] = sum11;
2738     if (!usecprow) {
2739       z += 11;
2740       y += 11;
2741     }
2742   }
2743   PetscCall(VecRestoreArrayRead(xx, &x));
2744   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2745   PetscCall(PetscLogFlops(242.0 * a->nz));
2746   PetscFunctionReturn(PETSC_SUCCESS);
2747 }
2748 
2749 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2750 {
2751   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2752   PetscScalar       *z = NULL, *work, *workt, *zarray;
2753   const PetscScalar *x, *xb;
2754   const MatScalar   *v;
2755   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2756   PetscInt           ncols, k;
2757   const PetscInt    *ridx     = NULL, *idx, *ii;
2758   PetscBool          usecprow = a->compressedrow.use;
2759 
2760   PetscFunctionBegin;
2761   PetscCall(VecCopy(yy, zz));
2762   PetscCall(VecGetArrayRead(xx, &x));
2763   PetscCall(VecGetArray(zz, &zarray));
2764 
2765   idx = a->j;
2766   v   = a->a;
2767   if (usecprow) {
2768     mbs  = a->compressedrow.nrows;
2769     ii   = a->compressedrow.i;
2770     ridx = a->compressedrow.rindex;
2771   } else {
2772     mbs = a->mbs;
2773     ii  = a->i;
2774     z   = zarray;
2775   }
2776 
2777   if (!a->mult_work) {
2778     k = PetscMax(A->rmap->n, A->cmap->n);
2779     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2780   }
2781   work = a->mult_work;
2782   for (i = 0; i < mbs; i++) {
2783     n = ii[1] - ii[0];
2784     ii++;
2785     ncols = n * bs;
2786     workt = work;
2787     for (j = 0; j < n; j++) {
2788       xb = x + bs * (*idx++);
2789       for (k = 0; k < bs; k++) workt[k] = xb[k];
2790       workt += bs;
2791     }
2792     if (usecprow) z = zarray + bs * ridx[i];
2793     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2794     v += n * bs2;
2795     if (!usecprow) z += bs;
2796   }
2797   PetscCall(VecRestoreArrayRead(xx, &x));
2798   PetscCall(VecRestoreArray(zz, &zarray));
2799   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2800   PetscFunctionReturn(PETSC_SUCCESS);
2801 }
2802 
2803 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2804 {
2805   PetscScalar zero = 0.0;
2806 
2807   PetscFunctionBegin;
2808   PetscCall(VecSet(zz, zero));
2809   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2810   PetscFunctionReturn(PETSC_SUCCESS);
2811 }
2812 
2813 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2814 {
2815   PetscScalar zero = 0.0;
2816 
2817   PetscFunctionBegin;
2818   PetscCall(VecSet(zz, zero));
2819   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2820   PetscFunctionReturn(PETSC_SUCCESS);
2821 }
2822 
2823 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2824 {
2825   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2826   PetscScalar       *z, x1, x2, x3, x4, x5;
2827   const PetscScalar *x, *xb = NULL;
2828   const MatScalar   *v;
2829   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2830   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2831   Mat_CompressedRow  cprow    = a->compressedrow;
2832   PetscBool          usecprow = cprow.use;
2833 
2834   PetscFunctionBegin;
2835   if (yy != zz) PetscCall(VecCopy(yy, zz));
2836   PetscCall(VecGetArrayRead(xx, &x));
2837   PetscCall(VecGetArray(zz, &z));
2838 
2839   idx = a->j;
2840   v   = a->a;
2841   if (usecprow) {
2842     mbs  = cprow.nrows;
2843     ii   = cprow.i;
2844     ridx = cprow.rindex;
2845   } else {
2846     mbs = a->mbs;
2847     ii  = a->i;
2848     xb  = x;
2849   }
2850 
2851   switch (bs) {
2852   case 1:
2853     for (i = 0; i < mbs; i++) {
2854       if (usecprow) xb = x + ridx[i];
2855       x1 = xb[0];
2856       ib = idx + ii[0];
2857       n  = ii[1] - ii[0];
2858       ii++;
2859       for (j = 0; j < n; j++) {
2860         rval = ib[j];
2861         z[rval] += PetscConj(*v) * x1;
2862         v++;
2863       }
2864       if (!usecprow) xb++;
2865     }
2866     break;
2867   case 2:
2868     for (i = 0; i < mbs; i++) {
2869       if (usecprow) xb = x + 2 * ridx[i];
2870       x1 = xb[0];
2871       x2 = xb[1];
2872       ib = idx + ii[0];
2873       n  = ii[1] - ii[0];
2874       ii++;
2875       for (j = 0; j < n; j++) {
2876         rval = ib[j] * 2;
2877         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2878         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2879         v += 4;
2880       }
2881       if (!usecprow) xb += 2;
2882     }
2883     break;
2884   case 3:
2885     for (i = 0; i < mbs; i++) {
2886       if (usecprow) xb = x + 3 * ridx[i];
2887       x1 = xb[0];
2888       x2 = xb[1];
2889       x3 = xb[2];
2890       ib = idx + ii[0];
2891       n  = ii[1] - ii[0];
2892       ii++;
2893       for (j = 0; j < n; j++) {
2894         rval = ib[j] * 3;
2895         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2896         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2897         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2898         v += 9;
2899       }
2900       if (!usecprow) xb += 3;
2901     }
2902     break;
2903   case 4:
2904     for (i = 0; i < mbs; i++) {
2905       if (usecprow) xb = x + 4 * ridx[i];
2906       x1 = xb[0];
2907       x2 = xb[1];
2908       x3 = xb[2];
2909       x4 = xb[3];
2910       ib = idx + ii[0];
2911       n  = ii[1] - ii[0];
2912       ii++;
2913       for (j = 0; j < n; j++) {
2914         rval = ib[j] * 4;
2915         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2916         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2917         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2918         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2919         v += 16;
2920       }
2921       if (!usecprow) xb += 4;
2922     }
2923     break;
2924   case 5:
2925     for (i = 0; i < mbs; i++) {
2926       if (usecprow) xb = x + 5 * ridx[i];
2927       x1 = xb[0];
2928       x2 = xb[1];
2929       x3 = xb[2];
2930       x4 = xb[3];
2931       x5 = xb[4];
2932       ib = idx + ii[0];
2933       n  = ii[1] - ii[0];
2934       ii++;
2935       for (j = 0; j < n; j++) {
2936         rval = ib[j] * 5;
2937         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2938         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2939         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2940         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2941         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2942         v += 25;
2943       }
2944       if (!usecprow) xb += 5;
2945     }
2946     break;
2947   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2948     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2949 #if 0
2950     {
2951       PetscInt          ncols,k,bs2=a->bs2;
2952       PetscScalar       *work,*workt,zb;
2953       const PetscScalar *xtmp;
2954       if (!a->mult_work) {
2955         k    = PetscMax(A->rmap->n,A->cmap->n);
2956         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2957       }
2958       work = a->mult_work;
2959       xtmp = x;
2960       for (i=0; i<mbs; i++) {
2961         n     = ii[1] - ii[0]; ii++;
2962         ncols = n*bs;
2963         PetscCall(PetscArrayzero(work,ncols));
2964         if (usecprow) xtmp = x + bs*ridx[i];
2965         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2966         v += n*bs2;
2967         if (!usecprow) xtmp += bs;
2968         workt = work;
2969         for (j=0; j<n; j++) {
2970           zb = z + bs*(*idx++);
2971           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2972           workt += bs;
2973         }
2974       }
2975     }
2976 #endif
2977   }
2978   PetscCall(VecRestoreArrayRead(xx, &x));
2979   PetscCall(VecRestoreArray(zz, &z));
2980   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2981   PetscFunctionReturn(PETSC_SUCCESS);
2982 }
2983 
2984 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2985 {
2986   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2987   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
2988   const PetscScalar *x, *xb = NULL;
2989   const MatScalar   *v;
2990   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2991   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2992   Mat_CompressedRow  cprow    = a->compressedrow;
2993   PetscBool          usecprow = cprow.use;
2994 
2995   PetscFunctionBegin;
2996   if (yy != zz) PetscCall(VecCopy(yy, zz));
2997   PetscCall(VecGetArrayRead(xx, &x));
2998   PetscCall(VecGetArray(zz, &z));
2999 
3000   idx = a->j;
3001   v   = a->a;
3002   if (usecprow) {
3003     mbs  = cprow.nrows;
3004     ii   = cprow.i;
3005     ridx = cprow.rindex;
3006   } else {
3007     mbs = a->mbs;
3008     ii  = a->i;
3009     xb  = x;
3010   }
3011 
3012   switch (bs) {
3013   case 1:
3014     for (i = 0; i < mbs; i++) {
3015       if (usecprow) xb = x + ridx[i];
3016       x1 = xb[0];
3017       ib = idx + ii[0];
3018       n  = ii[1] - ii[0];
3019       ii++;
3020       for (j = 0; j < n; j++) {
3021         rval = ib[j];
3022         z[rval] += *v * x1;
3023         v++;
3024       }
3025       if (!usecprow) xb++;
3026     }
3027     break;
3028   case 2:
3029     for (i = 0; i < mbs; i++) {
3030       if (usecprow) xb = x + 2 * ridx[i];
3031       x1 = xb[0];
3032       x2 = xb[1];
3033       ib = idx + ii[0];
3034       n  = ii[1] - ii[0];
3035       ii++;
3036       for (j = 0; j < n; j++) {
3037         rval = ib[j] * 2;
3038         z[rval++] += v[0] * x1 + v[1] * x2;
3039         z[rval++] += v[2] * x1 + v[3] * x2;
3040         v += 4;
3041       }
3042       if (!usecprow) xb += 2;
3043     }
3044     break;
3045   case 3:
3046     for (i = 0; i < mbs; i++) {
3047       if (usecprow) xb = x + 3 * ridx[i];
3048       x1 = xb[0];
3049       x2 = xb[1];
3050       x3 = xb[2];
3051       ib = idx + ii[0];
3052       n  = ii[1] - ii[0];
3053       ii++;
3054       for (j = 0; j < n; j++) {
3055         rval = ib[j] * 3;
3056         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3057         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3058         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3059         v += 9;
3060       }
3061       if (!usecprow) xb += 3;
3062     }
3063     break;
3064   case 4:
3065     for (i = 0; i < mbs; i++) {
3066       if (usecprow) xb = x + 4 * ridx[i];
3067       x1 = xb[0];
3068       x2 = xb[1];
3069       x3 = xb[2];
3070       x4 = xb[3];
3071       ib = idx + ii[0];
3072       n  = ii[1] - ii[0];
3073       ii++;
3074       for (j = 0; j < n; j++) {
3075         rval = ib[j] * 4;
3076         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3077         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3078         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3079         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3080         v += 16;
3081       }
3082       if (!usecprow) xb += 4;
3083     }
3084     break;
3085   case 5:
3086     for (i = 0; i < mbs; i++) {
3087       if (usecprow) xb = x + 5 * ridx[i];
3088       x1 = xb[0];
3089       x2 = xb[1];
3090       x3 = xb[2];
3091       x4 = xb[3];
3092       x5 = xb[4];
3093       ib = idx + ii[0];
3094       n  = ii[1] - ii[0];
3095       ii++;
3096       for (j = 0; j < n; j++) {
3097         rval = ib[j] * 5;
3098         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3099         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3100         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3101         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3102         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3103         v += 25;
3104       }
3105       if (!usecprow) xb += 5;
3106     }
3107     break;
3108   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3109     PetscInt           ncols, k;
3110     PetscScalar       *work, *workt;
3111     const PetscScalar *xtmp;
3112     if (!a->mult_work) {
3113       k = PetscMax(A->rmap->n, A->cmap->n);
3114       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3115     }
3116     work = a->mult_work;
3117     xtmp = x;
3118     for (i = 0; i < mbs; i++) {
3119       n = ii[1] - ii[0];
3120       ii++;
3121       ncols = n * bs;
3122       PetscCall(PetscArrayzero(work, ncols));
3123       if (usecprow) xtmp = x + bs * ridx[i];
3124       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3125       v += n * bs2;
3126       if (!usecprow) xtmp += bs;
3127       workt = work;
3128       for (j = 0; j < n; j++) {
3129         zb = z + bs * (*idx++);
3130         for (k = 0; k < bs; k++) zb[k] += workt[k];
3131         workt += bs;
3132       }
3133     }
3134   }
3135   }
3136   PetscCall(VecRestoreArrayRead(xx, &x));
3137   PetscCall(VecRestoreArray(zz, &z));
3138   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3139   PetscFunctionReturn(PETSC_SUCCESS);
3140 }
3141 
3142 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3143 {
3144   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3145   PetscInt     totalnz = a->bs2 * a->nz;
3146   PetscScalar  oalpha  = alpha;
3147   PetscBLASInt one     = 1, tnz;
3148 
3149   PetscFunctionBegin;
3150   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3151   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3152   PetscCall(PetscLogFlops(totalnz));
3153   PetscFunctionReturn(PETSC_SUCCESS);
3154 }
3155 
3156 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3157 {
3158   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3159   MatScalar   *v   = a->a;
3160   PetscReal    sum = 0.0;
3161   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3162 
3163   PetscFunctionBegin;
3164   if (type == NORM_FROBENIUS) {
3165 #if defined(PETSC_USE_REAL___FP16)
3166     PetscBLASInt one = 1, cnt = bs2 * nz;
3167     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3168 #else
3169     for (i = 0; i < bs2 * nz; i++) {
3170       sum += PetscRealPart(PetscConj(*v) * (*v));
3171       v++;
3172     }
3173 #endif
3174     *norm = PetscSqrtReal(sum);
3175     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3176   } else if (type == NORM_1) { /* maximum column sum */
3177     PetscReal *tmp;
3178     PetscInt  *bcol = a->j;
3179     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3180     for (i = 0; i < nz; i++) {
3181       for (j = 0; j < bs; j++) {
3182         k1 = bs * (*bcol) + j; /* column index */
3183         for (k = 0; k < bs; k++) {
3184           tmp[k1] += PetscAbsScalar(*v);
3185           v++;
3186         }
3187       }
3188       bcol++;
3189     }
3190     *norm = 0.0;
3191     for (j = 0; j < A->cmap->n; j++) {
3192       if (tmp[j] > *norm) *norm = tmp[j];
3193     }
3194     PetscCall(PetscFree(tmp));
3195     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3196   } else if (type == NORM_INFINITY) { /* maximum row sum */
3197     *norm = 0.0;
3198     for (k = 0; k < bs; k++) {
3199       for (j = 0; j < a->mbs; j++) {
3200         v   = a->a + bs2 * a->i[j] + k;
3201         sum = 0.0;
3202         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3203           for (k1 = 0; k1 < bs; k1++) {
3204             sum += PetscAbsScalar(*v);
3205             v += bs;
3206           }
3207         }
3208         if (sum > *norm) *norm = sum;
3209       }
3210     }
3211     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3212   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3213   PetscFunctionReturn(PETSC_SUCCESS);
3214 }
3215 
3216 PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3217 {
3218   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
3219 
3220   PetscFunctionBegin;
3221   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3222   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3223     *flg = PETSC_FALSE;
3224     PetscFunctionReturn(PETSC_SUCCESS);
3225   }
3226 
3227   /* if the a->i are the same */
3228   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3229   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
3230 
3231   /* if a->j are the same */
3232   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3233   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
3234 
3235   /* if a->a are the same */
3236   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3237   PetscFunctionReturn(PETSC_SUCCESS);
3238 }
3239 
3240 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3241 {
3242   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3243   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3244   PetscScalar *x, zero = 0.0;
3245   MatScalar   *aa, *aa_j;
3246 
3247   PetscFunctionBegin;
3248   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3249   bs   = A->rmap->bs;
3250   aa   = a->a;
3251   ai   = a->i;
3252   aj   = a->j;
3253   ambs = a->mbs;
3254   bs2  = a->bs2;
3255 
3256   PetscCall(VecSet(v, zero));
3257   PetscCall(VecGetArray(v, &x));
3258   PetscCall(VecGetLocalSize(v, &n));
3259   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3260   for (i = 0; i < ambs; i++) {
3261     for (j = ai[i]; j < ai[i + 1]; j++) {
3262       if (aj[j] == i) {
3263         row  = i * bs;
3264         aa_j = aa + j * bs2;
3265         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3266         break;
3267       }
3268     }
3269   }
3270   PetscCall(VecRestoreArray(v, &x));
3271   PetscFunctionReturn(PETSC_SUCCESS);
3272 }
3273 
3274 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3275 {
3276   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3277   const PetscScalar *l, *r, *li, *ri;
3278   PetscScalar        x;
3279   MatScalar         *aa, *v;
3280   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3281   const PetscInt    *ai, *aj;
3282 
3283   PetscFunctionBegin;
3284   ai  = a->i;
3285   aj  = a->j;
3286   aa  = a->a;
3287   m   = A->rmap->n;
3288   n   = A->cmap->n;
3289   bs  = A->rmap->bs;
3290   mbs = a->mbs;
3291   bs2 = a->bs2;
3292   if (ll) {
3293     PetscCall(VecGetArrayRead(ll, &l));
3294     PetscCall(VecGetLocalSize(ll, &lm));
3295     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3296     for (i = 0; i < mbs; i++) { /* for each block row */
3297       M  = ai[i + 1] - ai[i];
3298       li = l + i * bs;
3299       v  = aa + bs2 * ai[i];
3300       for (j = 0; j < M; j++) { /* for each block */
3301         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3302       }
3303     }
3304     PetscCall(VecRestoreArrayRead(ll, &l));
3305     PetscCall(PetscLogFlops(a->nz));
3306   }
3307 
3308   if (rr) {
3309     PetscCall(VecGetArrayRead(rr, &r));
3310     PetscCall(VecGetLocalSize(rr, &rn));
3311     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3312     for (i = 0; i < mbs; i++) { /* for each block row */
3313       iai = ai[i];
3314       M   = ai[i + 1] - iai;
3315       v   = aa + bs2 * iai;
3316       for (j = 0; j < M; j++) { /* for each block */
3317         ri = r + bs * aj[iai + j];
3318         for (k = 0; k < bs; k++) {
3319           x = ri[k];
3320           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3321           v += bs;
3322         }
3323       }
3324     }
3325     PetscCall(VecRestoreArrayRead(rr, &r));
3326     PetscCall(PetscLogFlops(a->nz));
3327   }
3328   PetscFunctionReturn(PETSC_SUCCESS);
3329 }
3330 
3331 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3332 {
3333   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3334 
3335   PetscFunctionBegin;
3336   info->block_size   = a->bs2;
3337   info->nz_allocated = a->bs2 * a->maxnz;
3338   info->nz_used      = a->bs2 * a->nz;
3339   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3340   info->assemblies   = A->num_ass;
3341   info->mallocs      = A->info.mallocs;
3342   info->memory       = 0; /* REVIEW ME */
3343   if (A->factortype) {
3344     info->fill_ratio_given  = A->info.fill_ratio_given;
3345     info->fill_ratio_needed = A->info.fill_ratio_needed;
3346     info->factor_mallocs    = A->info.factor_mallocs;
3347   } else {
3348     info->fill_ratio_given  = 0;
3349     info->fill_ratio_needed = 0;
3350     info->factor_mallocs    = 0;
3351   }
3352   PetscFunctionReturn(PETSC_SUCCESS);
3353 }
3354 
3355 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3356 {
3357   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3358 
3359   PetscFunctionBegin;
3360   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3361   PetscFunctionReturn(PETSC_SUCCESS);
3362 }
3363 
3364 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3365 {
3366   PetscFunctionBegin;
3367   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3368   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3369   PetscFunctionReturn(PETSC_SUCCESS);
3370 }
3371 
3372 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3373 {
3374   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3375   PetscScalar       *z = NULL, sum1;
3376   const PetscScalar *xb;
3377   PetscScalar        x1;
3378   const MatScalar   *v, *vv;
3379   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3380   PetscBool          usecprow = a->compressedrow.use;
3381 
3382   PetscFunctionBegin;
3383   idx = a->j;
3384   v   = a->a;
3385   if (usecprow) {
3386     mbs  = a->compressedrow.nrows;
3387     ii   = a->compressedrow.i;
3388     ridx = a->compressedrow.rindex;
3389   } else {
3390     mbs = a->mbs;
3391     ii  = a->i;
3392     z   = c;
3393   }
3394 
3395   for (i = 0; i < mbs; i++) {
3396     n = ii[1] - ii[0];
3397     ii++;
3398     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3399     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3400     if (usecprow) z = c + ridx[i];
3401     jj = idx;
3402     vv = v;
3403     for (k = 0; k < cn; k++) {
3404       idx  = jj;
3405       v    = vv;
3406       sum1 = 0.0;
3407       for (j = 0; j < n; j++) {
3408         xb = b + (*idx++);
3409         x1 = xb[0 + k * bm];
3410         sum1 += v[0] * x1;
3411         v += 1;
3412       }
3413       z[0 + k * cm] = sum1;
3414     }
3415     if (!usecprow) z += 1;
3416   }
3417   PetscFunctionReturn(PETSC_SUCCESS);
3418 }
3419 
3420 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3421 {
3422   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3423   PetscScalar       *z = NULL, sum1, sum2;
3424   const PetscScalar *xb;
3425   PetscScalar        x1, x2;
3426   const MatScalar   *v, *vv;
3427   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3428   PetscBool          usecprow = a->compressedrow.use;
3429 
3430   PetscFunctionBegin;
3431   idx = a->j;
3432   v   = a->a;
3433   if (usecprow) {
3434     mbs  = a->compressedrow.nrows;
3435     ii   = a->compressedrow.i;
3436     ridx = a->compressedrow.rindex;
3437   } else {
3438     mbs = a->mbs;
3439     ii  = a->i;
3440     z   = c;
3441   }
3442 
3443   for (i = 0; i < mbs; i++) {
3444     n = ii[1] - ii[0];
3445     ii++;
3446     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3447     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3448     if (usecprow) z = c + 2 * ridx[i];
3449     jj = idx;
3450     vv = v;
3451     for (k = 0; k < cn; k++) {
3452       idx  = jj;
3453       v    = vv;
3454       sum1 = 0.0;
3455       sum2 = 0.0;
3456       for (j = 0; j < n; j++) {
3457         xb = b + 2 * (*idx++);
3458         x1 = xb[0 + k * bm];
3459         x2 = xb[1 + k * bm];
3460         sum1 += v[0] * x1 + v[2] * x2;
3461         sum2 += v[1] * x1 + v[3] * x2;
3462         v += 4;
3463       }
3464       z[0 + k * cm] = sum1;
3465       z[1 + k * cm] = sum2;
3466     }
3467     if (!usecprow) z += 2;
3468   }
3469   PetscFunctionReturn(PETSC_SUCCESS);
3470 }
3471 
3472 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3473 {
3474   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3475   PetscScalar       *z = NULL, sum1, sum2, sum3;
3476   const PetscScalar *xb;
3477   PetscScalar        x1, x2, x3;
3478   const MatScalar   *v, *vv;
3479   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3480   PetscBool          usecprow = a->compressedrow.use;
3481 
3482   PetscFunctionBegin;
3483   idx = a->j;
3484   v   = a->a;
3485   if (usecprow) {
3486     mbs  = a->compressedrow.nrows;
3487     ii   = a->compressedrow.i;
3488     ridx = a->compressedrow.rindex;
3489   } else {
3490     mbs = a->mbs;
3491     ii  = a->i;
3492     z   = c;
3493   }
3494 
3495   for (i = 0; i < mbs; i++) {
3496     n = ii[1] - ii[0];
3497     ii++;
3498     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3499     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3500     if (usecprow) z = c + 3 * ridx[i];
3501     jj = idx;
3502     vv = v;
3503     for (k = 0; k < cn; k++) {
3504       idx  = jj;
3505       v    = vv;
3506       sum1 = 0.0;
3507       sum2 = 0.0;
3508       sum3 = 0.0;
3509       for (j = 0; j < n; j++) {
3510         xb = b + 3 * (*idx++);
3511         x1 = xb[0 + k * bm];
3512         x2 = xb[1 + k * bm];
3513         x3 = xb[2 + k * bm];
3514         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3515         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3516         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3517         v += 9;
3518       }
3519       z[0 + k * cm] = sum1;
3520       z[1 + k * cm] = sum2;
3521       z[2 + k * cm] = sum3;
3522     }
3523     if (!usecprow) z += 3;
3524   }
3525   PetscFunctionReturn(PETSC_SUCCESS);
3526 }
3527 
3528 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3529 {
3530   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3531   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3532   const PetscScalar *xb;
3533   PetscScalar        x1, x2, x3, x4;
3534   const MatScalar   *v, *vv;
3535   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3536   PetscBool          usecprow = a->compressedrow.use;
3537 
3538   PetscFunctionBegin;
3539   idx = a->j;
3540   v   = a->a;
3541   if (usecprow) {
3542     mbs  = a->compressedrow.nrows;
3543     ii   = a->compressedrow.i;
3544     ridx = a->compressedrow.rindex;
3545   } else {
3546     mbs = a->mbs;
3547     ii  = a->i;
3548     z   = c;
3549   }
3550 
3551   for (i = 0; i < mbs; i++) {
3552     n = ii[1] - ii[0];
3553     ii++;
3554     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3555     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3556     if (usecprow) z = c + 4 * ridx[i];
3557     jj = idx;
3558     vv = v;
3559     for (k = 0; k < cn; k++) {
3560       idx  = jj;
3561       v    = vv;
3562       sum1 = 0.0;
3563       sum2 = 0.0;
3564       sum3 = 0.0;
3565       sum4 = 0.0;
3566       for (j = 0; j < n; j++) {
3567         xb = b + 4 * (*idx++);
3568         x1 = xb[0 + k * bm];
3569         x2 = xb[1 + k * bm];
3570         x3 = xb[2 + k * bm];
3571         x4 = xb[3 + k * bm];
3572         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3573         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3574         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3575         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3576         v += 16;
3577       }
3578       z[0 + k * cm] = sum1;
3579       z[1 + k * cm] = sum2;
3580       z[2 + k * cm] = sum3;
3581       z[3 + k * cm] = sum4;
3582     }
3583     if (!usecprow) z += 4;
3584   }
3585   PetscFunctionReturn(PETSC_SUCCESS);
3586 }
3587 
3588 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3589 {
3590   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3591   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3592   const PetscScalar *xb;
3593   PetscScalar        x1, x2, x3, x4, x5;
3594   const MatScalar   *v, *vv;
3595   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3596   PetscBool          usecprow = a->compressedrow.use;
3597 
3598   PetscFunctionBegin;
3599   idx = a->j;
3600   v   = a->a;
3601   if (usecprow) {
3602     mbs  = a->compressedrow.nrows;
3603     ii   = a->compressedrow.i;
3604     ridx = a->compressedrow.rindex;
3605   } else {
3606     mbs = a->mbs;
3607     ii  = a->i;
3608     z   = c;
3609   }
3610 
3611   for (i = 0; i < mbs; i++) {
3612     n = ii[1] - ii[0];
3613     ii++;
3614     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3615     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3616     if (usecprow) z = c + 5 * ridx[i];
3617     jj = idx;
3618     vv = v;
3619     for (k = 0; k < cn; k++) {
3620       idx  = jj;
3621       v    = vv;
3622       sum1 = 0.0;
3623       sum2 = 0.0;
3624       sum3 = 0.0;
3625       sum4 = 0.0;
3626       sum5 = 0.0;
3627       for (j = 0; j < n; j++) {
3628         xb = b + 5 * (*idx++);
3629         x1 = xb[0 + k * bm];
3630         x2 = xb[1 + k * bm];
3631         x3 = xb[2 + k * bm];
3632         x4 = xb[3 + k * bm];
3633         x5 = xb[4 + k * bm];
3634         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3635         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3636         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3637         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3638         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3639         v += 25;
3640       }
3641       z[0 + k * cm] = sum1;
3642       z[1 + k * cm] = sum2;
3643       z[2 + k * cm] = sum3;
3644       z[3 + k * cm] = sum4;
3645       z[4 + k * cm] = sum5;
3646     }
3647     if (!usecprow) z += 5;
3648   }
3649   PetscFunctionReturn(PETSC_SUCCESS);
3650 }
3651 
3652 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3653 {
3654   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3655   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3656   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3657   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3658   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3659   PetscBLASInt     bbs, bcn, bbm, bcm;
3660   PetscScalar     *z = NULL;
3661   PetscScalar     *c, *b;
3662   const MatScalar *v;
3663   const PetscInt  *idx, *ii, *ridx = NULL;
3664   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3665   PetscBool        usecprow = a->compressedrow.use;
3666 
3667   PetscFunctionBegin;
3668   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3669   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3670   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3671   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3672   b = bd->v;
3673   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3674   PetscCall(MatDenseGetArray(C, &c));
3675   switch (bs) {
3676   case 1:
3677     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3678     break;
3679   case 2:
3680     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3681     break;
3682   case 3:
3683     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3684     break;
3685   case 4:
3686     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3687     break;
3688   case 5:
3689     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3690     break;
3691   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3692     PetscCall(PetscBLASIntCast(bs, &bbs));
3693     PetscCall(PetscBLASIntCast(cn, &bcn));
3694     PetscCall(PetscBLASIntCast(bm, &bbm));
3695     PetscCall(PetscBLASIntCast(cm, &bcm));
3696     idx = a->j;
3697     v   = a->a;
3698     if (usecprow) {
3699       mbs  = a->compressedrow.nrows;
3700       ii   = a->compressedrow.i;
3701       ridx = a->compressedrow.rindex;
3702     } else {
3703       mbs = a->mbs;
3704       ii  = a->i;
3705       z   = c;
3706     }
3707     for (i = 0; i < mbs; i++) {
3708       n = ii[1] - ii[0];
3709       ii++;
3710       if (usecprow) z = c + bs * ridx[i];
3711       if (n) {
3712         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3713         v += bs2;
3714       }
3715       for (j = 1; j < n; j++) {
3716         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3717         v += bs2;
3718       }
3719       if (!usecprow) z += bs;
3720     }
3721   }
3722   PetscCall(MatDenseRestoreArray(C, &c));
3723   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3724   PetscFunctionReturn(PETSC_SUCCESS);
3725 }
3726