xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 43cdf1ebbcbfa05bee08e48007ef1bae3f20f4e9)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <../src/mat/impls/sbaij/seq/sbaij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 #include <petscbt.h>
7 #include <petscblaslapack.h>
8 
9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10 {
11   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
12   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
13   const PetscInt *idx;
14   PetscBT         table_out, table_in;
15 
16   PetscFunctionBegin;
17   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
18   mbs = a->mbs;
19   ai  = a->i;
20   aj  = a->j;
21   bs  = A->rmap->bs;
22   PetscCall(PetscBTCreate(mbs, &table_out));
23   PetscCall(PetscMalloc1(mbs + 1, &nidx));
24   PetscCall(PetscBTCreate(mbs, &table_in));
25 
26   for (i = 0; i < is_max; i++) { /* for each is */
27     isz = 0;
28     PetscCall(PetscBTMemzero(mbs, table_out));
29 
30     /* Extract the indices, assume there can be duplicate entries */
31     PetscCall(ISGetIndices(is[i], &idx));
32     PetscCall(ISGetLocalSize(is[i], &n));
33 
34     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35     bcol_max = 0;
36     for (j = 0; j < n; ++j) {
37       brow = idx[j] / bs; /* convert the indices into block indices */
38       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
39       if (!PetscBTLookupSet(table_out, brow)) {
40         nidx[isz++] = brow;
41         if (bcol_max < brow) bcol_max = brow;
42       }
43     }
44     PetscCall(ISRestoreIndices(is[i], &idx));
45     PetscCall(ISDestroy(&is[i]));
46 
47     k = 0;
48     for (j = 0; j < ov; j++) { /* for each overlap */
49       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
50       PetscCall(PetscBTMemzero(mbs, table_in));
51       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52 
53       n = isz; /* length of the updated is[i] */
54       for (brow = 0; brow < mbs; brow++) {
55         start = ai[brow];
56         end   = ai[brow + 1];
57         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58           for (l = start; l < end; l++) {
59             bcol = aj[l];
60             if (!PetscBTLookupSet(table_out, bcol)) {
61               nidx[isz++] = bcol;
62               if (bcol_max < bcol) bcol_max = bcol;
63             }
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else {             /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l < end; l++) {
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table_in, bcol)) {
72               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
80   } /* for each is */
81   PetscCall(PetscBTDestroy(&table_out));
82   PetscCall(PetscFree(nidx));
83   PetscCall(PetscBTDestroy(&table_in));
84   PetscFunctionReturn(0);
85 }
86 
87 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
88         Zero some ops' to avoid invalid usse */
89 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
90 {
91   PetscFunctionBegin;
92   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
93   Bseq->ops->mult                   = NULL;
94   Bseq->ops->multadd                = NULL;
95   Bseq->ops->multtranspose          = NULL;
96   Bseq->ops->multtransposeadd       = NULL;
97   Bseq->ops->lufactor               = NULL;
98   Bseq->ops->choleskyfactor         = NULL;
99   Bseq->ops->lufactorsymbolic       = NULL;
100   Bseq->ops->choleskyfactorsymbolic = NULL;
101   Bseq->ops->getinertia             = NULL;
102   PetscFunctionReturn(0);
103 }
104 
105 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
106 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
107 {
108   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
109   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111   const PetscInt *irow, *icol;
112   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113   PetscInt       *aj = a->j, *ai = a->i;
114   MatScalar      *mat_a;
115   Mat             C;
116   PetscBool       flag;
117 
118   PetscFunctionBegin;
119 
120   PetscCall(ISGetIndices(isrow, &irow));
121   PetscCall(ISGetIndices(iscol, &icol));
122   PetscCall(ISGetLocalSize(isrow, &nrows));
123   PetscCall(ISGetLocalSize(iscol, &ncols));
124 
125   PetscCall(PetscCalloc1(1 + oldcols, &smap));
126   ssmap = smap;
127   PetscCall(PetscMalloc1(1 + nrows, &lens));
128   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129   /* determine lens of each row */
130   for (i = 0; i < nrows; i++) {
131     kstart  = ai[irow[i]];
132     kend    = kstart + a->ilen[irow[i]];
133     lens[i] = 0;
134     for (k = kstart; k < kend; k++) {
135       if (ssmap[aj[k]]) lens[i]++;
136     }
137   }
138   /* Create and fill new matrix */
139   if (scall == MAT_REUSE_MATRIX) {
140     c = (Mat_SeqSBAIJ *)((*B)->data);
141 
142     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
145     PetscCall(PetscArrayzero(c->ilen, c->mbs));
146     C = *B;
147   } else {
148     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
149     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
150     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
151     PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
152   }
153   c = (Mat_SeqSBAIJ *)(C->data);
154   for (i = 0; i < nrows; i++) {
155     row      = irow[i];
156     kstart   = ai[row];
157     kend     = kstart + a->ilen[row];
158     mat_i    = c->i[i];
159     mat_j    = c->j + mat_i;
160     mat_a    = c->a + mat_i * bs2;
161     mat_ilen = c->ilen + i;
162     for (k = kstart; k < kend; k++) {
163       if ((tcol = ssmap[a->j[k]])) {
164         *mat_j++ = tcol - 1;
165         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
166         mat_a += bs2;
167         (*mat_ilen)++;
168       }
169     }
170   }
171   /* sort */
172   {
173     MatScalar *work;
174 
175     PetscCall(PetscMalloc1(bs2, &work));
176     for (i = 0; i < nrows; i++) {
177       PetscInt ilen;
178       mat_i = c->i[i];
179       mat_j = c->j + mat_i;
180       mat_a = c->a + mat_i * bs2;
181       ilen  = c->ilen[i];
182       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
183     }
184     PetscCall(PetscFree(work));
185   }
186 
187   /* Free work space */
188   PetscCall(ISRestoreIndices(iscol, &icol));
189   PetscCall(PetscFree(smap));
190   PetscCall(PetscFree(lens));
191   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
192   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
193 
194   PetscCall(ISRestoreIndices(isrow, &irow));
195   *B = C;
196   PetscFunctionReturn(0);
197 }
198 
199 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
200 {
201   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
202   IS              is1, is2;
203   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
204   const PetscInt *irow, *icol;
205 
206   PetscFunctionBegin;
207   PetscCall(ISGetIndices(isrow, &irow));
208   PetscCall(ISGetIndices(iscol, &icol));
209   PetscCall(ISGetLocalSize(isrow, &nrows));
210   PetscCall(ISGetLocalSize(iscol, &ncols));
211 
212   /* Verify if the indices corespond to each element in a block
213    and form the IS with compressed IS */
214   maxmnbs = PetscMax(a->mbs, a->nbs);
215   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
216   PetscCall(PetscArrayzero(vary, a->mbs));
217   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
218   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
219   count = 0;
220   for (i = 0; i < nrows; i++) {
221     PetscInt j = irow[i] / bs;
222     if ((vary[j]--) == bs) iary[count++] = j;
223   }
224   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
225 
226   PetscCall(PetscArrayzero(vary, a->nbs));
227   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
228   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
229   count = 0;
230   for (i = 0; i < ncols; i++) {
231     PetscInt j = icol[i] / bs;
232     if ((vary[j]--) == bs) iary[count++] = j;
233   }
234   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
235   PetscCall(ISRestoreIndices(isrow, &irow));
236   PetscCall(ISRestoreIndices(iscol, &icol));
237   PetscCall(PetscFree2(vary, iary));
238 
239   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
240   PetscCall(ISDestroy(&is1));
241   PetscCall(ISDestroy(&is2));
242 
243   if (isrow != iscol) {
244     PetscBool isequal;
245     PetscCall(ISEqual(isrow, iscol, &isequal));
246     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
247   }
248   PetscFunctionReturn(0);
249 }
250 
251 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
252 {
253   PetscInt i;
254 
255   PetscFunctionBegin;
256   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
257 
258   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
259   PetscFunctionReturn(0);
260 }
261 
262 /* -------------------------------------------------------*/
263 /* Should check that shapes of vectors and matrices match */
264 /* -------------------------------------------------------*/
265 
266 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
267 {
268   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
269   PetscScalar       *z, x1, x2, zero = 0.0;
270   const PetscScalar *x, *xb;
271   const MatScalar   *v;
272   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
273   const PetscInt    *aj = a->j, *ai = a->i, *ib;
274   PetscInt           nonzerorow = 0;
275 
276   PetscFunctionBegin;
277   PetscCall(VecSet(zz, zero));
278   if (!a->nz) PetscFunctionReturn(0);
279   PetscCall(VecGetArrayRead(xx, &x));
280   PetscCall(VecGetArray(zz, &z));
281 
282   v  = a->a;
283   xb = x;
284 
285   for (i = 0; i < mbs; i++) {
286     n    = ai[1] - ai[0]; /* length of i_th block row of A */
287     x1   = xb[0];
288     x2   = xb[1];
289     ib   = aj + *ai;
290     jmin = 0;
291     nonzerorow += (n > 0);
292     if (*ib == i) { /* (diag of A)*x */
293       z[2 * i] += v[0] * x1 + v[2] * x2;
294       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
295       v += 4;
296       jmin++;
297     }
298     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
299     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
300     for (j = jmin; j < n; j++) {
301       /* (strict lower triangular part of A)*x  */
302       cval = ib[j] * 2;
303       z[cval] += v[0] * x1 + v[1] * x2;
304       z[cval + 1] += v[2] * x1 + v[3] * x2;
305       /* (strict upper triangular part of A)*x  */
306       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
307       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
308       v += 4;
309     }
310     xb += 2;
311     ai++;
312   }
313 
314   PetscCall(VecRestoreArrayRead(xx, &x));
315   PetscCall(VecRestoreArray(zz, &z));
316   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
317   PetscFunctionReturn(0);
318 }
319 
320 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
321 {
322   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
323   PetscScalar       *z, x1, x2, x3, zero = 0.0;
324   const PetscScalar *x, *xb;
325   const MatScalar   *v;
326   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
327   const PetscInt    *aj = a->j, *ai = a->i, *ib;
328   PetscInt           nonzerorow = 0;
329 
330   PetscFunctionBegin;
331   PetscCall(VecSet(zz, zero));
332   if (!a->nz) PetscFunctionReturn(0);
333   PetscCall(VecGetArrayRead(xx, &x));
334   PetscCall(VecGetArray(zz, &z));
335 
336   v  = a->a;
337   xb = x;
338 
339   for (i = 0; i < mbs; i++) {
340     n    = ai[1] - ai[0]; /* length of i_th block row of A */
341     x1   = xb[0];
342     x2   = xb[1];
343     x3   = xb[2];
344     ib   = aj + *ai;
345     jmin = 0;
346     nonzerorow += (n > 0);
347     if (*ib == i) { /* (diag of A)*x */
348       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
349       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
350       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
351       v += 9;
352       jmin++;
353     }
354     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
355     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
356     for (j = jmin; j < n; j++) {
357       /* (strict lower triangular part of A)*x  */
358       cval = ib[j] * 3;
359       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
360       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
361       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
362       /* (strict upper triangular part of A)*x  */
363       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
364       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
365       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
366       v += 9;
367     }
368     xb += 3;
369     ai++;
370   }
371 
372   PetscCall(VecRestoreArrayRead(xx, &x));
373   PetscCall(VecRestoreArray(zz, &z));
374   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
375   PetscFunctionReturn(0);
376 }
377 
378 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
379 {
380   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
381   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
382   const PetscScalar *x, *xb;
383   const MatScalar   *v;
384   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
385   const PetscInt    *aj = a->j, *ai = a->i, *ib;
386   PetscInt           nonzerorow = 0;
387 
388   PetscFunctionBegin;
389   PetscCall(VecSet(zz, zero));
390   if (!a->nz) PetscFunctionReturn(0);
391   PetscCall(VecGetArrayRead(xx, &x));
392   PetscCall(VecGetArray(zz, &z));
393 
394   v  = a->a;
395   xb = x;
396 
397   for (i = 0; i < mbs; i++) {
398     n    = ai[1] - ai[0]; /* length of i_th block row of A */
399     x1   = xb[0];
400     x2   = xb[1];
401     x3   = xb[2];
402     x4   = xb[3];
403     ib   = aj + *ai;
404     jmin = 0;
405     nonzerorow += (n > 0);
406     if (*ib == i) { /* (diag of A)*x */
407       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
408       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
409       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
410       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
411       v += 16;
412       jmin++;
413     }
414     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
415     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
416     for (j = jmin; j < n; j++) {
417       /* (strict lower triangular part of A)*x  */
418       cval = ib[j] * 4;
419       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
420       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
421       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
422       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
423       /* (strict upper triangular part of A)*x  */
424       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
425       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
426       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
427       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
428       v += 16;
429     }
430     xb += 4;
431     ai++;
432   }
433 
434   PetscCall(VecRestoreArrayRead(xx, &x));
435   PetscCall(VecRestoreArray(zz, &z));
436   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
437   PetscFunctionReturn(0);
438 }
439 
440 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
441 {
442   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
443   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
444   const PetscScalar *x, *xb;
445   const MatScalar   *v;
446   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
447   const PetscInt    *aj = a->j, *ai = a->i, *ib;
448   PetscInt           nonzerorow = 0;
449 
450   PetscFunctionBegin;
451   PetscCall(VecSet(zz, zero));
452   if (!a->nz) PetscFunctionReturn(0);
453   PetscCall(VecGetArrayRead(xx, &x));
454   PetscCall(VecGetArray(zz, &z));
455 
456   v  = a->a;
457   xb = x;
458 
459   for (i = 0; i < mbs; i++) {
460     n    = ai[1] - ai[0]; /* length of i_th block row of A */
461     x1   = xb[0];
462     x2   = xb[1];
463     x3   = xb[2];
464     x4   = xb[3];
465     x5   = xb[4];
466     ib   = aj + *ai;
467     jmin = 0;
468     nonzerorow += (n > 0);
469     if (*ib == i) { /* (diag of A)*x */
470       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
471       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
472       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
473       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
474       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
475       v += 25;
476       jmin++;
477     }
478     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
479     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
480     for (j = jmin; j < n; j++) {
481       /* (strict lower triangular part of A)*x  */
482       cval = ib[j] * 5;
483       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
484       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
485       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
486       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
487       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
488       /* (strict upper triangular part of A)*x  */
489       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
490       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
491       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
492       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
493       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
494       v += 25;
495     }
496     xb += 5;
497     ai++;
498   }
499 
500   PetscCall(VecRestoreArrayRead(xx, &x));
501   PetscCall(VecRestoreArray(zz, &z));
502   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
503   PetscFunctionReturn(0);
504 }
505 
506 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
507 {
508   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
509   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
510   const PetscScalar *x, *xb;
511   const MatScalar   *v;
512   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
513   const PetscInt    *aj = a->j, *ai = a->i, *ib;
514   PetscInt           nonzerorow = 0;
515 
516   PetscFunctionBegin;
517   PetscCall(VecSet(zz, zero));
518   if (!a->nz) PetscFunctionReturn(0);
519   PetscCall(VecGetArrayRead(xx, &x));
520   PetscCall(VecGetArray(zz, &z));
521 
522   v  = a->a;
523   xb = x;
524 
525   for (i = 0; i < mbs; i++) {
526     n    = ai[1] - ai[0]; /* length of i_th block row of A */
527     x1   = xb[0];
528     x2   = xb[1];
529     x3   = xb[2];
530     x4   = xb[3];
531     x5   = xb[4];
532     x6   = xb[5];
533     ib   = aj + *ai;
534     jmin = 0;
535     nonzerorow += (n > 0);
536     if (*ib == i) { /* (diag of A)*x */
537       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
538       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
539       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
540       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
541       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
542       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
543       v += 36;
544       jmin++;
545     }
546     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
547     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
548     for (j = jmin; j < n; j++) {
549       /* (strict lower triangular part of A)*x  */
550       cval = ib[j] * 6;
551       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
552       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
553       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
554       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
555       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
556       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
557       /* (strict upper triangular part of A)*x  */
558       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
559       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
560       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
561       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
562       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
563       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
564       v += 36;
565     }
566     xb += 6;
567     ai++;
568   }
569 
570   PetscCall(VecRestoreArrayRead(xx, &x));
571   PetscCall(VecRestoreArray(zz, &z));
572   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
573   PetscFunctionReturn(0);
574 }
575 
576 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
577 {
578   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
579   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
580   const PetscScalar *x, *xb;
581   const MatScalar   *v;
582   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
583   const PetscInt    *aj = a->j, *ai = a->i, *ib;
584   PetscInt           nonzerorow = 0;
585 
586   PetscFunctionBegin;
587   PetscCall(VecSet(zz, zero));
588   if (!a->nz) PetscFunctionReturn(0);
589   PetscCall(VecGetArrayRead(xx, &x));
590   PetscCall(VecGetArray(zz, &z));
591 
592   v  = a->a;
593   xb = x;
594 
595   for (i = 0; i < mbs; i++) {
596     n    = ai[1] - ai[0]; /* length of i_th block row of A */
597     x1   = xb[0];
598     x2   = xb[1];
599     x3   = xb[2];
600     x4   = xb[3];
601     x5   = xb[4];
602     x6   = xb[5];
603     x7   = xb[6];
604     ib   = aj + *ai;
605     jmin = 0;
606     nonzerorow += (n > 0);
607     if (*ib == i) { /* (diag of A)*x */
608       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
609       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
610       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
611       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
612       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
613       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
614       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
615       v += 49;
616       jmin++;
617     }
618     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
619     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
620     for (j = jmin; j < n; j++) {
621       /* (strict lower triangular part of A)*x  */
622       cval = ib[j] * 7;
623       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
624       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
625       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
626       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
627       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
628       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
629       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
630       /* (strict upper triangular part of A)*x  */
631       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
632       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
633       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
634       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
635       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
636       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
637       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
638       v += 49;
639     }
640     xb += 7;
641     ai++;
642   }
643   PetscCall(VecRestoreArrayRead(xx, &x));
644   PetscCall(VecRestoreArray(zz, &z));
645   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
646   PetscFunctionReturn(0);
647 }
648 
649 /*
650     This will not work with MatScalar == float because it calls the BLAS
651 */
652 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
653 {
654   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
655   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
656   const PetscScalar *x, *x_ptr, *xb;
657   const MatScalar   *v;
658   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
659   const PetscInt    *idx, *aj, *ii;
660   PetscInt           nonzerorow = 0;
661 
662   PetscFunctionBegin;
663   PetscCall(VecSet(zz, zero));
664   if (!a->nz) PetscFunctionReturn(0);
665   PetscCall(VecGetArrayRead(xx, &x));
666   PetscCall(VecGetArray(zz, &z));
667 
668   x_ptr = x;
669   z_ptr = z;
670 
671   aj = a->j;
672   v  = a->a;
673   ii = a->i;
674 
675   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
676   work = a->mult_work;
677 
678   for (i = 0; i < mbs; i++) {
679     n     = ii[1] - ii[0];
680     ncols = n * bs;
681     workt = work;
682     idx   = aj + ii[0];
683     nonzerorow += (n > 0);
684 
685     /* upper triangular part */
686     for (j = 0; j < n; j++) {
687       xb = x_ptr + bs * (*idx++);
688       for (k = 0; k < bs; k++) workt[k] = xb[k];
689       workt += bs;
690     }
691     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
692     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
693 
694     /* strict lower triangular part */
695     idx = aj + ii[0];
696     if (n && *idx == i) {
697       ncols -= bs;
698       v += bs2;
699       idx++;
700       n--;
701     }
702 
703     if (ncols > 0) {
704       workt = work;
705       PetscCall(PetscArrayzero(workt, ncols));
706       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
707       for (j = 0; j < n; j++) {
708         zb = z_ptr + bs * (*idx++);
709         for (k = 0; k < bs; k++) zb[k] += workt[k];
710         workt += bs;
711       }
712     }
713     x += bs;
714     v += n * bs2;
715     z += bs;
716     ii++;
717   }
718 
719   PetscCall(VecRestoreArrayRead(xx, &x));
720   PetscCall(VecRestoreArray(zz, &z));
721   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
722   PetscFunctionReturn(0);
723 }
724 
725 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
726 {
727   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
728   PetscScalar       *z, x1;
729   const PetscScalar *x, *xb;
730   const MatScalar   *v;
731   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
732   const PetscInt    *aj = a->j, *ai = a->i, *ib;
733   PetscInt           nonzerorow = 0;
734 #if defined(PETSC_USE_COMPLEX)
735   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
736 #else
737   const int aconj = 0;
738 #endif
739 
740   PetscFunctionBegin;
741   PetscCall(VecCopy(yy, zz));
742   if (!a->nz) PetscFunctionReturn(0);
743   PetscCall(VecGetArrayRead(xx, &x));
744   PetscCall(VecGetArray(zz, &z));
745   v  = a->a;
746   xb = x;
747 
748   for (i = 0; i < mbs; i++) {
749     n    = ai[1] - ai[0]; /* length of i_th row of A */
750     x1   = xb[0];
751     ib   = aj + *ai;
752     jmin = 0;
753     nonzerorow += (n > 0);
754     if (n && *ib == i) { /* (diag of A)*x */
755       z[i] += *v++ * x[*ib++];
756       jmin++;
757     }
758     if (aconj) {
759       for (j = jmin; j < n; j++) {
760         cval = *ib;
761         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
762         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
763       }
764     } else {
765       for (j = jmin; j < n; j++) {
766         cval = *ib;
767         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
768         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
769       }
770     }
771     xb++;
772     ai++;
773   }
774 
775   PetscCall(VecRestoreArrayRead(xx, &x));
776   PetscCall(VecRestoreArray(zz, &z));
777 
778   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
779   PetscFunctionReturn(0);
780 }
781 
782 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
783 {
784   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
785   PetscScalar       *z, x1, x2;
786   const PetscScalar *x, *xb;
787   const MatScalar   *v;
788   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
789   const PetscInt    *aj = a->j, *ai = a->i, *ib;
790   PetscInt           nonzerorow = 0;
791 
792   PetscFunctionBegin;
793   PetscCall(VecCopy(yy, zz));
794   if (!a->nz) PetscFunctionReturn(0);
795   PetscCall(VecGetArrayRead(xx, &x));
796   PetscCall(VecGetArray(zz, &z));
797 
798   v  = a->a;
799   xb = x;
800 
801   for (i = 0; i < mbs; i++) {
802     n    = ai[1] - ai[0]; /* length of i_th block row of A */
803     x1   = xb[0];
804     x2   = xb[1];
805     ib   = aj + *ai;
806     jmin = 0;
807     nonzerorow += (n > 0);
808     if (n && *ib == i) { /* (diag of A)*x */
809       z[2 * i] += v[0] * x1 + v[2] * x2;
810       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
811       v += 4;
812       jmin++;
813     }
814     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
815     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
816     for (j = jmin; j < n; j++) {
817       /* (strict lower triangular part of A)*x  */
818       cval = ib[j] * 2;
819       z[cval] += v[0] * x1 + v[1] * x2;
820       z[cval + 1] += v[2] * x1 + v[3] * x2;
821       /* (strict upper triangular part of A)*x  */
822       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
823       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
824       v += 4;
825     }
826     xb += 2;
827     ai++;
828   }
829   PetscCall(VecRestoreArrayRead(xx, &x));
830   PetscCall(VecRestoreArray(zz, &z));
831 
832   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
833   PetscFunctionReturn(0);
834 }
835 
836 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
837 {
838   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
839   PetscScalar       *z, x1, x2, x3;
840   const PetscScalar *x, *xb;
841   const MatScalar   *v;
842   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
843   const PetscInt    *aj = a->j, *ai = a->i, *ib;
844   PetscInt           nonzerorow = 0;
845 
846   PetscFunctionBegin;
847   PetscCall(VecCopy(yy, zz));
848   if (!a->nz) PetscFunctionReturn(0);
849   PetscCall(VecGetArrayRead(xx, &x));
850   PetscCall(VecGetArray(zz, &z));
851 
852   v  = a->a;
853   xb = x;
854 
855   for (i = 0; i < mbs; i++) {
856     n    = ai[1] - ai[0]; /* length of i_th block row of A */
857     x1   = xb[0];
858     x2   = xb[1];
859     x3   = xb[2];
860     ib   = aj + *ai;
861     jmin = 0;
862     nonzerorow += (n > 0);
863     if (n && *ib == i) { /* (diag of A)*x */
864       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
865       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
866       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
867       v += 9;
868       jmin++;
869     }
870     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
871     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
872     for (j = jmin; j < n; j++) {
873       /* (strict lower triangular part of A)*x  */
874       cval = ib[j] * 3;
875       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
876       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
877       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
878       /* (strict upper triangular part of A)*x  */
879       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
880       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
881       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
882       v += 9;
883     }
884     xb += 3;
885     ai++;
886   }
887 
888   PetscCall(VecRestoreArrayRead(xx, &x));
889   PetscCall(VecRestoreArray(zz, &z));
890 
891   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
892   PetscFunctionReturn(0);
893 }
894 
895 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
896 {
897   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
898   PetscScalar       *z, x1, x2, x3, x4;
899   const PetscScalar *x, *xb;
900   const MatScalar   *v;
901   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
902   const PetscInt    *aj = a->j, *ai = a->i, *ib;
903   PetscInt           nonzerorow = 0;
904 
905   PetscFunctionBegin;
906   PetscCall(VecCopy(yy, zz));
907   if (!a->nz) PetscFunctionReturn(0);
908   PetscCall(VecGetArrayRead(xx, &x));
909   PetscCall(VecGetArray(zz, &z));
910 
911   v  = a->a;
912   xb = x;
913 
914   for (i = 0; i < mbs; i++) {
915     n    = ai[1] - ai[0]; /* length of i_th block row of A */
916     x1   = xb[0];
917     x2   = xb[1];
918     x3   = xb[2];
919     x4   = xb[3];
920     ib   = aj + *ai;
921     jmin = 0;
922     nonzerorow += (n > 0);
923     if (n && *ib == i) { /* (diag of A)*x */
924       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
925       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
926       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
927       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
928       v += 16;
929       jmin++;
930     }
931     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
932     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
933     for (j = jmin; j < n; j++) {
934       /* (strict lower triangular part of A)*x  */
935       cval = ib[j] * 4;
936       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
937       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
938       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
939       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
940       /* (strict upper triangular part of A)*x  */
941       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
942       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
943       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
944       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
945       v += 16;
946     }
947     xb += 4;
948     ai++;
949   }
950 
951   PetscCall(VecRestoreArrayRead(xx, &x));
952   PetscCall(VecRestoreArray(zz, &z));
953 
954   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
955   PetscFunctionReturn(0);
956 }
957 
958 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
959 {
960   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
961   PetscScalar       *z, x1, x2, x3, x4, x5;
962   const PetscScalar *x, *xb;
963   const MatScalar   *v;
964   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
965   const PetscInt    *aj = a->j, *ai = a->i, *ib;
966   PetscInt           nonzerorow = 0;
967 
968   PetscFunctionBegin;
969   PetscCall(VecCopy(yy, zz));
970   if (!a->nz) PetscFunctionReturn(0);
971   PetscCall(VecGetArrayRead(xx, &x));
972   PetscCall(VecGetArray(zz, &z));
973 
974   v  = a->a;
975   xb = x;
976 
977   for (i = 0; i < mbs; i++) {
978     n    = ai[1] - ai[0]; /* length of i_th block row of A */
979     x1   = xb[0];
980     x2   = xb[1];
981     x3   = xb[2];
982     x4   = xb[3];
983     x5   = xb[4];
984     ib   = aj + *ai;
985     jmin = 0;
986     nonzerorow += (n > 0);
987     if (n && *ib == i) { /* (diag of A)*x */
988       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
989       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
990       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
991       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
992       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
993       v += 25;
994       jmin++;
995     }
996     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
997     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
998     for (j = jmin; j < n; j++) {
999       /* (strict lower triangular part of A)*x  */
1000       cval = ib[j] * 5;
1001       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1002       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1003       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1004       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1005       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1006       /* (strict upper triangular part of A)*x  */
1007       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1008       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1009       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1010       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1011       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1012       v += 25;
1013     }
1014     xb += 5;
1015     ai++;
1016   }
1017 
1018   PetscCall(VecRestoreArrayRead(xx, &x));
1019   PetscCall(VecRestoreArray(zz, &z));
1020 
1021   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1022   PetscFunctionReturn(0);
1023 }
1024 
1025 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1026 {
1027   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1028   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1029   const PetscScalar *x, *xb;
1030   const MatScalar   *v;
1031   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1032   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1033   PetscInt           nonzerorow = 0;
1034 
1035   PetscFunctionBegin;
1036   PetscCall(VecCopy(yy, zz));
1037   if (!a->nz) PetscFunctionReturn(0);
1038   PetscCall(VecGetArrayRead(xx, &x));
1039   PetscCall(VecGetArray(zz, &z));
1040 
1041   v  = a->a;
1042   xb = x;
1043 
1044   for (i = 0; i < mbs; i++) {
1045     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1046     x1   = xb[0];
1047     x2   = xb[1];
1048     x3   = xb[2];
1049     x4   = xb[3];
1050     x5   = xb[4];
1051     x6   = xb[5];
1052     ib   = aj + *ai;
1053     jmin = 0;
1054     nonzerorow += (n > 0);
1055     if (n && *ib == i) { /* (diag of A)*x */
1056       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1057       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1058       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1059       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1060       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1061       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1062       v += 36;
1063       jmin++;
1064     }
1065     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1066     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1067     for (j = jmin; j < n; j++) {
1068       /* (strict lower triangular part of A)*x  */
1069       cval = ib[j] * 6;
1070       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1071       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1072       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1073       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1074       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1075       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1076       /* (strict upper triangular part of A)*x  */
1077       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1078       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1079       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1080       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1081       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1082       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1083       v += 36;
1084     }
1085     xb += 6;
1086     ai++;
1087   }
1088 
1089   PetscCall(VecRestoreArrayRead(xx, &x));
1090   PetscCall(VecRestoreArray(zz, &z));
1091 
1092   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1093   PetscFunctionReturn(0);
1094 }
1095 
1096 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1097 {
1098   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1099   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1100   const PetscScalar *x, *xb;
1101   const MatScalar   *v;
1102   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1103   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1104   PetscInt           nonzerorow = 0;
1105 
1106   PetscFunctionBegin;
1107   PetscCall(VecCopy(yy, zz));
1108   if (!a->nz) PetscFunctionReturn(0);
1109   PetscCall(VecGetArrayRead(xx, &x));
1110   PetscCall(VecGetArray(zz, &z));
1111 
1112   v  = a->a;
1113   xb = x;
1114 
1115   for (i = 0; i < mbs; i++) {
1116     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1117     x1   = xb[0];
1118     x2   = xb[1];
1119     x3   = xb[2];
1120     x4   = xb[3];
1121     x5   = xb[4];
1122     x6   = xb[5];
1123     x7   = xb[6];
1124     ib   = aj + *ai;
1125     jmin = 0;
1126     nonzerorow += (n > 0);
1127     if (n && *ib == i) { /* (diag of A)*x */
1128       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1129       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1130       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1131       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1132       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1133       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1134       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1135       v += 49;
1136       jmin++;
1137     }
1138     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1139     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1140     for (j = jmin; j < n; j++) {
1141       /* (strict lower triangular part of A)*x  */
1142       cval = ib[j] * 7;
1143       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1144       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1145       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1146       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1147       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1148       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1149       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1150       /* (strict upper triangular part of A)*x  */
1151       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1152       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1153       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1154       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1155       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1156       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1157       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1158       v += 49;
1159     }
1160     xb += 7;
1161     ai++;
1162   }
1163 
1164   PetscCall(VecRestoreArrayRead(xx, &x));
1165   PetscCall(VecRestoreArray(zz, &z));
1166 
1167   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1168   PetscFunctionReturn(0);
1169 }
1170 
1171 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1172 {
1173   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1174   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1175   const PetscScalar *x, *x_ptr, *xb;
1176   const MatScalar   *v;
1177   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1178   const PetscInt    *idx, *aj, *ii;
1179   PetscInt           nonzerorow = 0;
1180 
1181   PetscFunctionBegin;
1182   PetscCall(VecCopy(yy, zz));
1183   if (!a->nz) PetscFunctionReturn(0);
1184   PetscCall(VecGetArrayRead(xx, &x));
1185   x_ptr = x;
1186   PetscCall(VecGetArray(zz, &z));
1187   z_ptr = z;
1188 
1189   aj = a->j;
1190   v  = a->a;
1191   ii = a->i;
1192 
1193   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1194   work = a->mult_work;
1195 
1196   for (i = 0; i < mbs; i++) {
1197     n     = ii[1] - ii[0];
1198     ncols = n * bs;
1199     workt = work;
1200     idx   = aj + ii[0];
1201     nonzerorow += (n > 0);
1202 
1203     /* upper triangular part */
1204     for (j = 0; j < n; j++) {
1205       xb = x_ptr + bs * (*idx++);
1206       for (k = 0; k < bs; k++) workt[k] = xb[k];
1207       workt += bs;
1208     }
1209     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1210     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1211 
1212     /* strict lower triangular part */
1213     idx = aj + ii[0];
1214     if (n && *idx == i) {
1215       ncols -= bs;
1216       v += bs2;
1217       idx++;
1218       n--;
1219     }
1220     if (ncols > 0) {
1221       workt = work;
1222       PetscCall(PetscArrayzero(workt, ncols));
1223       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1224       for (j = 0; j < n; j++) {
1225         zb = z_ptr + bs * (*idx++);
1226         for (k = 0; k < bs; k++) zb[k] += workt[k];
1227         workt += bs;
1228       }
1229     }
1230 
1231     x += bs;
1232     v += n * bs2;
1233     z += bs;
1234     ii++;
1235   }
1236 
1237   PetscCall(VecRestoreArrayRead(xx, &x));
1238   PetscCall(VecRestoreArray(zz, &z));
1239 
1240   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1241   PetscFunctionReturn(0);
1242 }
1243 
1244 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1245 {
1246   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1247   PetscScalar   oalpha = alpha;
1248   PetscBLASInt  one    = 1, totalnz;
1249 
1250   PetscFunctionBegin;
1251   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1252   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1253   PetscCall(PetscLogFlops(totalnz));
1254   PetscFunctionReturn(0);
1255 }
1256 
1257 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1258 {
1259   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1260   const MatScalar *v        = a->a;
1261   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1262   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1263   const PetscInt  *aj = a->j, *col;
1264 
1265   PetscFunctionBegin;
1266   if (!a->nz) {
1267     *norm = 0.0;
1268     PetscFunctionReturn(0);
1269   }
1270   if (type == NORM_FROBENIUS) {
1271     for (k = 0; k < mbs; k++) {
1272       jmin = a->i[k];
1273       jmax = a->i[k + 1];
1274       col  = aj + jmin;
1275       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1276         for (i = 0; i < bs2; i++) {
1277           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1278           v++;
1279         }
1280         jmin++;
1281       }
1282       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1283         for (i = 0; i < bs2; i++) {
1284           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1285           v++;
1286         }
1287       }
1288     }
1289     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1290     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1291   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1292     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1293     for (i = 0; i < mbs; i++) jl[i] = mbs;
1294     il[0] = 0;
1295 
1296     *norm = 0.0;
1297     for (k = 0; k < mbs; k++) { /* k_th block row */
1298       for (j = 0; j < bs; j++) sum[j] = 0.0;
1299       /*-- col sum --*/
1300       i = jl[k]; /* first |A(i,k)| to be added */
1301       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1302                   at step k */
1303       while (i < mbs) {
1304         nexti = jl[i]; /* next block row to be added */
1305         ik    = il[i]; /* block index of A(i,k) in the array a */
1306         for (j = 0; j < bs; j++) {
1307           v = a->a + ik * bs2 + j * bs;
1308           for (k1 = 0; k1 < bs; k1++) {
1309             sum[j] += PetscAbsScalar(*v);
1310             v++;
1311           }
1312         }
1313         /* update il, jl */
1314         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1315         jmax = a->i[i + 1];
1316         if (jmin < jmax) {
1317           il[i] = jmin;
1318           j     = a->j[jmin];
1319           jl[i] = jl[j];
1320           jl[j] = i;
1321         }
1322         i = nexti;
1323       }
1324       /*-- row sum --*/
1325       jmin = a->i[k];
1326       jmax = a->i[k + 1];
1327       for (i = jmin; i < jmax; i++) {
1328         for (j = 0; j < bs; j++) {
1329           v = a->a + i * bs2 + j;
1330           for (k1 = 0; k1 < bs; k1++) {
1331             sum[j] += PetscAbsScalar(*v);
1332             v += bs;
1333           }
1334         }
1335       }
1336       /* add k_th block row to il, jl */
1337       col = aj + jmin;
1338       if (jmax - jmin > 0 && *col == k) jmin++;
1339       if (jmin < jmax) {
1340         il[k] = jmin;
1341         j     = a->j[jmin];
1342         jl[k] = jl[j];
1343         jl[j] = k;
1344       }
1345       for (j = 0; j < bs; j++) {
1346         if (sum[j] > *norm) *norm = sum[j];
1347       }
1348     }
1349     PetscCall(PetscFree3(sum, il, jl));
1350     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1351   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1352   PetscFunctionReturn(0);
1353 }
1354 
1355 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1356 {
1357   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1358 
1359   PetscFunctionBegin;
1360   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1361   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1362     *flg = PETSC_FALSE;
1363     PetscFunctionReturn(0);
1364   }
1365 
1366   /* if the a->i are the same */
1367   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1368   if (!*flg) PetscFunctionReturn(0);
1369 
1370   /* if a->j are the same */
1371   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1372   if (!*flg) PetscFunctionReturn(0);
1373 
1374   /* if a->a are the same */
1375   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1376   PetscFunctionReturn(0);
1377 }
1378 
1379 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1380 {
1381   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1382   PetscInt         i, j, k, row, bs, ambs, bs2;
1383   const PetscInt  *ai, *aj;
1384   PetscScalar     *x, zero = 0.0;
1385   const MatScalar *aa, *aa_j;
1386 
1387   PetscFunctionBegin;
1388   bs = A->rmap->bs;
1389   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1390 
1391   aa   = a->a;
1392   ambs = a->mbs;
1393 
1394   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1395     PetscInt *diag = a->diag;
1396     aa             = a->a;
1397     ambs           = a->mbs;
1398     PetscCall(VecGetArray(v, &x));
1399     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1400     PetscCall(VecRestoreArray(v, &x));
1401     PetscFunctionReturn(0);
1402   }
1403 
1404   ai  = a->i;
1405   aj  = a->j;
1406   bs2 = a->bs2;
1407   PetscCall(VecSet(v, zero));
1408   if (!a->nz) PetscFunctionReturn(0);
1409   PetscCall(VecGetArray(v, &x));
1410   for (i = 0; i < ambs; i++) {
1411     j = ai[i];
1412     if (aj[j] == i) { /* if this is a diagonal element */
1413       row  = i * bs;
1414       aa_j = aa + j * bs2;
1415       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1416     }
1417   }
1418   PetscCall(VecRestoreArray(v, &x));
1419   PetscFunctionReturn(0);
1420 }
1421 
1422 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1423 {
1424   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1425   PetscScalar        x;
1426   const PetscScalar *l, *li, *ri;
1427   MatScalar         *aa, *v;
1428   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1429   const PetscInt    *ai, *aj;
1430   PetscBool          flg;
1431 
1432   PetscFunctionBegin;
1433   if (ll != rr) {
1434     PetscCall(VecEqual(ll, rr, &flg));
1435     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1436   }
1437   if (!ll) PetscFunctionReturn(0);
1438   ai  = a->i;
1439   aj  = a->j;
1440   aa  = a->a;
1441   m   = A->rmap->N;
1442   bs  = A->rmap->bs;
1443   mbs = a->mbs;
1444   bs2 = a->bs2;
1445 
1446   PetscCall(VecGetArrayRead(ll, &l));
1447   PetscCall(VecGetLocalSize(ll, &lm));
1448   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1449   for (i = 0; i < mbs; i++) { /* for each block row */
1450     M  = ai[i + 1] - ai[i];
1451     li = l + i * bs;
1452     v  = aa + bs2 * ai[i];
1453     for (j = 0; j < M; j++) { /* for each block */
1454       ri = l + bs * aj[ai[i] + j];
1455       for (k = 0; k < bs; k++) {
1456         x = ri[k];
1457         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1458       }
1459     }
1460   }
1461   PetscCall(VecRestoreArrayRead(ll, &l));
1462   PetscCall(PetscLogFlops(2.0 * a->nz));
1463   PetscFunctionReturn(0);
1464 }
1465 
1466 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1467 {
1468   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1469 
1470   PetscFunctionBegin;
1471   info->block_size   = a->bs2;
1472   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1473   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1474   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1475   info->assemblies   = A->num_ass;
1476   info->mallocs      = A->info.mallocs;
1477   info->memory       = 0; /* REVIEW ME */
1478   if (A->factortype) {
1479     info->fill_ratio_given  = A->info.fill_ratio_given;
1480     info->fill_ratio_needed = A->info.fill_ratio_needed;
1481     info->factor_mallocs    = A->info.factor_mallocs;
1482   } else {
1483     info->fill_ratio_given  = 0;
1484     info->fill_ratio_needed = 0;
1485     info->factor_mallocs    = 0;
1486   }
1487   PetscFunctionReturn(0);
1488 }
1489 
1490 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1491 {
1492   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1493 
1494   PetscFunctionBegin;
1495   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1496   PetscFunctionReturn(0);
1497 }
1498 
1499 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1500 {
1501   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1502   PetscInt         i, j, n, row, col, bs, mbs;
1503   const PetscInt  *ai, *aj;
1504   PetscReal        atmp;
1505   const MatScalar *aa;
1506   PetscScalar     *x;
1507   PetscInt         ncols, brow, bcol, krow, kcol;
1508 
1509   PetscFunctionBegin;
1510   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1511   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1512   bs  = A->rmap->bs;
1513   aa  = a->a;
1514   ai  = a->i;
1515   aj  = a->j;
1516   mbs = a->mbs;
1517 
1518   PetscCall(VecSet(v, 0.0));
1519   PetscCall(VecGetArray(v, &x));
1520   PetscCall(VecGetLocalSize(v, &n));
1521   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1522   for (i = 0; i < mbs; i++) {
1523     ncols = ai[1] - ai[0];
1524     ai++;
1525     brow = bs * i;
1526     for (j = 0; j < ncols; j++) {
1527       bcol = bs * (*aj);
1528       for (kcol = 0; kcol < bs; kcol++) {
1529         col = bcol + kcol; /* col index */
1530         for (krow = 0; krow < bs; krow++) {
1531           atmp = PetscAbsScalar(*aa);
1532           aa++;
1533           row = brow + krow; /* row index */
1534           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1535           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1536         }
1537       }
1538       aj++;
1539     }
1540   }
1541   PetscCall(VecRestoreArray(v, &x));
1542   PetscFunctionReturn(0);
1543 }
1544 
1545 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1546 {
1547   PetscFunctionBegin;
1548   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1549   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1550   PetscFunctionReturn(0);
1551 }
1552 
1553 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1554 {
1555   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1556   PetscScalar       *z = c;
1557   const PetscScalar *xb;
1558   PetscScalar        x1;
1559   const MatScalar   *v   = a->a, *vv;
1560   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1561 #if defined(PETSC_USE_COMPLEX)
1562   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1563 #else
1564   const int aconj = 0;
1565 #endif
1566 
1567   PetscFunctionBegin;
1568   for (i = 0; i < mbs; i++) {
1569     n = ii[1] - ii[0];
1570     ii++;
1571     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1572     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1573     jj = idx;
1574     vv = v;
1575     for (k = 0; k < cn; k++) {
1576       idx = jj;
1577       v   = vv;
1578       for (j = 0; j < n; j++) {
1579         xb = b + (*idx);
1580         x1 = xb[0 + k * bm];
1581         z[0 + k * cm] += v[0] * x1;
1582         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1583         v += 1;
1584         ++idx;
1585       }
1586     }
1587     z += 1;
1588   }
1589   PetscFunctionReturn(0);
1590 }
1591 
1592 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1593 {
1594   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1595   PetscScalar       *z = c;
1596   const PetscScalar *xb;
1597   PetscScalar        x1, x2;
1598   const MatScalar   *v   = a->a, *vv;
1599   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1600 
1601   PetscFunctionBegin;
1602   for (i = 0; i < mbs; i++) {
1603     n = ii[1] - ii[0];
1604     ii++;
1605     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1606     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1607     jj = idx;
1608     vv = v;
1609     for (k = 0; k < cn; k++) {
1610       idx = jj;
1611       v   = vv;
1612       for (j = 0; j < n; j++) {
1613         xb = b + 2 * (*idx);
1614         x1 = xb[0 + k * bm];
1615         x2 = xb[1 + k * bm];
1616         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1617         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1618         if (*idx != i) {
1619           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1620           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1621         }
1622         v += 4;
1623         ++idx;
1624       }
1625     }
1626     z += 2;
1627   }
1628   PetscFunctionReturn(0);
1629 }
1630 
1631 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1632 {
1633   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1634   PetscScalar       *z = c;
1635   const PetscScalar *xb;
1636   PetscScalar        x1, x2, x3;
1637   const MatScalar   *v   = a->a, *vv;
1638   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1639 
1640   PetscFunctionBegin;
1641   for (i = 0; i < mbs; i++) {
1642     n = ii[1] - ii[0];
1643     ii++;
1644     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1645     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1646     jj = idx;
1647     vv = v;
1648     for (k = 0; k < cn; k++) {
1649       idx = jj;
1650       v   = vv;
1651       for (j = 0; j < n; j++) {
1652         xb = b + 3 * (*idx);
1653         x1 = xb[0 + k * bm];
1654         x2 = xb[1 + k * bm];
1655         x3 = xb[2 + k * bm];
1656         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1657         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1658         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1659         if (*idx != i) {
1660           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1661           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1662           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1663         }
1664         v += 9;
1665         ++idx;
1666       }
1667     }
1668     z += 3;
1669   }
1670   PetscFunctionReturn(0);
1671 }
1672 
1673 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1674 {
1675   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1676   PetscScalar       *z = c;
1677   const PetscScalar *xb;
1678   PetscScalar        x1, x2, x3, x4;
1679   const MatScalar   *v   = a->a, *vv;
1680   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1681 
1682   PetscFunctionBegin;
1683   for (i = 0; i < mbs; i++) {
1684     n = ii[1] - ii[0];
1685     ii++;
1686     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1687     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1688     jj = idx;
1689     vv = v;
1690     for (k = 0; k < cn; k++) {
1691       idx = jj;
1692       v   = vv;
1693       for (j = 0; j < n; j++) {
1694         xb = b + 4 * (*idx);
1695         x1 = xb[0 + k * bm];
1696         x2 = xb[1 + k * bm];
1697         x3 = xb[2 + k * bm];
1698         x4 = xb[3 + k * bm];
1699         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1700         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1701         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1702         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1703         if (*idx != i) {
1704           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1705           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1706           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1707           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1708         }
1709         v += 16;
1710         ++idx;
1711       }
1712     }
1713     z += 4;
1714   }
1715   PetscFunctionReturn(0);
1716 }
1717 
1718 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1719 {
1720   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1721   PetscScalar       *z = c;
1722   const PetscScalar *xb;
1723   PetscScalar        x1, x2, x3, x4, x5;
1724   const MatScalar   *v   = a->a, *vv;
1725   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1726 
1727   PetscFunctionBegin;
1728   for (i = 0; i < mbs; i++) {
1729     n = ii[1] - ii[0];
1730     ii++;
1731     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1732     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1733     jj = idx;
1734     vv = v;
1735     for (k = 0; k < cn; k++) {
1736       idx = jj;
1737       v   = vv;
1738       for (j = 0; j < n; j++) {
1739         xb = b + 5 * (*idx);
1740         x1 = xb[0 + k * bm];
1741         x2 = xb[1 + k * bm];
1742         x3 = xb[2 + k * bm];
1743         x4 = xb[3 + k * bm];
1744         x5 = xb[4 + k * cm];
1745         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1746         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1747         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1748         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1749         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1750         if (*idx != i) {
1751           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1752           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1753           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1754           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1755           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1756         }
1757         v += 25;
1758         ++idx;
1759       }
1760     }
1761     z += 5;
1762   }
1763   PetscFunctionReturn(0);
1764 }
1765 
1766 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1767 {
1768   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1769   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1770   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1771   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1772   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1773   PetscBLASInt     bbs, bcn, bbm, bcm;
1774   PetscScalar     *z = NULL;
1775   PetscScalar     *c, *b;
1776   const MatScalar *v;
1777   const PetscInt  *idx, *ii;
1778   PetscScalar      _DOne = 1.0;
1779 
1780   PetscFunctionBegin;
1781   if (!cm || !cn) PetscFunctionReturn(0);
1782   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1783   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1784   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1785   b = bd->v;
1786   PetscCall(MatZeroEntries(C));
1787   PetscCall(MatDenseGetArray(C, &c));
1788   switch (bs) {
1789   case 1:
1790     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1791     break;
1792   case 2:
1793     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1794     break;
1795   case 3:
1796     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1797     break;
1798   case 4:
1799     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1800     break;
1801   case 5:
1802     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1803     break;
1804   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1805     PetscCall(PetscBLASIntCast(bs, &bbs));
1806     PetscCall(PetscBLASIntCast(cn, &bcn));
1807     PetscCall(PetscBLASIntCast(bm, &bbm));
1808     PetscCall(PetscBLASIntCast(cm, &bcm));
1809     idx = a->j;
1810     v   = a->a;
1811     mbs = a->mbs;
1812     ii  = a->i;
1813     z   = c;
1814     for (i = 0; i < mbs; i++) {
1815       n = ii[1] - ii[0];
1816       ii++;
1817       for (j = 0; j < n; j++) {
1818         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1819         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1820         v += bs2;
1821       }
1822       z += bs;
1823     }
1824   }
1825   PetscCall(MatDenseRestoreArray(C, &c));
1826   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1827   PetscFunctionReturn(0);
1828 }
1829