xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision d9acb416d05abeed0a33bde3a81aeb2ea0364f6a)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <../src/mat/impls/sbaij/seq/sbaij.h>
4 #include <petsc/private/kernels/blockinvert.h>
5 #include <petscbt.h>
6 #include <petscblaslapack.h>
7 
8 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
9 {
10   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
11   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
12   const PetscInt *idx;
13   PetscBT         table_out, table_in;
14 
15   PetscFunctionBegin;
16   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
17   mbs = a->mbs;
18   ai  = a->i;
19   aj  = a->j;
20   bs  = A->rmap->bs;
21   PetscCall(PetscBTCreate(mbs, &table_out));
22   PetscCall(PetscMalloc1(mbs + 1, &nidx));
23   PetscCall(PetscBTCreate(mbs, &table_in));
24 
25   for (i = 0; i < is_max; i++) { /* for each is */
26     isz = 0;
27     PetscCall(PetscBTMemzero(mbs, table_out));
28 
29     /* Extract the indices, assume there can be duplicate entries */
30     PetscCall(ISGetIndices(is[i], &idx));
31     PetscCall(ISGetLocalSize(is[i], &n));
32 
33     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
34     bcol_max = 0;
35     for (j = 0; j < n; ++j) {
36       brow = idx[j] / bs; /* convert the indices into block indices */
37       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
38       if (!PetscBTLookupSet(table_out, brow)) {
39         nidx[isz++] = brow;
40         if (bcol_max < brow) bcol_max = brow;
41       }
42     }
43     PetscCall(ISRestoreIndices(is[i], &idx));
44     PetscCall(ISDestroy(&is[i]));
45 
46     k = 0;
47     for (j = 0; j < ov; j++) { /* for each overlap */
48       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
49       PetscCall(PetscBTMemzero(mbs, table_in));
50       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
51 
52       n = isz; /* length of the updated is[i] */
53       for (brow = 0; brow < mbs; brow++) {
54         start = ai[brow];
55         end   = ai[brow + 1];
56         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
57           for (l = start; l < end; l++) {
58             bcol = aj[l];
59             if (!PetscBTLookupSet(table_out, bcol)) {
60               nidx[isz++] = bcol;
61               if (bcol_max < bcol) bcol_max = bcol;
62             }
63           }
64           k++;
65           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
67           for (l = start; l < end; l++) {
68             bcol = aj[l];
69             if (bcol > bcol_max) break;
70             if (PetscBTLookup(table_in, bcol)) {
71               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
72               break; /* for l = start; l<end ; l++) */
73             }
74           }
75         }
76       }
77     } /* for each overlap */
78     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
79   } /* for each is */
80   PetscCall(PetscBTDestroy(&table_out));
81   PetscCall(PetscFree(nidx));
82   PetscCall(PetscBTDestroy(&table_in));
83   PetscFunctionReturn(PETSC_SUCCESS);
84 }
85 
86 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
87         Zero some ops' to avoid invalid use */
88 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
89 {
90   PetscFunctionBegin;
91   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
92   Bseq->ops->mult                   = NULL;
93   Bseq->ops->multadd                = NULL;
94   Bseq->ops->multtranspose          = NULL;
95   Bseq->ops->multtransposeadd       = NULL;
96   Bseq->ops->lufactor               = NULL;
97   Bseq->ops->choleskyfactor         = NULL;
98   Bseq->ops->lufactorsymbolic       = NULL;
99   Bseq->ops->choleskyfactorsymbolic = NULL;
100   Bseq->ops->getinertia             = NULL;
101   PetscFunctionReturn(PETSC_SUCCESS);
102 }
103 
104 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
105 static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
106 {
107   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
108   Mat_SeqBAIJ    *d = NULL;
109   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111   const PetscInt *irow, *icol;
112   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113   PetscInt       *aj = a->j, *ai = a->i;
114   MatScalar      *mat_a;
115   Mat             C;
116   PetscBool       flag;
117 
118   PetscFunctionBegin;
119 
120   PetscCall(ISGetIndices(isrow, &irow));
121   PetscCall(ISGetIndices(iscol, &icol));
122   PetscCall(ISGetLocalSize(isrow, &nrows));
123   PetscCall(ISGetLocalSize(iscol, &ncols));
124 
125   PetscCall(PetscCalloc1(1 + oldcols, &smap));
126   ssmap = smap;
127   PetscCall(PetscMalloc1(1 + nrows, &lens));
128   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129   /* determine lens of each row */
130   for (i = 0; i < nrows; i++) {
131     kstart  = ai[irow[i]];
132     kend    = kstart + a->ilen[irow[i]];
133     lens[i] = 0;
134     for (k = kstart; k < kend; k++) {
135       if (ssmap[aj[k]]) lens[i]++;
136     }
137   }
138   /* Create and fill new matrix */
139   if (scall == MAT_REUSE_MATRIX) {
140     if (sym) {
141       c = (Mat_SeqSBAIJ *)((*B)->data);
142 
143       PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
144       PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
145       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
146       PetscCall(PetscArrayzero(c->ilen, c->mbs));
147     } else {
148       d = (Mat_SeqBAIJ *)((*B)->data);
149 
150       PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
151       PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
152       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
153       PetscCall(PetscArrayzero(d->ilen, d->mbs));
154     }
155     C = *B;
156   } else {
157     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
158     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
159     if (sym) {
160       PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
161       PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
162     } else {
163       PetscCall(MatSetType(C, MATSEQBAIJ));
164       PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
165     }
166   }
167   if (sym) c = (Mat_SeqSBAIJ *)(C->data);
168   else d = (Mat_SeqBAIJ *)(C->data);
169   for (i = 0; i < nrows; i++) {
170     row    = irow[i];
171     kstart = ai[row];
172     kend   = kstart + a->ilen[row];
173     if (sym) {
174       mat_i    = c->i[i];
175       mat_j    = PetscSafePointerPlusOffset(c->j, mat_i);
176       mat_a    = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
177       mat_ilen = c->ilen + i;
178     } else {
179       mat_i    = d->i[i];
180       mat_j    = PetscSafePointerPlusOffset(d->j, mat_i);
181       mat_a    = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
182       mat_ilen = d->ilen + i;
183     }
184     for (k = kstart; k < kend; k++) {
185       if ((tcol = ssmap[a->j[k]])) {
186         *mat_j++ = tcol - 1;
187         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
188         mat_a += bs2;
189         (*mat_ilen)++;
190       }
191     }
192   }
193   /* sort */
194   {
195     MatScalar *work;
196 
197     PetscCall(PetscMalloc1(bs2, &work));
198     for (i = 0; i < nrows; i++) {
199       PetscInt ilen;
200       if (sym) {
201         mat_i = c->i[i];
202         mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
203         mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
204         ilen  = c->ilen[i];
205       } else {
206         mat_i = d->i[i];
207         mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
208         mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
209         ilen  = d->ilen[i];
210       }
211       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
212     }
213     PetscCall(PetscFree(work));
214   }
215 
216   /* Free work space */
217   PetscCall(ISRestoreIndices(iscol, &icol));
218   PetscCall(PetscFree(smap));
219   PetscCall(PetscFree(lens));
220   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
221   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
222 
223   PetscCall(ISRestoreIndices(isrow, &irow));
224   *B = C;
225   PetscFunctionReturn(PETSC_SUCCESS);
226 }
227 
228 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
229 {
230   Mat       C[2];
231   IS        is1, is2, intersect = NULL;
232   PetscInt  n1, n2, ni;
233   PetscBool sym = PETSC_TRUE;
234 
235   PetscFunctionBegin;
236   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
237   if (isrow == iscol) {
238     is2 = is1;
239     PetscCall(PetscObjectReference((PetscObject)is2));
240   } else {
241     PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
242     PetscCall(ISIntersect(is1, is2, &intersect));
243     PetscCall(ISGetLocalSize(intersect, &ni));
244     PetscCall(ISDestroy(&intersect));
245     if (ni == 0) sym = PETSC_FALSE;
246     else if (PetscDefined(USE_DEBUG)) {
247       PetscCall(ISGetLocalSize(is1, &n1));
248       PetscCall(ISGetLocalSize(is2, &n2));
249       PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
250     }
251   }
252   if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
253   else {
254     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
255     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
256     PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
257     PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
258     PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
259     if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
260     else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
261     else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
262     PetscCall(MatDestroy(C));
263     PetscCall(MatDestroy(C + 1));
264   }
265   PetscCall(ISDestroy(&is1));
266   PetscCall(ISDestroy(&is2));
267 
268   if (sym && isrow != iscol) {
269     PetscBool isequal;
270     PetscCall(ISEqual(isrow, iscol, &isequal));
271     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
272   }
273   PetscFunctionReturn(PETSC_SUCCESS);
274 }
275 
276 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
277 {
278   PetscInt i;
279 
280   PetscFunctionBegin;
281   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
282 
283   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
284   PetscFunctionReturn(PETSC_SUCCESS);
285 }
286 
287 /* Should check that shapes of vectors and matrices match */
288 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
289 {
290   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
291   PetscScalar       *z, x1, x2, zero = 0.0;
292   const PetscScalar *x, *xb;
293   const MatScalar   *v;
294   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
295   const PetscInt    *aj = a->j, *ai = a->i, *ib;
296   PetscInt           nonzerorow = 0;
297 
298   PetscFunctionBegin;
299   PetscCall(VecSet(zz, zero));
300   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
301   PetscCall(VecGetArrayRead(xx, &x));
302   PetscCall(VecGetArray(zz, &z));
303 
304   v  = a->a;
305   xb = x;
306 
307   for (i = 0; i < mbs; i++) {
308     n    = ai[1] - ai[0]; /* length of i_th block row of A */
309     x1   = xb[0];
310     x2   = xb[1];
311     ib   = aj + *ai;
312     jmin = 0;
313     nonzerorow += (n > 0);
314     if (*ib == i) { /* (diag of A)*x */
315       z[2 * i] += v[0] * x1 + v[2] * x2;
316       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
317       v += 4;
318       jmin++;
319     }
320     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
321     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
322     for (j = jmin; j < n; j++) {
323       /* (strict lower triangular part of A)*x  */
324       cval = ib[j] * 2;
325       z[cval] += v[0] * x1 + v[1] * x2;
326       z[cval + 1] += v[2] * x1 + v[3] * x2;
327       /* (strict upper triangular part of A)*x  */
328       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
329       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
330       v += 4;
331     }
332     xb += 2;
333     ai++;
334   }
335 
336   PetscCall(VecRestoreArrayRead(xx, &x));
337   PetscCall(VecRestoreArray(zz, &z));
338   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
339   PetscFunctionReturn(PETSC_SUCCESS);
340 }
341 
342 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
343 {
344   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
345   PetscScalar       *z, x1, x2, x3, zero = 0.0;
346   const PetscScalar *x, *xb;
347   const MatScalar   *v;
348   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
349   const PetscInt    *aj = a->j, *ai = a->i, *ib;
350   PetscInt           nonzerorow = 0;
351 
352   PetscFunctionBegin;
353   PetscCall(VecSet(zz, zero));
354   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
355   PetscCall(VecGetArrayRead(xx, &x));
356   PetscCall(VecGetArray(zz, &z));
357 
358   v  = a->a;
359   xb = x;
360 
361   for (i = 0; i < mbs; i++) {
362     n    = ai[1] - ai[0]; /* length of i_th block row of A */
363     x1   = xb[0];
364     x2   = xb[1];
365     x3   = xb[2];
366     ib   = aj + *ai;
367     jmin = 0;
368     nonzerorow += (n > 0);
369     if (*ib == i) { /* (diag of A)*x */
370       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
371       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
372       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
373       v += 9;
374       jmin++;
375     }
376     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
377     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
378     for (j = jmin; j < n; j++) {
379       /* (strict lower triangular part of A)*x  */
380       cval = ib[j] * 3;
381       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
382       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
383       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
384       /* (strict upper triangular part of A)*x  */
385       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
386       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
387       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
388       v += 9;
389     }
390     xb += 3;
391     ai++;
392   }
393 
394   PetscCall(VecRestoreArrayRead(xx, &x));
395   PetscCall(VecRestoreArray(zz, &z));
396   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
397   PetscFunctionReturn(PETSC_SUCCESS);
398 }
399 
400 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
401 {
402   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
403   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
404   const PetscScalar *x, *xb;
405   const MatScalar   *v;
406   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
407   const PetscInt    *aj = a->j, *ai = a->i, *ib;
408   PetscInt           nonzerorow = 0;
409 
410   PetscFunctionBegin;
411   PetscCall(VecSet(zz, zero));
412   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
413   PetscCall(VecGetArrayRead(xx, &x));
414   PetscCall(VecGetArray(zz, &z));
415 
416   v  = a->a;
417   xb = x;
418 
419   for (i = 0; i < mbs; i++) {
420     n    = ai[1] - ai[0]; /* length of i_th block row of A */
421     x1   = xb[0];
422     x2   = xb[1];
423     x3   = xb[2];
424     x4   = xb[3];
425     ib   = aj + *ai;
426     jmin = 0;
427     nonzerorow += (n > 0);
428     if (*ib == i) { /* (diag of A)*x */
429       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
430       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
431       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
432       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
433       v += 16;
434       jmin++;
435     }
436     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
437     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
438     for (j = jmin; j < n; j++) {
439       /* (strict lower triangular part of A)*x  */
440       cval = ib[j] * 4;
441       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
442       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
443       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
444       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
445       /* (strict upper triangular part of A)*x  */
446       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
447       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
448       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
449       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
450       v += 16;
451     }
452     xb += 4;
453     ai++;
454   }
455 
456   PetscCall(VecRestoreArrayRead(xx, &x));
457   PetscCall(VecRestoreArray(zz, &z));
458   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
459   PetscFunctionReturn(PETSC_SUCCESS);
460 }
461 
462 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
463 {
464   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
465   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
466   const PetscScalar *x, *xb;
467   const MatScalar   *v;
468   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
469   const PetscInt    *aj = a->j, *ai = a->i, *ib;
470   PetscInt           nonzerorow = 0;
471 
472   PetscFunctionBegin;
473   PetscCall(VecSet(zz, zero));
474   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
475   PetscCall(VecGetArrayRead(xx, &x));
476   PetscCall(VecGetArray(zz, &z));
477 
478   v  = a->a;
479   xb = x;
480 
481   for (i = 0; i < mbs; i++) {
482     n    = ai[1] - ai[0]; /* length of i_th block row of A */
483     x1   = xb[0];
484     x2   = xb[1];
485     x3   = xb[2];
486     x4   = xb[3];
487     x5   = xb[4];
488     ib   = aj + *ai;
489     jmin = 0;
490     nonzerorow += (n > 0);
491     if (*ib == i) { /* (diag of A)*x */
492       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
493       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
494       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
495       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
496       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
497       v += 25;
498       jmin++;
499     }
500     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
501     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
502     for (j = jmin; j < n; j++) {
503       /* (strict lower triangular part of A)*x  */
504       cval = ib[j] * 5;
505       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
506       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
507       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
508       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
509       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
510       /* (strict upper triangular part of A)*x  */
511       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
512       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
513       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
514       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
515       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
516       v += 25;
517     }
518     xb += 5;
519     ai++;
520   }
521 
522   PetscCall(VecRestoreArrayRead(xx, &x));
523   PetscCall(VecRestoreArray(zz, &z));
524   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
525   PetscFunctionReturn(PETSC_SUCCESS);
526 }
527 
528 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
529 {
530   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
531   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
532   const PetscScalar *x, *xb;
533   const MatScalar   *v;
534   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
535   const PetscInt    *aj = a->j, *ai = a->i, *ib;
536   PetscInt           nonzerorow = 0;
537 
538   PetscFunctionBegin;
539   PetscCall(VecSet(zz, zero));
540   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
541   PetscCall(VecGetArrayRead(xx, &x));
542   PetscCall(VecGetArray(zz, &z));
543 
544   v  = a->a;
545   xb = x;
546 
547   for (i = 0; i < mbs; i++) {
548     n    = ai[1] - ai[0]; /* length of i_th block row of A */
549     x1   = xb[0];
550     x2   = xb[1];
551     x3   = xb[2];
552     x4   = xb[3];
553     x5   = xb[4];
554     x6   = xb[5];
555     ib   = aj + *ai;
556     jmin = 0;
557     nonzerorow += (n > 0);
558     if (*ib == i) { /* (diag of A)*x */
559       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
560       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
561       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
562       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
563       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
564       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
565       v += 36;
566       jmin++;
567     }
568     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
569     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
570     for (j = jmin; j < n; j++) {
571       /* (strict lower triangular part of A)*x  */
572       cval = ib[j] * 6;
573       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
574       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
575       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
576       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
577       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
578       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
579       /* (strict upper triangular part of A)*x  */
580       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
581       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
582       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
583       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
584       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
585       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
586       v += 36;
587     }
588     xb += 6;
589     ai++;
590   }
591 
592   PetscCall(VecRestoreArrayRead(xx, &x));
593   PetscCall(VecRestoreArray(zz, &z));
594   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
595   PetscFunctionReturn(PETSC_SUCCESS);
596 }
597 
598 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
599 {
600   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
601   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
602   const PetscScalar *x, *xb;
603   const MatScalar   *v;
604   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
605   const PetscInt    *aj = a->j, *ai = a->i, *ib;
606   PetscInt           nonzerorow = 0;
607 
608   PetscFunctionBegin;
609   PetscCall(VecSet(zz, zero));
610   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
611   PetscCall(VecGetArrayRead(xx, &x));
612   PetscCall(VecGetArray(zz, &z));
613 
614   v  = a->a;
615   xb = x;
616 
617   for (i = 0; i < mbs; i++) {
618     n    = ai[1] - ai[0]; /* length of i_th block row of A */
619     x1   = xb[0];
620     x2   = xb[1];
621     x3   = xb[2];
622     x4   = xb[3];
623     x5   = xb[4];
624     x6   = xb[5];
625     x7   = xb[6];
626     ib   = aj + *ai;
627     jmin = 0;
628     nonzerorow += (n > 0);
629     if (*ib == i) { /* (diag of A)*x */
630       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
631       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
632       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
633       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
634       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
635       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
636       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
637       v += 49;
638       jmin++;
639     }
640     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
641     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
642     for (j = jmin; j < n; j++) {
643       /* (strict lower triangular part of A)*x  */
644       cval = ib[j] * 7;
645       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
646       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
647       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
648       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
649       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
650       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
651       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
652       /* (strict upper triangular part of A)*x  */
653       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
654       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
655       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
656       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
657       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
658       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
659       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
660       v += 49;
661     }
662     xb += 7;
663     ai++;
664   }
665   PetscCall(VecRestoreArrayRead(xx, &x));
666   PetscCall(VecRestoreArray(zz, &z));
667   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
668   PetscFunctionReturn(PETSC_SUCCESS);
669 }
670 
671 /*
672     This will not work with MatScalar == float because it calls the BLAS
673 */
674 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
675 {
676   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
677   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
678   const PetscScalar *x, *x_ptr, *xb;
679   const MatScalar   *v;
680   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
681   const PetscInt    *idx, *aj, *ii;
682   PetscInt           nonzerorow = 0;
683 
684   PetscFunctionBegin;
685   PetscCall(VecSet(zz, zero));
686   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
687   PetscCall(VecGetArrayRead(xx, &x));
688   PetscCall(VecGetArray(zz, &z));
689 
690   x_ptr = x;
691   z_ptr = z;
692 
693   aj = a->j;
694   v  = a->a;
695   ii = a->i;
696 
697   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
698   work = a->mult_work;
699 
700   for (i = 0; i < mbs; i++) {
701     n     = ii[1] - ii[0];
702     ncols = n * bs;
703     workt = work;
704     idx   = aj + ii[0];
705     nonzerorow += (n > 0);
706 
707     /* upper triangular part */
708     for (j = 0; j < n; j++) {
709       xb = x_ptr + bs * (*idx++);
710       for (k = 0; k < bs; k++) workt[k] = xb[k];
711       workt += bs;
712     }
713     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
714     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
715 
716     /* strict lower triangular part */
717     idx = aj + ii[0];
718     if (n && *idx == i) {
719       ncols -= bs;
720       v += bs2;
721       idx++;
722       n--;
723     }
724 
725     if (ncols > 0) {
726       workt = work;
727       PetscCall(PetscArrayzero(workt, ncols));
728       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
729       for (j = 0; j < n; j++) {
730         zb = z_ptr + bs * (*idx++);
731         for (k = 0; k < bs; k++) zb[k] += workt[k];
732         workt += bs;
733       }
734     }
735     x += bs;
736     v += n * bs2;
737     z += bs;
738     ii++;
739   }
740 
741   PetscCall(VecRestoreArrayRead(xx, &x));
742   PetscCall(VecRestoreArray(zz, &z));
743   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
744   PetscFunctionReturn(PETSC_SUCCESS);
745 }
746 
747 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
748 {
749   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
750   PetscScalar       *z, x1;
751   const PetscScalar *x, *xb;
752   const MatScalar   *v;
753   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
754   const PetscInt    *aj = a->j, *ai = a->i, *ib;
755   PetscInt           nonzerorow = 0;
756 #if defined(PETSC_USE_COMPLEX)
757   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
758 #else
759   const int aconj = 0;
760 #endif
761 
762   PetscFunctionBegin;
763   PetscCall(VecCopy(yy, zz));
764   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
765   PetscCall(VecGetArrayRead(xx, &x));
766   PetscCall(VecGetArray(zz, &z));
767   v  = a->a;
768   xb = x;
769 
770   for (i = 0; i < mbs; i++) {
771     n    = ai[1] - ai[0]; /* length of i_th row of A */
772     x1   = xb[0];
773     ib   = aj + *ai;
774     jmin = 0;
775     nonzerorow += (n > 0);
776     if (n && *ib == i) { /* (diag of A)*x */
777       z[i] += *v++ * x[*ib++];
778       jmin++;
779     }
780     if (aconj) {
781       for (j = jmin; j < n; j++) {
782         cval = *ib;
783         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
784         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
785       }
786     } else {
787       for (j = jmin; j < n; j++) {
788         cval = *ib;
789         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
790         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
791       }
792     }
793     xb++;
794     ai++;
795   }
796 
797   PetscCall(VecRestoreArrayRead(xx, &x));
798   PetscCall(VecRestoreArray(zz, &z));
799 
800   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
801   PetscFunctionReturn(PETSC_SUCCESS);
802 }
803 
804 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
805 {
806   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
807   PetscScalar       *z, x1, x2;
808   const PetscScalar *x, *xb;
809   const MatScalar   *v;
810   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
811   const PetscInt    *aj = a->j, *ai = a->i, *ib;
812   PetscInt           nonzerorow = 0;
813 
814   PetscFunctionBegin;
815   PetscCall(VecCopy(yy, zz));
816   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
817   PetscCall(VecGetArrayRead(xx, &x));
818   PetscCall(VecGetArray(zz, &z));
819 
820   v  = a->a;
821   xb = x;
822 
823   for (i = 0; i < mbs; i++) {
824     n    = ai[1] - ai[0]; /* length of i_th block row of A */
825     x1   = xb[0];
826     x2   = xb[1];
827     ib   = aj + *ai;
828     jmin = 0;
829     nonzerorow += (n > 0);
830     if (n && *ib == i) { /* (diag of A)*x */
831       z[2 * i] += v[0] * x1 + v[2] * x2;
832       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
833       v += 4;
834       jmin++;
835     }
836     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
837     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
838     for (j = jmin; j < n; j++) {
839       /* (strict lower triangular part of A)*x  */
840       cval = ib[j] * 2;
841       z[cval] += v[0] * x1 + v[1] * x2;
842       z[cval + 1] += v[2] * x1 + v[3] * x2;
843       /* (strict upper triangular part of A)*x  */
844       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
845       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
846       v += 4;
847     }
848     xb += 2;
849     ai++;
850   }
851   PetscCall(VecRestoreArrayRead(xx, &x));
852   PetscCall(VecRestoreArray(zz, &z));
853 
854   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
855   PetscFunctionReturn(PETSC_SUCCESS);
856 }
857 
858 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
859 {
860   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
861   PetscScalar       *z, x1, x2, x3;
862   const PetscScalar *x, *xb;
863   const MatScalar   *v;
864   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
865   const PetscInt    *aj = a->j, *ai = a->i, *ib;
866   PetscInt           nonzerorow = 0;
867 
868   PetscFunctionBegin;
869   PetscCall(VecCopy(yy, zz));
870   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
871   PetscCall(VecGetArrayRead(xx, &x));
872   PetscCall(VecGetArray(zz, &z));
873 
874   v  = a->a;
875   xb = x;
876 
877   for (i = 0; i < mbs; i++) {
878     n    = ai[1] - ai[0]; /* length of i_th block row of A */
879     x1   = xb[0];
880     x2   = xb[1];
881     x3   = xb[2];
882     ib   = aj + *ai;
883     jmin = 0;
884     nonzerorow += (n > 0);
885     if (n && *ib == i) { /* (diag of A)*x */
886       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
887       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
888       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
889       v += 9;
890       jmin++;
891     }
892     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
893     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
894     for (j = jmin; j < n; j++) {
895       /* (strict lower triangular part of A)*x  */
896       cval = ib[j] * 3;
897       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
898       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
899       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
900       /* (strict upper triangular part of A)*x  */
901       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
902       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
903       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
904       v += 9;
905     }
906     xb += 3;
907     ai++;
908   }
909 
910   PetscCall(VecRestoreArrayRead(xx, &x));
911   PetscCall(VecRestoreArray(zz, &z));
912 
913   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
914   PetscFunctionReturn(PETSC_SUCCESS);
915 }
916 
917 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
918 {
919   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
920   PetscScalar       *z, x1, x2, x3, x4;
921   const PetscScalar *x, *xb;
922   const MatScalar   *v;
923   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
924   const PetscInt    *aj = a->j, *ai = a->i, *ib;
925   PetscInt           nonzerorow = 0;
926 
927   PetscFunctionBegin;
928   PetscCall(VecCopy(yy, zz));
929   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
930   PetscCall(VecGetArrayRead(xx, &x));
931   PetscCall(VecGetArray(zz, &z));
932 
933   v  = a->a;
934   xb = x;
935 
936   for (i = 0; i < mbs; i++) {
937     n    = ai[1] - ai[0]; /* length of i_th block row of A */
938     x1   = xb[0];
939     x2   = xb[1];
940     x3   = xb[2];
941     x4   = xb[3];
942     ib   = aj + *ai;
943     jmin = 0;
944     nonzerorow += (n > 0);
945     if (n && *ib == i) { /* (diag of A)*x */
946       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
947       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
948       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
949       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
950       v += 16;
951       jmin++;
952     }
953     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
954     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
955     for (j = jmin; j < n; j++) {
956       /* (strict lower triangular part of A)*x  */
957       cval = ib[j] * 4;
958       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
959       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
960       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
961       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
962       /* (strict upper triangular part of A)*x  */
963       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
964       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
965       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
966       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
967       v += 16;
968     }
969     xb += 4;
970     ai++;
971   }
972 
973   PetscCall(VecRestoreArrayRead(xx, &x));
974   PetscCall(VecRestoreArray(zz, &z));
975 
976   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
977   PetscFunctionReturn(PETSC_SUCCESS);
978 }
979 
980 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
981 {
982   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
983   PetscScalar       *z, x1, x2, x3, x4, x5;
984   const PetscScalar *x, *xb;
985   const MatScalar   *v;
986   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
987   const PetscInt    *aj = a->j, *ai = a->i, *ib;
988   PetscInt           nonzerorow = 0;
989 
990   PetscFunctionBegin;
991   PetscCall(VecCopy(yy, zz));
992   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
993   PetscCall(VecGetArrayRead(xx, &x));
994   PetscCall(VecGetArray(zz, &z));
995 
996   v  = a->a;
997   xb = x;
998 
999   for (i = 0; i < mbs; i++) {
1000     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1001     x1   = xb[0];
1002     x2   = xb[1];
1003     x3   = xb[2];
1004     x4   = xb[3];
1005     x5   = xb[4];
1006     ib   = aj + *ai;
1007     jmin = 0;
1008     nonzerorow += (n > 0);
1009     if (n && *ib == i) { /* (diag of A)*x */
1010       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1011       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1012       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1013       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
1014       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1015       v += 25;
1016       jmin++;
1017     }
1018     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1019     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1020     for (j = jmin; j < n; j++) {
1021       /* (strict lower triangular part of A)*x  */
1022       cval = ib[j] * 5;
1023       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1024       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1025       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1026       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1027       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1028       /* (strict upper triangular part of A)*x  */
1029       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1030       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1031       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1032       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1033       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1034       v += 25;
1035     }
1036     xb += 5;
1037     ai++;
1038   }
1039 
1040   PetscCall(VecRestoreArrayRead(xx, &x));
1041   PetscCall(VecRestoreArray(zz, &z));
1042 
1043   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1044   PetscFunctionReturn(PETSC_SUCCESS);
1045 }
1046 
1047 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1048 {
1049   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1050   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1051   const PetscScalar *x, *xb;
1052   const MatScalar   *v;
1053   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1054   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1055   PetscInt           nonzerorow = 0;
1056 
1057   PetscFunctionBegin;
1058   PetscCall(VecCopy(yy, zz));
1059   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1060   PetscCall(VecGetArrayRead(xx, &x));
1061   PetscCall(VecGetArray(zz, &z));
1062 
1063   v  = a->a;
1064   xb = x;
1065 
1066   for (i = 0; i < mbs; i++) {
1067     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1068     x1   = xb[0];
1069     x2   = xb[1];
1070     x3   = xb[2];
1071     x4   = xb[3];
1072     x5   = xb[4];
1073     x6   = xb[5];
1074     ib   = aj + *ai;
1075     jmin = 0;
1076     nonzerorow += (n > 0);
1077     if (n && *ib == i) { /* (diag of A)*x */
1078       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1079       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1080       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1081       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1082       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1083       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1084       v += 36;
1085       jmin++;
1086     }
1087     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1088     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1089     for (j = jmin; j < n; j++) {
1090       /* (strict lower triangular part of A)*x  */
1091       cval = ib[j] * 6;
1092       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1093       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1094       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1095       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1096       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1097       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1098       /* (strict upper triangular part of A)*x  */
1099       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1100       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1101       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1102       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1103       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1104       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1105       v += 36;
1106     }
1107     xb += 6;
1108     ai++;
1109   }
1110 
1111   PetscCall(VecRestoreArrayRead(xx, &x));
1112   PetscCall(VecRestoreArray(zz, &z));
1113 
1114   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1115   PetscFunctionReturn(PETSC_SUCCESS);
1116 }
1117 
1118 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1119 {
1120   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1121   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1122   const PetscScalar *x, *xb;
1123   const MatScalar   *v;
1124   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1125   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1126   PetscInt           nonzerorow = 0;
1127 
1128   PetscFunctionBegin;
1129   PetscCall(VecCopy(yy, zz));
1130   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1131   PetscCall(VecGetArrayRead(xx, &x));
1132   PetscCall(VecGetArray(zz, &z));
1133 
1134   v  = a->a;
1135   xb = x;
1136 
1137   for (i = 0; i < mbs; i++) {
1138     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1139     x1   = xb[0];
1140     x2   = xb[1];
1141     x3   = xb[2];
1142     x4   = xb[3];
1143     x5   = xb[4];
1144     x6   = xb[5];
1145     x7   = xb[6];
1146     ib   = aj + *ai;
1147     jmin = 0;
1148     nonzerorow += (n > 0);
1149     if (n && *ib == i) { /* (diag of A)*x */
1150       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1151       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1152       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1153       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1154       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1155       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1156       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1157       v += 49;
1158       jmin++;
1159     }
1160     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1161     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1162     for (j = jmin; j < n; j++) {
1163       /* (strict lower triangular part of A)*x  */
1164       cval = ib[j] * 7;
1165       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1166       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1167       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1168       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1169       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1170       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1171       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1172       /* (strict upper triangular part of A)*x  */
1173       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1174       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1175       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1176       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1177       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1178       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1179       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1180       v += 49;
1181     }
1182     xb += 7;
1183     ai++;
1184   }
1185 
1186   PetscCall(VecRestoreArrayRead(xx, &x));
1187   PetscCall(VecRestoreArray(zz, &z));
1188 
1189   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1190   PetscFunctionReturn(PETSC_SUCCESS);
1191 }
1192 
1193 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1194 {
1195   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1196   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1197   const PetscScalar *x, *x_ptr, *xb;
1198   const MatScalar   *v;
1199   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1200   const PetscInt    *idx, *aj, *ii;
1201   PetscInt           nonzerorow = 0;
1202 
1203   PetscFunctionBegin;
1204   PetscCall(VecCopy(yy, zz));
1205   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1206   PetscCall(VecGetArrayRead(xx, &x));
1207   x_ptr = x;
1208   PetscCall(VecGetArray(zz, &z));
1209   z_ptr = z;
1210 
1211   aj = a->j;
1212   v  = a->a;
1213   ii = a->i;
1214 
1215   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1216   work = a->mult_work;
1217 
1218   for (i = 0; i < mbs; i++) {
1219     n     = ii[1] - ii[0];
1220     ncols = n * bs;
1221     workt = work;
1222     idx   = aj + ii[0];
1223     nonzerorow += (n > 0);
1224 
1225     /* upper triangular part */
1226     for (j = 0; j < n; j++) {
1227       xb = x_ptr + bs * (*idx++);
1228       for (k = 0; k < bs; k++) workt[k] = xb[k];
1229       workt += bs;
1230     }
1231     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1232     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1233 
1234     /* strict lower triangular part */
1235     idx = aj + ii[0];
1236     if (n && *idx == i) {
1237       ncols -= bs;
1238       v += bs2;
1239       idx++;
1240       n--;
1241     }
1242     if (ncols > 0) {
1243       workt = work;
1244       PetscCall(PetscArrayzero(workt, ncols));
1245       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1246       for (j = 0; j < n; j++) {
1247         zb = z_ptr + bs * (*idx++);
1248         for (k = 0; k < bs; k++) zb[k] += workt[k];
1249         workt += bs;
1250       }
1251     }
1252 
1253     x += bs;
1254     v += n * bs2;
1255     z += bs;
1256     ii++;
1257   }
1258 
1259   PetscCall(VecRestoreArrayRead(xx, &x));
1260   PetscCall(VecRestoreArray(zz, &z));
1261 
1262   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1263   PetscFunctionReturn(PETSC_SUCCESS);
1264 }
1265 
1266 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1267 {
1268   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1269   PetscScalar   oalpha = alpha;
1270   PetscBLASInt  one    = 1, totalnz;
1271 
1272   PetscFunctionBegin;
1273   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1274   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1275   PetscCall(PetscLogFlops(totalnz));
1276   PetscFunctionReturn(PETSC_SUCCESS);
1277 }
1278 
1279 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1280 {
1281   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1282   const MatScalar *v        = a->a;
1283   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1284   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1285   const PetscInt  *aj = a->j, *col;
1286 
1287   PetscFunctionBegin;
1288   if (!a->nz) {
1289     *norm = 0.0;
1290     PetscFunctionReturn(PETSC_SUCCESS);
1291   }
1292   if (type == NORM_FROBENIUS) {
1293     for (k = 0; k < mbs; k++) {
1294       jmin = a->i[k];
1295       jmax = a->i[k + 1];
1296       col  = aj + jmin;
1297       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1298         for (i = 0; i < bs2; i++) {
1299           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1300           v++;
1301         }
1302         jmin++;
1303       }
1304       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1305         for (i = 0; i < bs2; i++) {
1306           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1307           v++;
1308         }
1309       }
1310     }
1311     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1312     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1313   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1314     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1315     for (i = 0; i < mbs; i++) jl[i] = mbs;
1316     il[0] = 0;
1317 
1318     *norm = 0.0;
1319     for (k = 0; k < mbs; k++) { /* k_th block row */
1320       for (j = 0; j < bs; j++) sum[j] = 0.0;
1321       /*-- col sum --*/
1322       i = jl[k]; /* first |A(i,k)| to be added */
1323       /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1324                   at step k */
1325       while (i < mbs) {
1326         nexti = jl[i]; /* next block row to be added */
1327         ik    = il[i]; /* block index of A(i,k) in the array a */
1328         for (j = 0; j < bs; j++) {
1329           v = a->a + ik * bs2 + j * bs;
1330           for (k1 = 0; k1 < bs; k1++) {
1331             sum[j] += PetscAbsScalar(*v);
1332             v++;
1333           }
1334         }
1335         /* update il, jl */
1336         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1337         jmax = a->i[i + 1];
1338         if (jmin < jmax) {
1339           il[i] = jmin;
1340           j     = a->j[jmin];
1341           jl[i] = jl[j];
1342           jl[j] = i;
1343         }
1344         i = nexti;
1345       }
1346       /*-- row sum --*/
1347       jmin = a->i[k];
1348       jmax = a->i[k + 1];
1349       for (i = jmin; i < jmax; i++) {
1350         for (j = 0; j < bs; j++) {
1351           v = a->a + i * bs2 + j;
1352           for (k1 = 0; k1 < bs; k1++) {
1353             sum[j] += PetscAbsScalar(*v);
1354             v += bs;
1355           }
1356         }
1357       }
1358       /* add k_th block row to il, jl */
1359       col = aj + jmin;
1360       if (jmax - jmin > 0 && *col == k) jmin++;
1361       if (jmin < jmax) {
1362         il[k] = jmin;
1363         j     = a->j[jmin];
1364         jl[k] = jl[j];
1365         jl[j] = k;
1366       }
1367       for (j = 0; j < bs; j++) {
1368         if (sum[j] > *norm) *norm = sum[j];
1369       }
1370     }
1371     PetscCall(PetscFree3(sum, il, jl));
1372     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1373   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1374   PetscFunctionReturn(PETSC_SUCCESS);
1375 }
1376 
1377 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1378 {
1379   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1380 
1381   PetscFunctionBegin;
1382   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1383   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1384     *flg = PETSC_FALSE;
1385     PetscFunctionReturn(PETSC_SUCCESS);
1386   }
1387 
1388   /* if the a->i are the same */
1389   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1390   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1391 
1392   /* if a->j are the same */
1393   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1394   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1395 
1396   /* if a->a are the same */
1397   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1398   PetscFunctionReturn(PETSC_SUCCESS);
1399 }
1400 
1401 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1402 {
1403   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1404   PetscInt         i, j, k, row, bs, ambs, bs2;
1405   const PetscInt  *ai, *aj;
1406   PetscScalar     *x, zero = 0.0;
1407   const MatScalar *aa, *aa_j;
1408 
1409   PetscFunctionBegin;
1410   bs = A->rmap->bs;
1411   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1412 
1413   aa   = a->a;
1414   ambs = a->mbs;
1415 
1416   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1417     PetscInt *diag = a->diag;
1418     aa             = a->a;
1419     ambs           = a->mbs;
1420     PetscCall(VecGetArray(v, &x));
1421     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1422     PetscCall(VecRestoreArray(v, &x));
1423     PetscFunctionReturn(PETSC_SUCCESS);
1424   }
1425 
1426   ai  = a->i;
1427   aj  = a->j;
1428   bs2 = a->bs2;
1429   PetscCall(VecSet(v, zero));
1430   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1431   PetscCall(VecGetArray(v, &x));
1432   for (i = 0; i < ambs; i++) {
1433     j = ai[i];
1434     if (aj[j] == i) { /* if this is a diagonal element */
1435       row  = i * bs;
1436       aa_j = aa + j * bs2;
1437       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1438     }
1439   }
1440   PetscCall(VecRestoreArray(v, &x));
1441   PetscFunctionReturn(PETSC_SUCCESS);
1442 }
1443 
1444 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1445 {
1446   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1447   PetscScalar        x;
1448   const PetscScalar *l, *li, *ri;
1449   MatScalar         *aa, *v;
1450   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1451   const PetscInt    *ai, *aj;
1452   PetscBool          flg;
1453 
1454   PetscFunctionBegin;
1455   if (ll != rr) {
1456     PetscCall(VecEqual(ll, rr, &flg));
1457     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1458   }
1459   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1460   ai  = a->i;
1461   aj  = a->j;
1462   aa  = a->a;
1463   m   = A->rmap->N;
1464   bs  = A->rmap->bs;
1465   mbs = a->mbs;
1466   bs2 = a->bs2;
1467 
1468   PetscCall(VecGetArrayRead(ll, &l));
1469   PetscCall(VecGetLocalSize(ll, &lm));
1470   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1471   for (i = 0; i < mbs; i++) { /* for each block row */
1472     M  = ai[i + 1] - ai[i];
1473     li = l + i * bs;
1474     v  = aa + bs2 * ai[i];
1475     for (j = 0; j < M; j++) { /* for each block */
1476       ri = l + bs * aj[ai[i] + j];
1477       for (k = 0; k < bs; k++) {
1478         x = ri[k];
1479         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1480       }
1481     }
1482   }
1483   PetscCall(VecRestoreArrayRead(ll, &l));
1484   PetscCall(PetscLogFlops(2.0 * a->nz));
1485   PetscFunctionReturn(PETSC_SUCCESS);
1486 }
1487 
1488 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1489 {
1490   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1491 
1492   PetscFunctionBegin;
1493   info->block_size   = a->bs2;
1494   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1495   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1496   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1497   info->assemblies   = A->num_ass;
1498   info->mallocs      = A->info.mallocs;
1499   info->memory       = 0; /* REVIEW ME */
1500   if (A->factortype) {
1501     info->fill_ratio_given  = A->info.fill_ratio_given;
1502     info->fill_ratio_needed = A->info.fill_ratio_needed;
1503     info->factor_mallocs    = A->info.factor_mallocs;
1504   } else {
1505     info->fill_ratio_given  = 0;
1506     info->fill_ratio_needed = 0;
1507     info->factor_mallocs    = 0;
1508   }
1509   PetscFunctionReturn(PETSC_SUCCESS);
1510 }
1511 
1512 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1513 {
1514   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1515 
1516   PetscFunctionBegin;
1517   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1518   PetscFunctionReturn(PETSC_SUCCESS);
1519 }
1520 
1521 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1522 {
1523   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1524   PetscInt         i, j, n, row, col, bs, mbs;
1525   const PetscInt  *ai, *aj;
1526   PetscReal        atmp;
1527   const MatScalar *aa;
1528   PetscScalar     *x;
1529   PetscInt         ncols, brow, bcol, krow, kcol;
1530 
1531   PetscFunctionBegin;
1532   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1533   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1534   bs  = A->rmap->bs;
1535   aa  = a->a;
1536   ai  = a->i;
1537   aj  = a->j;
1538   mbs = a->mbs;
1539 
1540   PetscCall(VecSet(v, 0.0));
1541   PetscCall(VecGetArray(v, &x));
1542   PetscCall(VecGetLocalSize(v, &n));
1543   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1544   for (i = 0; i < mbs; i++) {
1545     ncols = ai[1] - ai[0];
1546     ai++;
1547     brow = bs * i;
1548     for (j = 0; j < ncols; j++) {
1549       bcol = bs * (*aj);
1550       for (kcol = 0; kcol < bs; kcol++) {
1551         col = bcol + kcol; /* col index */
1552         for (krow = 0; krow < bs; krow++) {
1553           atmp = PetscAbsScalar(*aa);
1554           aa++;
1555           row = brow + krow; /* row index */
1556           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1557           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1558         }
1559       }
1560       aj++;
1561     }
1562   }
1563   PetscCall(VecRestoreArray(v, &x));
1564   PetscFunctionReturn(PETSC_SUCCESS);
1565 }
1566 
1567 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1568 {
1569   PetscFunctionBegin;
1570   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1571   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1572   PetscFunctionReturn(PETSC_SUCCESS);
1573 }
1574 
1575 static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1576 {
1577   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1578   PetscScalar       *z = c;
1579   const PetscScalar *xb;
1580   PetscScalar        x1;
1581   const MatScalar   *v   = a->a, *vv;
1582   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1583 #if defined(PETSC_USE_COMPLEX)
1584   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1585 #else
1586   const int aconj = 0;
1587 #endif
1588 
1589   PetscFunctionBegin;
1590   for (i = 0; i < mbs; i++) {
1591     n = ii[1] - ii[0];
1592     ii++;
1593     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1594     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1595     jj = idx;
1596     vv = v;
1597     for (k = 0; k < cn; k++) {
1598       idx = jj;
1599       v   = vv;
1600       for (j = 0; j < n; j++) {
1601         xb = b + (*idx);
1602         x1 = xb[0 + k * bm];
1603         z[0 + k * cm] += v[0] * x1;
1604         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1605         v += 1;
1606         ++idx;
1607       }
1608     }
1609     z += 1;
1610   }
1611   PetscFunctionReturn(PETSC_SUCCESS);
1612 }
1613 
1614 static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1615 {
1616   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1617   PetscScalar       *z = c;
1618   const PetscScalar *xb;
1619   PetscScalar        x1, x2;
1620   const MatScalar   *v   = a->a, *vv;
1621   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1622 
1623   PetscFunctionBegin;
1624   for (i = 0; i < mbs; i++) {
1625     n = ii[1] - ii[0];
1626     ii++;
1627     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1628     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1629     jj = idx;
1630     vv = v;
1631     for (k = 0; k < cn; k++) {
1632       idx = jj;
1633       v   = vv;
1634       for (j = 0; j < n; j++) {
1635         xb = b + 2 * (*idx);
1636         x1 = xb[0 + k * bm];
1637         x2 = xb[1 + k * bm];
1638         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1639         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1640         if (*idx != i) {
1641           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1642           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1643         }
1644         v += 4;
1645         ++idx;
1646       }
1647     }
1648     z += 2;
1649   }
1650   PetscFunctionReturn(PETSC_SUCCESS);
1651 }
1652 
1653 static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1654 {
1655   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1656   PetscScalar       *z = c;
1657   const PetscScalar *xb;
1658   PetscScalar        x1, x2, x3;
1659   const MatScalar   *v   = a->a, *vv;
1660   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1661 
1662   PetscFunctionBegin;
1663   for (i = 0; i < mbs; i++) {
1664     n = ii[1] - ii[0];
1665     ii++;
1666     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1667     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1668     jj = idx;
1669     vv = v;
1670     for (k = 0; k < cn; k++) {
1671       idx = jj;
1672       v   = vv;
1673       for (j = 0; j < n; j++) {
1674         xb = b + 3 * (*idx);
1675         x1 = xb[0 + k * bm];
1676         x2 = xb[1 + k * bm];
1677         x3 = xb[2 + k * bm];
1678         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1679         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1680         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1681         if (*idx != i) {
1682           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1683           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1684           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1685         }
1686         v += 9;
1687         ++idx;
1688       }
1689     }
1690     z += 3;
1691   }
1692   PetscFunctionReturn(PETSC_SUCCESS);
1693 }
1694 
1695 static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1696 {
1697   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1698   PetscScalar       *z = c;
1699   const PetscScalar *xb;
1700   PetscScalar        x1, x2, x3, x4;
1701   const MatScalar   *v   = a->a, *vv;
1702   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1703 
1704   PetscFunctionBegin;
1705   for (i = 0; i < mbs; i++) {
1706     n = ii[1] - ii[0];
1707     ii++;
1708     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1709     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1710     jj = idx;
1711     vv = v;
1712     for (k = 0; k < cn; k++) {
1713       idx = jj;
1714       v   = vv;
1715       for (j = 0; j < n; j++) {
1716         xb = b + 4 * (*idx);
1717         x1 = xb[0 + k * bm];
1718         x2 = xb[1 + k * bm];
1719         x3 = xb[2 + k * bm];
1720         x4 = xb[3 + k * bm];
1721         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1722         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1723         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1724         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1725         if (*idx != i) {
1726           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1727           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1728           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1729           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1730         }
1731         v += 16;
1732         ++idx;
1733       }
1734     }
1735     z += 4;
1736   }
1737   PetscFunctionReturn(PETSC_SUCCESS);
1738 }
1739 
1740 static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1741 {
1742   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1743   PetscScalar       *z = c;
1744   const PetscScalar *xb;
1745   PetscScalar        x1, x2, x3, x4, x5;
1746   const MatScalar   *v   = a->a, *vv;
1747   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1748 
1749   PetscFunctionBegin;
1750   for (i = 0; i < mbs; i++) {
1751     n = ii[1] - ii[0];
1752     ii++;
1753     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1754     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1755     jj = idx;
1756     vv = v;
1757     for (k = 0; k < cn; k++) {
1758       idx = jj;
1759       v   = vv;
1760       for (j = 0; j < n; j++) {
1761         xb = b + 5 * (*idx);
1762         x1 = xb[0 + k * bm];
1763         x2 = xb[1 + k * bm];
1764         x3 = xb[2 + k * bm];
1765         x4 = xb[3 + k * bm];
1766         x5 = xb[4 + k * cm];
1767         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1768         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1769         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1770         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1771         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1772         if (*idx != i) {
1773           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1774           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1775           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1776           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1777           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1778         }
1779         v += 25;
1780         ++idx;
1781       }
1782     }
1783     z += 5;
1784   }
1785   PetscFunctionReturn(PETSC_SUCCESS);
1786 }
1787 
1788 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1789 {
1790   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1791   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1792   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1793   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1794   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1795   PetscBLASInt     bbs, bcn, bbm, bcm;
1796   PetscScalar     *z = NULL;
1797   PetscScalar     *c, *b;
1798   const MatScalar *v;
1799   const PetscInt  *idx, *ii;
1800   PetscScalar      _DOne = 1.0;
1801 
1802   PetscFunctionBegin;
1803   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1804   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1805   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1806   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1807   b = bd->v;
1808   PetscCall(MatZeroEntries(C));
1809   PetscCall(MatDenseGetArray(C, &c));
1810   switch (bs) {
1811   case 1:
1812     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1813     break;
1814   case 2:
1815     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1816     break;
1817   case 3:
1818     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1819     break;
1820   case 4:
1821     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1822     break;
1823   case 5:
1824     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1825     break;
1826   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1827     PetscCall(PetscBLASIntCast(bs, &bbs));
1828     PetscCall(PetscBLASIntCast(cn, &bcn));
1829     PetscCall(PetscBLASIntCast(bm, &bbm));
1830     PetscCall(PetscBLASIntCast(cm, &bcm));
1831     idx = a->j;
1832     v   = a->a;
1833     mbs = a->mbs;
1834     ii  = a->i;
1835     z   = c;
1836     for (i = 0; i < mbs; i++) {
1837       n = ii[1] - ii[0];
1838       ii++;
1839       for (j = 0; j < n; j++) {
1840         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1841         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1842         v += bs2;
1843       }
1844       z += bs;
1845     }
1846   }
1847   PetscCall(MatDenseRestoreArray(C, &c));
1848   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1849   PetscFunctionReturn(PETSC_SUCCESS);
1850 }
1851