xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision a4e35b1925eceef64945ea472b84f2bf06a67b5e)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <../src/mat/impls/sbaij/seq/sbaij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 #include <petscbt.h>
7 #include <petscblaslapack.h>
8 
9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10 {
11   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
12   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
13   const PetscInt *idx;
14   PetscBT         table_out, table_in;
15 
16   PetscFunctionBegin;
17   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
18   mbs = a->mbs;
19   ai  = a->i;
20   aj  = a->j;
21   bs  = A->rmap->bs;
22   PetscCall(PetscBTCreate(mbs, &table_out));
23   PetscCall(PetscMalloc1(mbs + 1, &nidx));
24   PetscCall(PetscBTCreate(mbs, &table_in));
25 
26   for (i = 0; i < is_max; i++) { /* for each is */
27     isz = 0;
28     PetscCall(PetscBTMemzero(mbs, table_out));
29 
30     /* Extract the indices, assume there can be duplicate entries */
31     PetscCall(ISGetIndices(is[i], &idx));
32     PetscCall(ISGetLocalSize(is[i], &n));
33 
34     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35     bcol_max = 0;
36     for (j = 0; j < n; ++j) {
37       brow = idx[j] / bs; /* convert the indices into block indices */
38       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
39       if (!PetscBTLookupSet(table_out, brow)) {
40         nidx[isz++] = brow;
41         if (bcol_max < brow) bcol_max = brow;
42       }
43     }
44     PetscCall(ISRestoreIndices(is[i], &idx));
45     PetscCall(ISDestroy(&is[i]));
46 
47     k = 0;
48     for (j = 0; j < ov; j++) { /* for each overlap */
49       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
50       PetscCall(PetscBTMemzero(mbs, table_in));
51       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52 
53       n = isz; /* length of the updated is[i] */
54       for (brow = 0; brow < mbs; brow++) {
55         start = ai[brow];
56         end   = ai[brow + 1];
57         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58           for (l = start; l < end; l++) {
59             bcol = aj[l];
60             if (!PetscBTLookupSet(table_out, bcol)) {
61               nidx[isz++] = bcol;
62               if (bcol_max < bcol) bcol_max = bcol;
63             }
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l < end; l++) {
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table_in, bcol)) {
72               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
80   } /* for each is */
81   PetscCall(PetscBTDestroy(&table_out));
82   PetscCall(PetscFree(nidx));
83   PetscCall(PetscBTDestroy(&table_in));
84   PetscFunctionReturn(PETSC_SUCCESS);
85 }
86 
87 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
88         Zero some ops' to avoid invalid use */
89 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
90 {
91   PetscFunctionBegin;
92   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
93   Bseq->ops->mult                   = NULL;
94   Bseq->ops->multadd                = NULL;
95   Bseq->ops->multtranspose          = NULL;
96   Bseq->ops->multtransposeadd       = NULL;
97   Bseq->ops->lufactor               = NULL;
98   Bseq->ops->choleskyfactor         = NULL;
99   Bseq->ops->lufactorsymbolic       = NULL;
100   Bseq->ops->choleskyfactorsymbolic = NULL;
101   Bseq->ops->getinertia             = NULL;
102   PetscFunctionReturn(PETSC_SUCCESS);
103 }
104 
105 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
106 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
107 {
108   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
109   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111   const PetscInt *irow, *icol;
112   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113   PetscInt       *aj = a->j, *ai = a->i;
114   MatScalar      *mat_a;
115   Mat             C;
116   PetscBool       flag;
117 
118   PetscFunctionBegin;
119 
120   PetscCall(ISGetIndices(isrow, &irow));
121   PetscCall(ISGetIndices(iscol, &icol));
122   PetscCall(ISGetLocalSize(isrow, &nrows));
123   PetscCall(ISGetLocalSize(iscol, &ncols));
124 
125   PetscCall(PetscCalloc1(1 + oldcols, &smap));
126   ssmap = smap;
127   PetscCall(PetscMalloc1(1 + nrows, &lens));
128   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129   /* determine lens of each row */
130   for (i = 0; i < nrows; i++) {
131     kstart  = ai[irow[i]];
132     kend    = kstart + a->ilen[irow[i]];
133     lens[i] = 0;
134     for (k = kstart; k < kend; k++) {
135       if (ssmap[aj[k]]) lens[i]++;
136     }
137   }
138   /* Create and fill new matrix */
139   if (scall == MAT_REUSE_MATRIX) {
140     c = (Mat_SeqSBAIJ *)((*B)->data);
141 
142     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
145     PetscCall(PetscArrayzero(c->ilen, c->mbs));
146     C = *B;
147   } else {
148     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
149     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
150     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
151     PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
152   }
153   c = (Mat_SeqSBAIJ *)(C->data);
154   for (i = 0; i < nrows; i++) {
155     row      = irow[i];
156     kstart   = ai[row];
157     kend     = kstart + a->ilen[row];
158     mat_i    = c->i[i];
159     mat_j    = c->j + mat_i;
160     mat_a    = c->a + mat_i * bs2;
161     mat_ilen = c->ilen + i;
162     for (k = kstart; k < kend; k++) {
163       if ((tcol = ssmap[a->j[k]])) {
164         *mat_j++ = tcol - 1;
165         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
166         mat_a += bs2;
167         (*mat_ilen)++;
168       }
169     }
170   }
171   /* sort */
172   {
173     MatScalar *work;
174 
175     PetscCall(PetscMalloc1(bs2, &work));
176     for (i = 0; i < nrows; i++) {
177       PetscInt ilen;
178       mat_i = c->i[i];
179       mat_j = c->j + mat_i;
180       mat_a = c->a + mat_i * bs2;
181       ilen  = c->ilen[i];
182       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
183     }
184     PetscCall(PetscFree(work));
185   }
186 
187   /* Free work space */
188   PetscCall(ISRestoreIndices(iscol, &icol));
189   PetscCall(PetscFree(smap));
190   PetscCall(PetscFree(lens));
191   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
192   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
193 
194   PetscCall(ISRestoreIndices(isrow, &irow));
195   *B = C;
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
200 {
201   IS is1, is2;
202 
203   PetscFunctionBegin;
204   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
205   if (isrow == iscol) {
206     is2 = is1;
207     PetscCall(PetscObjectReference((PetscObject)is2));
208   } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
209   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
210   PetscCall(ISDestroy(&is1));
211   PetscCall(ISDestroy(&is2));
212 
213   if (isrow != iscol) {
214     PetscBool isequal;
215     PetscCall(ISEqual(isrow, iscol, &isequal));
216     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
217   }
218   PetscFunctionReturn(PETSC_SUCCESS);
219 }
220 
221 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
222 {
223   PetscInt i;
224 
225   PetscFunctionBegin;
226   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
227 
228   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
229   PetscFunctionReturn(PETSC_SUCCESS);
230 }
231 
232 /* Should check that shapes of vectors and matrices match */
233 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
234 {
235   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
236   PetscScalar       *z, x1, x2, zero = 0.0;
237   const PetscScalar *x, *xb;
238   const MatScalar   *v;
239   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
240   const PetscInt    *aj = a->j, *ai = a->i, *ib;
241   PetscInt           nonzerorow = 0;
242 
243   PetscFunctionBegin;
244   PetscCall(VecSet(zz, zero));
245   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
246   PetscCall(VecGetArrayRead(xx, &x));
247   PetscCall(VecGetArray(zz, &z));
248 
249   v  = a->a;
250   xb = x;
251 
252   for (i = 0; i < mbs; i++) {
253     n    = ai[1] - ai[0]; /* length of i_th block row of A */
254     x1   = xb[0];
255     x2   = xb[1];
256     ib   = aj + *ai;
257     jmin = 0;
258     nonzerorow += (n > 0);
259     if (*ib == i) { /* (diag of A)*x */
260       z[2 * i] += v[0] * x1 + v[2] * x2;
261       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
262       v += 4;
263       jmin++;
264     }
265     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
266     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
267     for (j = jmin; j < n; j++) {
268       /* (strict lower triangular part of A)*x  */
269       cval = ib[j] * 2;
270       z[cval] += v[0] * x1 + v[1] * x2;
271       z[cval + 1] += v[2] * x1 + v[3] * x2;
272       /* (strict upper triangular part of A)*x  */
273       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
274       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
275       v += 4;
276     }
277     xb += 2;
278     ai++;
279   }
280 
281   PetscCall(VecRestoreArrayRead(xx, &x));
282   PetscCall(VecRestoreArray(zz, &z));
283   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
284   PetscFunctionReturn(PETSC_SUCCESS);
285 }
286 
287 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
288 {
289   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
290   PetscScalar       *z, x1, x2, x3, zero = 0.0;
291   const PetscScalar *x, *xb;
292   const MatScalar   *v;
293   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
294   const PetscInt    *aj = a->j, *ai = a->i, *ib;
295   PetscInt           nonzerorow = 0;
296 
297   PetscFunctionBegin;
298   PetscCall(VecSet(zz, zero));
299   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
300   PetscCall(VecGetArrayRead(xx, &x));
301   PetscCall(VecGetArray(zz, &z));
302 
303   v  = a->a;
304   xb = x;
305 
306   for (i = 0; i < mbs; i++) {
307     n    = ai[1] - ai[0]; /* length of i_th block row of A */
308     x1   = xb[0];
309     x2   = xb[1];
310     x3   = xb[2];
311     ib   = aj + *ai;
312     jmin = 0;
313     nonzerorow += (n > 0);
314     if (*ib == i) { /* (diag of A)*x */
315       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
316       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
317       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
318       v += 9;
319       jmin++;
320     }
321     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
322     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
323     for (j = jmin; j < n; j++) {
324       /* (strict lower triangular part of A)*x  */
325       cval = ib[j] * 3;
326       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
327       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
328       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
329       /* (strict upper triangular part of A)*x  */
330       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
331       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
332       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
333       v += 9;
334     }
335     xb += 3;
336     ai++;
337   }
338 
339   PetscCall(VecRestoreArrayRead(xx, &x));
340   PetscCall(VecRestoreArray(zz, &z));
341   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
342   PetscFunctionReturn(PETSC_SUCCESS);
343 }
344 
345 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
346 {
347   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
348   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
349   const PetscScalar *x, *xb;
350   const MatScalar   *v;
351   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
352   const PetscInt    *aj = a->j, *ai = a->i, *ib;
353   PetscInt           nonzerorow = 0;
354 
355   PetscFunctionBegin;
356   PetscCall(VecSet(zz, zero));
357   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
358   PetscCall(VecGetArrayRead(xx, &x));
359   PetscCall(VecGetArray(zz, &z));
360 
361   v  = a->a;
362   xb = x;
363 
364   for (i = 0; i < mbs; i++) {
365     n    = ai[1] - ai[0]; /* length of i_th block row of A */
366     x1   = xb[0];
367     x2   = xb[1];
368     x3   = xb[2];
369     x4   = xb[3];
370     ib   = aj + *ai;
371     jmin = 0;
372     nonzerorow += (n > 0);
373     if (*ib == i) { /* (diag of A)*x */
374       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
375       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
376       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
377       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
378       v += 16;
379       jmin++;
380     }
381     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
382     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
383     for (j = jmin; j < n; j++) {
384       /* (strict lower triangular part of A)*x  */
385       cval = ib[j] * 4;
386       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
387       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
388       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
389       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
390       /* (strict upper triangular part of A)*x  */
391       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
392       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
393       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
394       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
395       v += 16;
396     }
397     xb += 4;
398     ai++;
399   }
400 
401   PetscCall(VecRestoreArrayRead(xx, &x));
402   PetscCall(VecRestoreArray(zz, &z));
403   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
404   PetscFunctionReturn(PETSC_SUCCESS);
405 }
406 
407 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
408 {
409   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
410   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
411   const PetscScalar *x, *xb;
412   const MatScalar   *v;
413   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
414   const PetscInt    *aj = a->j, *ai = a->i, *ib;
415   PetscInt           nonzerorow = 0;
416 
417   PetscFunctionBegin;
418   PetscCall(VecSet(zz, zero));
419   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
420   PetscCall(VecGetArrayRead(xx, &x));
421   PetscCall(VecGetArray(zz, &z));
422 
423   v  = a->a;
424   xb = x;
425 
426   for (i = 0; i < mbs; i++) {
427     n    = ai[1] - ai[0]; /* length of i_th block row of A */
428     x1   = xb[0];
429     x2   = xb[1];
430     x3   = xb[2];
431     x4   = xb[3];
432     x5   = xb[4];
433     ib   = aj + *ai;
434     jmin = 0;
435     nonzerorow += (n > 0);
436     if (*ib == i) { /* (diag of A)*x */
437       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
438       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
439       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
440       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
441       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
442       v += 25;
443       jmin++;
444     }
445     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
446     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
447     for (j = jmin; j < n; j++) {
448       /* (strict lower triangular part of A)*x  */
449       cval = ib[j] * 5;
450       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
451       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
452       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
453       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
454       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
455       /* (strict upper triangular part of A)*x  */
456       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
457       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
458       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
459       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
460       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
461       v += 25;
462     }
463     xb += 5;
464     ai++;
465   }
466 
467   PetscCall(VecRestoreArrayRead(xx, &x));
468   PetscCall(VecRestoreArray(zz, &z));
469   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
470   PetscFunctionReturn(PETSC_SUCCESS);
471 }
472 
473 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
474 {
475   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
476   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
477   const PetscScalar *x, *xb;
478   const MatScalar   *v;
479   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
480   const PetscInt    *aj = a->j, *ai = a->i, *ib;
481   PetscInt           nonzerorow = 0;
482 
483   PetscFunctionBegin;
484   PetscCall(VecSet(zz, zero));
485   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
486   PetscCall(VecGetArrayRead(xx, &x));
487   PetscCall(VecGetArray(zz, &z));
488 
489   v  = a->a;
490   xb = x;
491 
492   for (i = 0; i < mbs; i++) {
493     n    = ai[1] - ai[0]; /* length of i_th block row of A */
494     x1   = xb[0];
495     x2   = xb[1];
496     x3   = xb[2];
497     x4   = xb[3];
498     x5   = xb[4];
499     x6   = xb[5];
500     ib   = aj + *ai;
501     jmin = 0;
502     nonzerorow += (n > 0);
503     if (*ib == i) { /* (diag of A)*x */
504       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
505       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
506       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
507       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
508       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
509       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
510       v += 36;
511       jmin++;
512     }
513     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
514     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
515     for (j = jmin; j < n; j++) {
516       /* (strict lower triangular part of A)*x  */
517       cval = ib[j] * 6;
518       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
519       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
520       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
521       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
522       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
523       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
524       /* (strict upper triangular part of A)*x  */
525       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
526       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
527       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
528       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
529       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
530       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
531       v += 36;
532     }
533     xb += 6;
534     ai++;
535   }
536 
537   PetscCall(VecRestoreArrayRead(xx, &x));
538   PetscCall(VecRestoreArray(zz, &z));
539   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
540   PetscFunctionReturn(PETSC_SUCCESS);
541 }
542 
543 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
544 {
545   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
546   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
547   const PetscScalar *x, *xb;
548   const MatScalar   *v;
549   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
550   const PetscInt    *aj = a->j, *ai = a->i, *ib;
551   PetscInt           nonzerorow = 0;
552 
553   PetscFunctionBegin;
554   PetscCall(VecSet(zz, zero));
555   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
556   PetscCall(VecGetArrayRead(xx, &x));
557   PetscCall(VecGetArray(zz, &z));
558 
559   v  = a->a;
560   xb = x;
561 
562   for (i = 0; i < mbs; i++) {
563     n    = ai[1] - ai[0]; /* length of i_th block row of A */
564     x1   = xb[0];
565     x2   = xb[1];
566     x3   = xb[2];
567     x4   = xb[3];
568     x5   = xb[4];
569     x6   = xb[5];
570     x7   = xb[6];
571     ib   = aj + *ai;
572     jmin = 0;
573     nonzerorow += (n > 0);
574     if (*ib == i) { /* (diag of A)*x */
575       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
576       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
577       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
578       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
579       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
580       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
581       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
582       v += 49;
583       jmin++;
584     }
585     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
586     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
587     for (j = jmin; j < n; j++) {
588       /* (strict lower triangular part of A)*x  */
589       cval = ib[j] * 7;
590       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
591       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
592       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
593       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
594       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
595       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
596       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
597       /* (strict upper triangular part of A)*x  */
598       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
599       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
600       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
601       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
602       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
603       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
604       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
605       v += 49;
606     }
607     xb += 7;
608     ai++;
609   }
610   PetscCall(VecRestoreArrayRead(xx, &x));
611   PetscCall(VecRestoreArray(zz, &z));
612   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
613   PetscFunctionReturn(PETSC_SUCCESS);
614 }
615 
616 /*
617     This will not work with MatScalar == float because it calls the BLAS
618 */
619 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
620 {
621   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
622   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
623   const PetscScalar *x, *x_ptr, *xb;
624   const MatScalar   *v;
625   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
626   const PetscInt    *idx, *aj, *ii;
627   PetscInt           nonzerorow = 0;
628 
629   PetscFunctionBegin;
630   PetscCall(VecSet(zz, zero));
631   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
632   PetscCall(VecGetArrayRead(xx, &x));
633   PetscCall(VecGetArray(zz, &z));
634 
635   x_ptr = x;
636   z_ptr = z;
637 
638   aj = a->j;
639   v  = a->a;
640   ii = a->i;
641 
642   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
643   work = a->mult_work;
644 
645   for (i = 0; i < mbs; i++) {
646     n     = ii[1] - ii[0];
647     ncols = n * bs;
648     workt = work;
649     idx   = aj + ii[0];
650     nonzerorow += (n > 0);
651 
652     /* upper triangular part */
653     for (j = 0; j < n; j++) {
654       xb = x_ptr + bs * (*idx++);
655       for (k = 0; k < bs; k++) workt[k] = xb[k];
656       workt += bs;
657     }
658     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
659     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
660 
661     /* strict lower triangular part */
662     idx = aj + ii[0];
663     if (n && *idx == i) {
664       ncols -= bs;
665       v += bs2;
666       idx++;
667       n--;
668     }
669 
670     if (ncols > 0) {
671       workt = work;
672       PetscCall(PetscArrayzero(workt, ncols));
673       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
674       for (j = 0; j < n; j++) {
675         zb = z_ptr + bs * (*idx++);
676         for (k = 0; k < bs; k++) zb[k] += workt[k];
677         workt += bs;
678       }
679     }
680     x += bs;
681     v += n * bs2;
682     z += bs;
683     ii++;
684   }
685 
686   PetscCall(VecRestoreArrayRead(xx, &x));
687   PetscCall(VecRestoreArray(zz, &z));
688   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
689   PetscFunctionReturn(PETSC_SUCCESS);
690 }
691 
692 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
693 {
694   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
695   PetscScalar       *z, x1;
696   const PetscScalar *x, *xb;
697   const MatScalar   *v;
698   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
699   const PetscInt    *aj = a->j, *ai = a->i, *ib;
700   PetscInt           nonzerorow = 0;
701 #if defined(PETSC_USE_COMPLEX)
702   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
703 #else
704   const int aconj = 0;
705 #endif
706 
707   PetscFunctionBegin;
708   PetscCall(VecCopy(yy, zz));
709   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
710   PetscCall(VecGetArrayRead(xx, &x));
711   PetscCall(VecGetArray(zz, &z));
712   v  = a->a;
713   xb = x;
714 
715   for (i = 0; i < mbs; i++) {
716     n    = ai[1] - ai[0]; /* length of i_th row of A */
717     x1   = xb[0];
718     ib   = aj + *ai;
719     jmin = 0;
720     nonzerorow += (n > 0);
721     if (n && *ib == i) { /* (diag of A)*x */
722       z[i] += *v++ * x[*ib++];
723       jmin++;
724     }
725     if (aconj) {
726       for (j = jmin; j < n; j++) {
727         cval = *ib;
728         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
729         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
730       }
731     } else {
732       for (j = jmin; j < n; j++) {
733         cval = *ib;
734         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
735         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
736       }
737     }
738     xb++;
739     ai++;
740   }
741 
742   PetscCall(VecRestoreArrayRead(xx, &x));
743   PetscCall(VecRestoreArray(zz, &z));
744 
745   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
746   PetscFunctionReturn(PETSC_SUCCESS);
747 }
748 
749 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
750 {
751   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
752   PetscScalar       *z, x1, x2;
753   const PetscScalar *x, *xb;
754   const MatScalar   *v;
755   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
756   const PetscInt    *aj = a->j, *ai = a->i, *ib;
757   PetscInt           nonzerorow = 0;
758 
759   PetscFunctionBegin;
760   PetscCall(VecCopy(yy, zz));
761   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
762   PetscCall(VecGetArrayRead(xx, &x));
763   PetscCall(VecGetArray(zz, &z));
764 
765   v  = a->a;
766   xb = x;
767 
768   for (i = 0; i < mbs; i++) {
769     n    = ai[1] - ai[0]; /* length of i_th block row of A */
770     x1   = xb[0];
771     x2   = xb[1];
772     ib   = aj + *ai;
773     jmin = 0;
774     nonzerorow += (n > 0);
775     if (n && *ib == i) { /* (diag of A)*x */
776       z[2 * i] += v[0] * x1 + v[2] * x2;
777       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
778       v += 4;
779       jmin++;
780     }
781     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
782     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
783     for (j = jmin; j < n; j++) {
784       /* (strict lower triangular part of A)*x  */
785       cval = ib[j] * 2;
786       z[cval] += v[0] * x1 + v[1] * x2;
787       z[cval + 1] += v[2] * x1 + v[3] * x2;
788       /* (strict upper triangular part of A)*x  */
789       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
790       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
791       v += 4;
792     }
793     xb += 2;
794     ai++;
795   }
796   PetscCall(VecRestoreArrayRead(xx, &x));
797   PetscCall(VecRestoreArray(zz, &z));
798 
799   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
800   PetscFunctionReturn(PETSC_SUCCESS);
801 }
802 
803 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
804 {
805   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
806   PetscScalar       *z, x1, x2, x3;
807   const PetscScalar *x, *xb;
808   const MatScalar   *v;
809   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
810   const PetscInt    *aj = a->j, *ai = a->i, *ib;
811   PetscInt           nonzerorow = 0;
812 
813   PetscFunctionBegin;
814   PetscCall(VecCopy(yy, zz));
815   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
816   PetscCall(VecGetArrayRead(xx, &x));
817   PetscCall(VecGetArray(zz, &z));
818 
819   v  = a->a;
820   xb = x;
821 
822   for (i = 0; i < mbs; i++) {
823     n    = ai[1] - ai[0]; /* length of i_th block row of A */
824     x1   = xb[0];
825     x2   = xb[1];
826     x3   = xb[2];
827     ib   = aj + *ai;
828     jmin = 0;
829     nonzerorow += (n > 0);
830     if (n && *ib == i) { /* (diag of A)*x */
831       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
832       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
833       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
834       v += 9;
835       jmin++;
836     }
837     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
838     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
839     for (j = jmin; j < n; j++) {
840       /* (strict lower triangular part of A)*x  */
841       cval = ib[j] * 3;
842       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
843       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
844       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
845       /* (strict upper triangular part of A)*x  */
846       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
847       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
848       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
849       v += 9;
850     }
851     xb += 3;
852     ai++;
853   }
854 
855   PetscCall(VecRestoreArrayRead(xx, &x));
856   PetscCall(VecRestoreArray(zz, &z));
857 
858   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
859   PetscFunctionReturn(PETSC_SUCCESS);
860 }
861 
862 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
863 {
864   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
865   PetscScalar       *z, x1, x2, x3, x4;
866   const PetscScalar *x, *xb;
867   const MatScalar   *v;
868   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
869   const PetscInt    *aj = a->j, *ai = a->i, *ib;
870   PetscInt           nonzerorow = 0;
871 
872   PetscFunctionBegin;
873   PetscCall(VecCopy(yy, zz));
874   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
875   PetscCall(VecGetArrayRead(xx, &x));
876   PetscCall(VecGetArray(zz, &z));
877 
878   v  = a->a;
879   xb = x;
880 
881   for (i = 0; i < mbs; i++) {
882     n    = ai[1] - ai[0]; /* length of i_th block row of A */
883     x1   = xb[0];
884     x2   = xb[1];
885     x3   = xb[2];
886     x4   = xb[3];
887     ib   = aj + *ai;
888     jmin = 0;
889     nonzerorow += (n > 0);
890     if (n && *ib == i) { /* (diag of A)*x */
891       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
892       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
893       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
894       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
895       v += 16;
896       jmin++;
897     }
898     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
899     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
900     for (j = jmin; j < n; j++) {
901       /* (strict lower triangular part of A)*x  */
902       cval = ib[j] * 4;
903       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
904       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
905       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
906       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
907       /* (strict upper triangular part of A)*x  */
908       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
909       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
910       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
911       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
912       v += 16;
913     }
914     xb += 4;
915     ai++;
916   }
917 
918   PetscCall(VecRestoreArrayRead(xx, &x));
919   PetscCall(VecRestoreArray(zz, &z));
920 
921   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
922   PetscFunctionReturn(PETSC_SUCCESS);
923 }
924 
925 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
926 {
927   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
928   PetscScalar       *z, x1, x2, x3, x4, x5;
929   const PetscScalar *x, *xb;
930   const MatScalar   *v;
931   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
932   const PetscInt    *aj = a->j, *ai = a->i, *ib;
933   PetscInt           nonzerorow = 0;
934 
935   PetscFunctionBegin;
936   PetscCall(VecCopy(yy, zz));
937   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
938   PetscCall(VecGetArrayRead(xx, &x));
939   PetscCall(VecGetArray(zz, &z));
940 
941   v  = a->a;
942   xb = x;
943 
944   for (i = 0; i < mbs; i++) {
945     n    = ai[1] - ai[0]; /* length of i_th block row of A */
946     x1   = xb[0];
947     x2   = xb[1];
948     x3   = xb[2];
949     x4   = xb[3];
950     x5   = xb[4];
951     ib   = aj + *ai;
952     jmin = 0;
953     nonzerorow += (n > 0);
954     if (n && *ib == i) { /* (diag of A)*x */
955       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
956       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
957       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
958       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
959       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
960       v += 25;
961       jmin++;
962     }
963     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
964     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
965     for (j = jmin; j < n; j++) {
966       /* (strict lower triangular part of A)*x  */
967       cval = ib[j] * 5;
968       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
969       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
970       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
971       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
972       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
973       /* (strict upper triangular part of A)*x  */
974       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
975       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
976       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
977       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
978       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
979       v += 25;
980     }
981     xb += 5;
982     ai++;
983   }
984 
985   PetscCall(VecRestoreArrayRead(xx, &x));
986   PetscCall(VecRestoreArray(zz, &z));
987 
988   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
989   PetscFunctionReturn(PETSC_SUCCESS);
990 }
991 
992 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
993 {
994   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
995   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
996   const PetscScalar *x, *xb;
997   const MatScalar   *v;
998   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
999   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1000   PetscInt           nonzerorow = 0;
1001 
1002   PetscFunctionBegin;
1003   PetscCall(VecCopy(yy, zz));
1004   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1005   PetscCall(VecGetArrayRead(xx, &x));
1006   PetscCall(VecGetArray(zz, &z));
1007 
1008   v  = a->a;
1009   xb = x;
1010 
1011   for (i = 0; i < mbs; i++) {
1012     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1013     x1   = xb[0];
1014     x2   = xb[1];
1015     x3   = xb[2];
1016     x4   = xb[3];
1017     x5   = xb[4];
1018     x6   = xb[5];
1019     ib   = aj + *ai;
1020     jmin = 0;
1021     nonzerorow += (n > 0);
1022     if (n && *ib == i) { /* (diag of A)*x */
1023       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1024       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1025       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1026       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1027       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1028       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1029       v += 36;
1030       jmin++;
1031     }
1032     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1033     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1034     for (j = jmin; j < n; j++) {
1035       /* (strict lower triangular part of A)*x  */
1036       cval = ib[j] * 6;
1037       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1038       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1039       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1040       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1041       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1042       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1043       /* (strict upper triangular part of A)*x  */
1044       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1045       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1046       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1047       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1048       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1049       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1050       v += 36;
1051     }
1052     xb += 6;
1053     ai++;
1054   }
1055 
1056   PetscCall(VecRestoreArrayRead(xx, &x));
1057   PetscCall(VecRestoreArray(zz, &z));
1058 
1059   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1060   PetscFunctionReturn(PETSC_SUCCESS);
1061 }
1062 
1063 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1064 {
1065   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1066   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1067   const PetscScalar *x, *xb;
1068   const MatScalar   *v;
1069   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1070   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1071   PetscInt           nonzerorow = 0;
1072 
1073   PetscFunctionBegin;
1074   PetscCall(VecCopy(yy, zz));
1075   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1076   PetscCall(VecGetArrayRead(xx, &x));
1077   PetscCall(VecGetArray(zz, &z));
1078 
1079   v  = a->a;
1080   xb = x;
1081 
1082   for (i = 0; i < mbs; i++) {
1083     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1084     x1   = xb[0];
1085     x2   = xb[1];
1086     x3   = xb[2];
1087     x4   = xb[3];
1088     x5   = xb[4];
1089     x6   = xb[5];
1090     x7   = xb[6];
1091     ib   = aj + *ai;
1092     jmin = 0;
1093     nonzerorow += (n > 0);
1094     if (n && *ib == i) { /* (diag of A)*x */
1095       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1096       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1097       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1098       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1099       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1100       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1101       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1102       v += 49;
1103       jmin++;
1104     }
1105     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1106     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1107     for (j = jmin; j < n; j++) {
1108       /* (strict lower triangular part of A)*x  */
1109       cval = ib[j] * 7;
1110       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1111       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1112       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1113       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1114       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1115       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1116       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1117       /* (strict upper triangular part of A)*x  */
1118       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1119       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1120       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1121       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1122       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1123       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1124       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1125       v += 49;
1126     }
1127     xb += 7;
1128     ai++;
1129   }
1130 
1131   PetscCall(VecRestoreArrayRead(xx, &x));
1132   PetscCall(VecRestoreArray(zz, &z));
1133 
1134   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1135   PetscFunctionReturn(PETSC_SUCCESS);
1136 }
1137 
1138 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1139 {
1140   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1141   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1142   const PetscScalar *x, *x_ptr, *xb;
1143   const MatScalar   *v;
1144   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1145   const PetscInt    *idx, *aj, *ii;
1146   PetscInt           nonzerorow = 0;
1147 
1148   PetscFunctionBegin;
1149   PetscCall(VecCopy(yy, zz));
1150   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1151   PetscCall(VecGetArrayRead(xx, &x));
1152   x_ptr = x;
1153   PetscCall(VecGetArray(zz, &z));
1154   z_ptr = z;
1155 
1156   aj = a->j;
1157   v  = a->a;
1158   ii = a->i;
1159 
1160   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1161   work = a->mult_work;
1162 
1163   for (i = 0; i < mbs; i++) {
1164     n     = ii[1] - ii[0];
1165     ncols = n * bs;
1166     workt = work;
1167     idx   = aj + ii[0];
1168     nonzerorow += (n > 0);
1169 
1170     /* upper triangular part */
1171     for (j = 0; j < n; j++) {
1172       xb = x_ptr + bs * (*idx++);
1173       for (k = 0; k < bs; k++) workt[k] = xb[k];
1174       workt += bs;
1175     }
1176     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1177     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1178 
1179     /* strict lower triangular part */
1180     idx = aj + ii[0];
1181     if (n && *idx == i) {
1182       ncols -= bs;
1183       v += bs2;
1184       idx++;
1185       n--;
1186     }
1187     if (ncols > 0) {
1188       workt = work;
1189       PetscCall(PetscArrayzero(workt, ncols));
1190       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1191       for (j = 0; j < n; j++) {
1192         zb = z_ptr + bs * (*idx++);
1193         for (k = 0; k < bs; k++) zb[k] += workt[k];
1194         workt += bs;
1195       }
1196     }
1197 
1198     x += bs;
1199     v += n * bs2;
1200     z += bs;
1201     ii++;
1202   }
1203 
1204   PetscCall(VecRestoreArrayRead(xx, &x));
1205   PetscCall(VecRestoreArray(zz, &z));
1206 
1207   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 
1211 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1212 {
1213   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1214   PetscScalar   oalpha = alpha;
1215   PetscBLASInt  one    = 1, totalnz;
1216 
1217   PetscFunctionBegin;
1218   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1219   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1220   PetscCall(PetscLogFlops(totalnz));
1221   PetscFunctionReturn(PETSC_SUCCESS);
1222 }
1223 
1224 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1225 {
1226   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1227   const MatScalar *v        = a->a;
1228   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1229   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1230   const PetscInt  *aj = a->j, *col;
1231 
1232   PetscFunctionBegin;
1233   if (!a->nz) {
1234     *norm = 0.0;
1235     PetscFunctionReturn(PETSC_SUCCESS);
1236   }
1237   if (type == NORM_FROBENIUS) {
1238     for (k = 0; k < mbs; k++) {
1239       jmin = a->i[k];
1240       jmax = a->i[k + 1];
1241       col  = aj + jmin;
1242       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1243         for (i = 0; i < bs2; i++) {
1244           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1245           v++;
1246         }
1247         jmin++;
1248       }
1249       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1250         for (i = 0; i < bs2; i++) {
1251           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1252           v++;
1253         }
1254       }
1255     }
1256     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1257     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1258   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1259     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1260     for (i = 0; i < mbs; i++) jl[i] = mbs;
1261     il[0] = 0;
1262 
1263     *norm = 0.0;
1264     for (k = 0; k < mbs; k++) { /* k_th block row */
1265       for (j = 0; j < bs; j++) sum[j] = 0.0;
1266       /*-- col sum --*/
1267       i = jl[k]; /* first |A(i,k)| to be added */
1268       /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1269                   at step k */
1270       while (i < mbs) {
1271         nexti = jl[i]; /* next block row to be added */
1272         ik    = il[i]; /* block index of A(i,k) in the array a */
1273         for (j = 0; j < bs; j++) {
1274           v = a->a + ik * bs2 + j * bs;
1275           for (k1 = 0; k1 < bs; k1++) {
1276             sum[j] += PetscAbsScalar(*v);
1277             v++;
1278           }
1279         }
1280         /* update il, jl */
1281         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1282         jmax = a->i[i + 1];
1283         if (jmin < jmax) {
1284           il[i] = jmin;
1285           j     = a->j[jmin];
1286           jl[i] = jl[j];
1287           jl[j] = i;
1288         }
1289         i = nexti;
1290       }
1291       /*-- row sum --*/
1292       jmin = a->i[k];
1293       jmax = a->i[k + 1];
1294       for (i = jmin; i < jmax; i++) {
1295         for (j = 0; j < bs; j++) {
1296           v = a->a + i * bs2 + j;
1297           for (k1 = 0; k1 < bs; k1++) {
1298             sum[j] += PetscAbsScalar(*v);
1299             v += bs;
1300           }
1301         }
1302       }
1303       /* add k_th block row to il, jl */
1304       col = aj + jmin;
1305       if (jmax - jmin > 0 && *col == k) jmin++;
1306       if (jmin < jmax) {
1307         il[k] = jmin;
1308         j     = a->j[jmin];
1309         jl[k] = jl[j];
1310         jl[j] = k;
1311       }
1312       for (j = 0; j < bs; j++) {
1313         if (sum[j] > *norm) *norm = sum[j];
1314       }
1315     }
1316     PetscCall(PetscFree3(sum, il, jl));
1317     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1318   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1319   PetscFunctionReturn(PETSC_SUCCESS);
1320 }
1321 
1322 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1323 {
1324   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1325 
1326   PetscFunctionBegin;
1327   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1328   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1329     *flg = PETSC_FALSE;
1330     PetscFunctionReturn(PETSC_SUCCESS);
1331   }
1332 
1333   /* if the a->i are the same */
1334   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1335   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1336 
1337   /* if a->j are the same */
1338   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1339   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1340 
1341   /* if a->a are the same */
1342   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1343   PetscFunctionReturn(PETSC_SUCCESS);
1344 }
1345 
1346 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1347 {
1348   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1349   PetscInt         i, j, k, row, bs, ambs, bs2;
1350   const PetscInt  *ai, *aj;
1351   PetscScalar     *x, zero = 0.0;
1352   const MatScalar *aa, *aa_j;
1353 
1354   PetscFunctionBegin;
1355   bs = A->rmap->bs;
1356   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1357 
1358   aa   = a->a;
1359   ambs = a->mbs;
1360 
1361   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1362     PetscInt *diag = a->diag;
1363     aa             = a->a;
1364     ambs           = a->mbs;
1365     PetscCall(VecGetArray(v, &x));
1366     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1367     PetscCall(VecRestoreArray(v, &x));
1368     PetscFunctionReturn(PETSC_SUCCESS);
1369   }
1370 
1371   ai  = a->i;
1372   aj  = a->j;
1373   bs2 = a->bs2;
1374   PetscCall(VecSet(v, zero));
1375   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1376   PetscCall(VecGetArray(v, &x));
1377   for (i = 0; i < ambs; i++) {
1378     j = ai[i];
1379     if (aj[j] == i) { /* if this is a diagonal element */
1380       row  = i * bs;
1381       aa_j = aa + j * bs2;
1382       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1383     }
1384   }
1385   PetscCall(VecRestoreArray(v, &x));
1386   PetscFunctionReturn(PETSC_SUCCESS);
1387 }
1388 
1389 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1390 {
1391   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1392   PetscScalar        x;
1393   const PetscScalar *l, *li, *ri;
1394   MatScalar         *aa, *v;
1395   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1396   const PetscInt    *ai, *aj;
1397   PetscBool          flg;
1398 
1399   PetscFunctionBegin;
1400   if (ll != rr) {
1401     PetscCall(VecEqual(ll, rr, &flg));
1402     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1403   }
1404   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1405   ai  = a->i;
1406   aj  = a->j;
1407   aa  = a->a;
1408   m   = A->rmap->N;
1409   bs  = A->rmap->bs;
1410   mbs = a->mbs;
1411   bs2 = a->bs2;
1412 
1413   PetscCall(VecGetArrayRead(ll, &l));
1414   PetscCall(VecGetLocalSize(ll, &lm));
1415   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1416   for (i = 0; i < mbs; i++) { /* for each block row */
1417     M  = ai[i + 1] - ai[i];
1418     li = l + i * bs;
1419     v  = aa + bs2 * ai[i];
1420     for (j = 0; j < M; j++) { /* for each block */
1421       ri = l + bs * aj[ai[i] + j];
1422       for (k = 0; k < bs; k++) {
1423         x = ri[k];
1424         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1425       }
1426     }
1427   }
1428   PetscCall(VecRestoreArrayRead(ll, &l));
1429   PetscCall(PetscLogFlops(2.0 * a->nz));
1430   PetscFunctionReturn(PETSC_SUCCESS);
1431 }
1432 
1433 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1434 {
1435   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1436 
1437   PetscFunctionBegin;
1438   info->block_size   = a->bs2;
1439   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1440   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1441   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1442   info->assemblies   = A->num_ass;
1443   info->mallocs      = A->info.mallocs;
1444   info->memory       = 0; /* REVIEW ME */
1445   if (A->factortype) {
1446     info->fill_ratio_given  = A->info.fill_ratio_given;
1447     info->fill_ratio_needed = A->info.fill_ratio_needed;
1448     info->factor_mallocs    = A->info.factor_mallocs;
1449   } else {
1450     info->fill_ratio_given  = 0;
1451     info->fill_ratio_needed = 0;
1452     info->factor_mallocs    = 0;
1453   }
1454   PetscFunctionReturn(PETSC_SUCCESS);
1455 }
1456 
1457 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1458 {
1459   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1460 
1461   PetscFunctionBegin;
1462   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1463   PetscFunctionReturn(PETSC_SUCCESS);
1464 }
1465 
1466 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1467 {
1468   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1469   PetscInt         i, j, n, row, col, bs, mbs;
1470   const PetscInt  *ai, *aj;
1471   PetscReal        atmp;
1472   const MatScalar *aa;
1473   PetscScalar     *x;
1474   PetscInt         ncols, brow, bcol, krow, kcol;
1475 
1476   PetscFunctionBegin;
1477   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1478   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1479   bs  = A->rmap->bs;
1480   aa  = a->a;
1481   ai  = a->i;
1482   aj  = a->j;
1483   mbs = a->mbs;
1484 
1485   PetscCall(VecSet(v, 0.0));
1486   PetscCall(VecGetArray(v, &x));
1487   PetscCall(VecGetLocalSize(v, &n));
1488   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1489   for (i = 0; i < mbs; i++) {
1490     ncols = ai[1] - ai[0];
1491     ai++;
1492     brow = bs * i;
1493     for (j = 0; j < ncols; j++) {
1494       bcol = bs * (*aj);
1495       for (kcol = 0; kcol < bs; kcol++) {
1496         col = bcol + kcol; /* col index */
1497         for (krow = 0; krow < bs; krow++) {
1498           atmp = PetscAbsScalar(*aa);
1499           aa++;
1500           row = brow + krow; /* row index */
1501           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1502           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1503         }
1504       }
1505       aj++;
1506     }
1507   }
1508   PetscCall(VecRestoreArray(v, &x));
1509   PetscFunctionReturn(PETSC_SUCCESS);
1510 }
1511 
1512 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1513 {
1514   PetscFunctionBegin;
1515   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1516   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1517   PetscFunctionReturn(PETSC_SUCCESS);
1518 }
1519 
1520 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1521 {
1522   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1523   PetscScalar       *z = c;
1524   const PetscScalar *xb;
1525   PetscScalar        x1;
1526   const MatScalar   *v   = a->a, *vv;
1527   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1528 #if defined(PETSC_USE_COMPLEX)
1529   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1530 #else
1531   const int aconj = 0;
1532 #endif
1533 
1534   PetscFunctionBegin;
1535   for (i = 0; i < mbs; i++) {
1536     n = ii[1] - ii[0];
1537     ii++;
1538     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1539     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1540     jj = idx;
1541     vv = v;
1542     for (k = 0; k < cn; k++) {
1543       idx = jj;
1544       v   = vv;
1545       for (j = 0; j < n; j++) {
1546         xb = b + (*idx);
1547         x1 = xb[0 + k * bm];
1548         z[0 + k * cm] += v[0] * x1;
1549         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1550         v += 1;
1551         ++idx;
1552       }
1553     }
1554     z += 1;
1555   }
1556   PetscFunctionReturn(PETSC_SUCCESS);
1557 }
1558 
1559 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1560 {
1561   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1562   PetscScalar       *z = c;
1563   const PetscScalar *xb;
1564   PetscScalar        x1, x2;
1565   const MatScalar   *v   = a->a, *vv;
1566   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1567 
1568   PetscFunctionBegin;
1569   for (i = 0; i < mbs; i++) {
1570     n = ii[1] - ii[0];
1571     ii++;
1572     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1573     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1574     jj = idx;
1575     vv = v;
1576     for (k = 0; k < cn; k++) {
1577       idx = jj;
1578       v   = vv;
1579       for (j = 0; j < n; j++) {
1580         xb = b + 2 * (*idx);
1581         x1 = xb[0 + k * bm];
1582         x2 = xb[1 + k * bm];
1583         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1584         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1585         if (*idx != i) {
1586           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1587           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1588         }
1589         v += 4;
1590         ++idx;
1591       }
1592     }
1593     z += 2;
1594   }
1595   PetscFunctionReturn(PETSC_SUCCESS);
1596 }
1597 
1598 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1599 {
1600   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1601   PetscScalar       *z = c;
1602   const PetscScalar *xb;
1603   PetscScalar        x1, x2, x3;
1604   const MatScalar   *v   = a->a, *vv;
1605   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1606 
1607   PetscFunctionBegin;
1608   for (i = 0; i < mbs; i++) {
1609     n = ii[1] - ii[0];
1610     ii++;
1611     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1612     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1613     jj = idx;
1614     vv = v;
1615     for (k = 0; k < cn; k++) {
1616       idx = jj;
1617       v   = vv;
1618       for (j = 0; j < n; j++) {
1619         xb = b + 3 * (*idx);
1620         x1 = xb[0 + k * bm];
1621         x2 = xb[1 + k * bm];
1622         x3 = xb[2 + k * bm];
1623         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1624         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1625         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1626         if (*idx != i) {
1627           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1628           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1629           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1630         }
1631         v += 9;
1632         ++idx;
1633       }
1634     }
1635     z += 3;
1636   }
1637   PetscFunctionReturn(PETSC_SUCCESS);
1638 }
1639 
1640 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1641 {
1642   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1643   PetscScalar       *z = c;
1644   const PetscScalar *xb;
1645   PetscScalar        x1, x2, x3, x4;
1646   const MatScalar   *v   = a->a, *vv;
1647   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1648 
1649   PetscFunctionBegin;
1650   for (i = 0; i < mbs; i++) {
1651     n = ii[1] - ii[0];
1652     ii++;
1653     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1654     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1655     jj = idx;
1656     vv = v;
1657     for (k = 0; k < cn; k++) {
1658       idx = jj;
1659       v   = vv;
1660       for (j = 0; j < n; j++) {
1661         xb = b + 4 * (*idx);
1662         x1 = xb[0 + k * bm];
1663         x2 = xb[1 + k * bm];
1664         x3 = xb[2 + k * bm];
1665         x4 = xb[3 + k * bm];
1666         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1667         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1668         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1669         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1670         if (*idx != i) {
1671           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1672           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1673           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1674           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1675         }
1676         v += 16;
1677         ++idx;
1678       }
1679     }
1680     z += 4;
1681   }
1682   PetscFunctionReturn(PETSC_SUCCESS);
1683 }
1684 
1685 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1686 {
1687   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1688   PetscScalar       *z = c;
1689   const PetscScalar *xb;
1690   PetscScalar        x1, x2, x3, x4, x5;
1691   const MatScalar   *v   = a->a, *vv;
1692   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1693 
1694   PetscFunctionBegin;
1695   for (i = 0; i < mbs; i++) {
1696     n = ii[1] - ii[0];
1697     ii++;
1698     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1699     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1700     jj = idx;
1701     vv = v;
1702     for (k = 0; k < cn; k++) {
1703       idx = jj;
1704       v   = vv;
1705       for (j = 0; j < n; j++) {
1706         xb = b + 5 * (*idx);
1707         x1 = xb[0 + k * bm];
1708         x2 = xb[1 + k * bm];
1709         x3 = xb[2 + k * bm];
1710         x4 = xb[3 + k * bm];
1711         x5 = xb[4 + k * cm];
1712         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1713         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1714         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1715         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1716         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1717         if (*idx != i) {
1718           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1719           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1720           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1721           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1722           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1723         }
1724         v += 25;
1725         ++idx;
1726       }
1727     }
1728     z += 5;
1729   }
1730   PetscFunctionReturn(PETSC_SUCCESS);
1731 }
1732 
1733 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1734 {
1735   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1736   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1737   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1738   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1739   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1740   PetscBLASInt     bbs, bcn, bbm, bcm;
1741   PetscScalar     *z = NULL;
1742   PetscScalar     *c, *b;
1743   const MatScalar *v;
1744   const PetscInt  *idx, *ii;
1745   PetscScalar      _DOne = 1.0;
1746 
1747   PetscFunctionBegin;
1748   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1749   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1750   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1751   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1752   b = bd->v;
1753   PetscCall(MatZeroEntries(C));
1754   PetscCall(MatDenseGetArray(C, &c));
1755   switch (bs) {
1756   case 1:
1757     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1758     break;
1759   case 2:
1760     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1761     break;
1762   case 3:
1763     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1764     break;
1765   case 4:
1766     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1767     break;
1768   case 5:
1769     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1770     break;
1771   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1772     PetscCall(PetscBLASIntCast(bs, &bbs));
1773     PetscCall(PetscBLASIntCast(cn, &bcn));
1774     PetscCall(PetscBLASIntCast(bm, &bbm));
1775     PetscCall(PetscBLASIntCast(cm, &bcm));
1776     idx = a->j;
1777     v   = a->a;
1778     mbs = a->mbs;
1779     ii  = a->i;
1780     z   = c;
1781     for (i = 0; i < mbs; i++) {
1782       n = ii[1] - ii[0];
1783       ii++;
1784       for (j = 0; j < n; j++) {
1785         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1786         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1787         v += bs2;
1788       }
1789       z += bs;
1790     }
1791   }
1792   PetscCall(MatDenseRestoreArray(C, &c));
1793   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1794   PetscFunctionReturn(PETSC_SUCCESS);
1795 }
1796