xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision ae1ee55146a7ad071171b861759b85940c7e5c67) !
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <../src/mat/impls/sbaij/seq/sbaij.h>
4 #include <petsc/private/kernels/blockinvert.h>
5 #include <petscbt.h>
6 #include <petscblaslapack.h>
7 
MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)8 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
9 {
10   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
11   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
12   const PetscInt *idx;
13   PetscBT         table_out, table_in;
14 
15   PetscFunctionBegin;
16   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
17   mbs = a->mbs;
18   ai  = a->i;
19   aj  = a->j;
20   bs  = A->rmap->bs;
21   PetscCall(PetscBTCreate(mbs, &table_out));
22   PetscCall(PetscMalloc1(mbs + 1, &nidx));
23   PetscCall(PetscBTCreate(mbs, &table_in));
24 
25   for (i = 0; i < is_max; i++) { /* for each is */
26     isz = 0;
27     PetscCall(PetscBTMemzero(mbs, table_out));
28 
29     /* Extract the indices, assume there can be duplicate entries */
30     PetscCall(ISGetIndices(is[i], &idx));
31     PetscCall(ISGetLocalSize(is[i], &n));
32 
33     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
34     bcol_max = 0;
35     for (j = 0; j < n; ++j) {
36       brow = idx[j] / bs; /* convert the indices into block indices */
37       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
38       if (!PetscBTLookupSet(table_out, brow)) {
39         nidx[isz++] = brow;
40         if (bcol_max < brow) bcol_max = brow;
41       }
42     }
43     PetscCall(ISRestoreIndices(is[i], &idx));
44     PetscCall(ISDestroy(&is[i]));
45 
46     k = 0;
47     for (j = 0; j < ov; j++) { /* for each overlap */
48       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
49       PetscCall(PetscBTMemzero(mbs, table_in));
50       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
51 
52       n = isz; /* length of the updated is[i] */
53       for (brow = 0; brow < mbs; brow++) {
54         start = ai[brow];
55         end   = ai[brow + 1];
56         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
57           for (l = start; l < end; l++) {
58             bcol = aj[l];
59             if (!PetscBTLookupSet(table_out, bcol)) {
60               nidx[isz++] = bcol;
61               if (bcol_max < bcol) bcol_max = bcol;
62             }
63           }
64           k++;
65           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
67           for (l = start; l < end; l++) {
68             bcol = aj[l];
69             if (bcol > bcol_max) break;
70             if (PetscBTLookup(table_in, bcol)) {
71               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
72               break; /* for l = start; l<end ; l++) */
73             }
74           }
75         }
76       }
77     } /* for each overlap */
78     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
79   } /* for each is */
80   PetscCall(PetscBTDestroy(&table_out));
81   PetscCall(PetscFree(nidx));
82   PetscCall(PetscBTDestroy(&table_in));
83   PetscFunctionReturn(PETSC_SUCCESS);
84 }
85 
86 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
87         Zero some ops' to avoid invalid use */
MatSeqSBAIJZeroOps_Private(Mat Bseq)88 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
89 {
90   PetscFunctionBegin;
91   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
92   Bseq->ops->mult                   = NULL;
93   Bseq->ops->multadd                = NULL;
94   Bseq->ops->multtranspose          = NULL;
95   Bseq->ops->multtransposeadd       = NULL;
96   Bseq->ops->lufactor               = NULL;
97   Bseq->ops->choleskyfactor         = NULL;
98   Bseq->ops->lufactorsymbolic       = NULL;
99   Bseq->ops->choleskyfactorsymbolic = NULL;
100   Bseq->ops->getinertia             = NULL;
101   PetscFunctionReturn(PETSC_SUCCESS);
102 }
103 
104 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B,PetscBool sym)105 static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
106 {
107   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
108   Mat_SeqBAIJ    *d = NULL;
109   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111   const PetscInt *irow, *icol;
112   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113   PetscInt       *aj = a->j, *ai = a->i;
114   MatScalar      *mat_a;
115   Mat             C;
116   PetscBool       flag;
117 
118   PetscFunctionBegin;
119   PetscCall(ISGetIndices(isrow, &irow));
120   PetscCall(ISGetIndices(iscol, &icol));
121   PetscCall(ISGetLocalSize(isrow, &nrows));
122   PetscCall(ISGetLocalSize(iscol, &ncols));
123 
124   PetscCall(PetscCalloc1(1 + oldcols, &smap));
125   ssmap = smap;
126   PetscCall(PetscMalloc1(1 + nrows, &lens));
127   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
128   /* determine lens of each row */
129   for (i = 0; i < nrows; i++) {
130     kstart  = ai[irow[i]];
131     kend    = kstart + a->ilen[irow[i]];
132     lens[i] = 0;
133     for (k = kstart; k < kend; k++) {
134       if (ssmap[aj[k]]) lens[i]++;
135     }
136   }
137   /* Create and fill new matrix */
138   if (scall == MAT_REUSE_MATRIX) {
139     if (sym) {
140       c = (Mat_SeqSBAIJ *)((*B)->data);
141 
142       PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143       PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
145       PetscCall(PetscArrayzero(c->ilen, c->mbs));
146     } else {
147       d = (Mat_SeqBAIJ *)((*B)->data);
148 
149       PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
150       PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
151       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
152       PetscCall(PetscArrayzero(d->ilen, d->mbs));
153     }
154     C = *B;
155   } else {
156     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
157     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
158     if (sym) {
159       PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
160       PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
161     } else {
162       PetscCall(MatSetType(C, MATSEQBAIJ));
163       PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
164     }
165   }
166   if (sym) c = (Mat_SeqSBAIJ *)C->data;
167   else d = (Mat_SeqBAIJ *)C->data;
168   for (i = 0; i < nrows; i++) {
169     row    = irow[i];
170     kstart = ai[row];
171     kend   = kstart + a->ilen[row];
172     if (sym) {
173       mat_i    = c->i[i];
174       mat_j    = PetscSafePointerPlusOffset(c->j, mat_i);
175       mat_a    = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
176       mat_ilen = c->ilen + i;
177     } else {
178       mat_i    = d->i[i];
179       mat_j    = PetscSafePointerPlusOffset(d->j, mat_i);
180       mat_a    = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
181       mat_ilen = d->ilen + i;
182     }
183     for (k = kstart; k < kend; k++) {
184       if ((tcol = ssmap[a->j[k]])) {
185         *mat_j++ = tcol - 1;
186         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
187         mat_a += bs2;
188         (*mat_ilen)++;
189       }
190     }
191   }
192   /* sort */
193   {
194     MatScalar *work;
195 
196     PetscCall(PetscMalloc1(bs2, &work));
197     for (i = 0; i < nrows; i++) {
198       PetscInt ilen;
199       if (sym) {
200         mat_i = c->i[i];
201         mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
202         mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
203         ilen  = c->ilen[i];
204       } else {
205         mat_i = d->i[i];
206         mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
207         mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
208         ilen  = d->ilen[i];
209       }
210       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
211     }
212     PetscCall(PetscFree(work));
213   }
214 
215   /* Free work space */
216   PetscCall(ISRestoreIndices(iscol, &icol));
217   PetscCall(PetscFree(smap));
218   PetscCall(PetscFree(lens));
219   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
220   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
221 
222   PetscCall(ISRestoreIndices(isrow, &irow));
223   *B = C;
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)227 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
228 {
229   Mat       C[2];
230   IS        is1, is2, intersect = NULL;
231   PetscInt  n1, n2, ni;
232   PetscBool sym = PETSC_TRUE;
233 
234   PetscFunctionBegin;
235   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
236   if (isrow == iscol) {
237     is2 = is1;
238     PetscCall(PetscObjectReference((PetscObject)is2));
239   } else {
240     PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
241     PetscCall(ISIntersect(is1, is2, &intersect));
242     PetscCall(ISGetLocalSize(intersect, &ni));
243     PetscCall(ISDestroy(&intersect));
244     if (ni == 0) sym = PETSC_FALSE;
245     else if (PetscDefined(USE_DEBUG)) {
246       PetscCall(ISGetLocalSize(is1, &n1));
247       PetscCall(ISGetLocalSize(is2, &n2));
248       PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
249     }
250   }
251   if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
252   else {
253     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
254     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
255     PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
256     PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
257     PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
258     if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
259     else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
260     else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
261     PetscCall(MatDestroy(C));
262     PetscCall(MatDestroy(C + 1));
263   }
264   PetscCall(ISDestroy(&is1));
265   PetscCall(ISDestroy(&is2));
266 
267   if (sym && isrow != iscol) {
268     PetscBool isequal;
269     PetscCall(ISEqual(isrow, iscol, &isequal));
270     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
271   }
272   PetscFunctionReturn(PETSC_SUCCESS);
273 }
274 
MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat * B[])275 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
276 {
277   PetscInt i;
278 
279   PetscFunctionBegin;
280   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
281 
282   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
283   PetscFunctionReturn(PETSC_SUCCESS);
284 }
285 
286 /* Should check that shapes of vectors and matrices match */
MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)287 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
288 {
289   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
290   PetscScalar       *z, x1, x2, zero = 0.0;
291   const PetscScalar *x, *xb;
292   const MatScalar   *v;
293   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
294   const PetscInt    *aj = a->j, *ai = a->i, *ib;
295   PetscInt           nonzerorow = 0;
296 
297   PetscFunctionBegin;
298   PetscCall(VecSet(zz, zero));
299   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
300   PetscCall(VecGetArrayRead(xx, &x));
301   PetscCall(VecGetArray(zz, &z));
302 
303   v  = a->a;
304   xb = x;
305 
306   for (i = 0; i < mbs; i++) {
307     n    = ai[1] - ai[0]; /* length of i_th block row of A */
308     x1   = xb[0];
309     x2   = xb[1];
310     ib   = aj + *ai;
311     jmin = 0;
312     nonzerorow += (n > 0);
313     if (*ib == i) { /* (diag of A)*x */
314       z[2 * i] += v[0] * x1 + v[2] * x2;
315       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
316       v += 4;
317       jmin++;
318     }
319     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
321     for (j = jmin; j < n; j++) {
322       /* (strict lower triangular part of A)*x  */
323       cval = ib[j] * 2;
324       z[cval] += v[0] * x1 + v[1] * x2;
325       z[cval + 1] += v[2] * x1 + v[3] * x2;
326       /* (strict upper triangular part of A)*x  */
327       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
328       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
329       v += 4;
330     }
331     xb += 2;
332     ai++;
333   }
334 
335   PetscCall(VecRestoreArrayRead(xx, &x));
336   PetscCall(VecRestoreArray(zz, &z));
337   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
338   PetscFunctionReturn(PETSC_SUCCESS);
339 }
340 
MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)341 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
342 {
343   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
344   PetscScalar       *z, x1, x2, x3, zero = 0.0;
345   const PetscScalar *x, *xb;
346   const MatScalar   *v;
347   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
348   const PetscInt    *aj = a->j, *ai = a->i, *ib;
349   PetscInt           nonzerorow = 0;
350 
351   PetscFunctionBegin;
352   PetscCall(VecSet(zz, zero));
353   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
354   PetscCall(VecGetArrayRead(xx, &x));
355   PetscCall(VecGetArray(zz, &z));
356 
357   v  = a->a;
358   xb = x;
359 
360   for (i = 0; i < mbs; i++) {
361     n    = ai[1] - ai[0]; /* length of i_th block row of A */
362     x1   = xb[0];
363     x2   = xb[1];
364     x3   = xb[2];
365     ib   = aj + *ai;
366     jmin = 0;
367     nonzerorow += (n > 0);
368     if (*ib == i) { /* (diag of A)*x */
369       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
370       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
371       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
372       v += 9;
373       jmin++;
374     }
375     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
376     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
377     for (j = jmin; j < n; j++) {
378       /* (strict lower triangular part of A)*x  */
379       cval = ib[j] * 3;
380       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
381       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
382       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
383       /* (strict upper triangular part of A)*x  */
384       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
385       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
386       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
387       v += 9;
388     }
389     xb += 3;
390     ai++;
391   }
392 
393   PetscCall(VecRestoreArrayRead(xx, &x));
394   PetscCall(VecRestoreArray(zz, &z));
395   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
396   PetscFunctionReturn(PETSC_SUCCESS);
397 }
398 
MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)399 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
400 {
401   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
402   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
403   const PetscScalar *x, *xb;
404   const MatScalar   *v;
405   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
406   const PetscInt    *aj = a->j, *ai = a->i, *ib;
407   PetscInt           nonzerorow = 0;
408 
409   PetscFunctionBegin;
410   PetscCall(VecSet(zz, zero));
411   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
412   PetscCall(VecGetArrayRead(xx, &x));
413   PetscCall(VecGetArray(zz, &z));
414 
415   v  = a->a;
416   xb = x;
417 
418   for (i = 0; i < mbs; i++) {
419     n    = ai[1] - ai[0]; /* length of i_th block row of A */
420     x1   = xb[0];
421     x2   = xb[1];
422     x3   = xb[2];
423     x4   = xb[3];
424     ib   = aj + *ai;
425     jmin = 0;
426     nonzerorow += (n > 0);
427     if (*ib == i) { /* (diag of A)*x */
428       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
429       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
430       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
431       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
432       v += 16;
433       jmin++;
434     }
435     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
436     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
437     for (j = jmin; j < n; j++) {
438       /* (strict lower triangular part of A)*x  */
439       cval = ib[j] * 4;
440       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
441       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
442       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
443       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
444       /* (strict upper triangular part of A)*x  */
445       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
446       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
447       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
448       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
449       v += 16;
450     }
451     xb += 4;
452     ai++;
453   }
454 
455   PetscCall(VecRestoreArrayRead(xx, &x));
456   PetscCall(VecRestoreArray(zz, &z));
457   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
458   PetscFunctionReturn(PETSC_SUCCESS);
459 }
460 
MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)461 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
462 {
463   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
464   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
465   const PetscScalar *x, *xb;
466   const MatScalar   *v;
467   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
468   const PetscInt    *aj = a->j, *ai = a->i, *ib;
469   PetscInt           nonzerorow = 0;
470 
471   PetscFunctionBegin;
472   PetscCall(VecSet(zz, zero));
473   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
474   PetscCall(VecGetArrayRead(xx, &x));
475   PetscCall(VecGetArray(zz, &z));
476 
477   v  = a->a;
478   xb = x;
479 
480   for (i = 0; i < mbs; i++) {
481     n    = ai[1] - ai[0]; /* length of i_th block row of A */
482     x1   = xb[0];
483     x2   = xb[1];
484     x3   = xb[2];
485     x4   = xb[3];
486     x5   = xb[4];
487     ib   = aj + *ai;
488     jmin = 0;
489     nonzerorow += (n > 0);
490     if (*ib == i) { /* (diag of A)*x */
491       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
492       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
493       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
494       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
495       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
496       v += 25;
497       jmin++;
498     }
499     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
500     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
501     for (j = jmin; j < n; j++) {
502       /* (strict lower triangular part of A)*x  */
503       cval = ib[j] * 5;
504       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
505       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
506       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
507       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
508       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
509       /* (strict upper triangular part of A)*x  */
510       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
511       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
512       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
513       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
514       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
515       v += 25;
516     }
517     xb += 5;
518     ai++;
519   }
520 
521   PetscCall(VecRestoreArrayRead(xx, &x));
522   PetscCall(VecRestoreArray(zz, &z));
523   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
524   PetscFunctionReturn(PETSC_SUCCESS);
525 }
526 
MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)527 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
528 {
529   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
530   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
531   const PetscScalar *x, *xb;
532   const MatScalar   *v;
533   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
534   const PetscInt    *aj = a->j, *ai = a->i, *ib;
535   PetscInt           nonzerorow = 0;
536 
537   PetscFunctionBegin;
538   PetscCall(VecSet(zz, zero));
539   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
540   PetscCall(VecGetArrayRead(xx, &x));
541   PetscCall(VecGetArray(zz, &z));
542 
543   v  = a->a;
544   xb = x;
545 
546   for (i = 0; i < mbs; i++) {
547     n    = ai[1] - ai[0]; /* length of i_th block row of A */
548     x1   = xb[0];
549     x2   = xb[1];
550     x3   = xb[2];
551     x4   = xb[3];
552     x5   = xb[4];
553     x6   = xb[5];
554     ib   = aj + *ai;
555     jmin = 0;
556     nonzerorow += (n > 0);
557     if (*ib == i) { /* (diag of A)*x */
558       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
559       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
560       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
561       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
562       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
563       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
564       v += 36;
565       jmin++;
566     }
567     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
568     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
569     for (j = jmin; j < n; j++) {
570       /* (strict lower triangular part of A)*x  */
571       cval = ib[j] * 6;
572       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
573       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
574       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
575       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
576       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
577       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
578       /* (strict upper triangular part of A)*x  */
579       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
580       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
581       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
582       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
583       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
584       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
585       v += 36;
586     }
587     xb += 6;
588     ai++;
589   }
590 
591   PetscCall(VecRestoreArrayRead(xx, &x));
592   PetscCall(VecRestoreArray(zz, &z));
593   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
594   PetscFunctionReturn(PETSC_SUCCESS);
595 }
596 
MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)597 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
598 {
599   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
600   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
601   const PetscScalar *x, *xb;
602   const MatScalar   *v;
603   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
604   const PetscInt    *aj = a->j, *ai = a->i, *ib;
605   PetscInt           nonzerorow = 0;
606 
607   PetscFunctionBegin;
608   PetscCall(VecSet(zz, zero));
609   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
610   PetscCall(VecGetArrayRead(xx, &x));
611   PetscCall(VecGetArray(zz, &z));
612 
613   v  = a->a;
614   xb = x;
615 
616   for (i = 0; i < mbs; i++) {
617     n    = ai[1] - ai[0]; /* length of i_th block row of A */
618     x1   = xb[0];
619     x2   = xb[1];
620     x3   = xb[2];
621     x4   = xb[3];
622     x5   = xb[4];
623     x6   = xb[5];
624     x7   = xb[6];
625     ib   = aj + *ai;
626     jmin = 0;
627     nonzerorow += (n > 0);
628     if (*ib == i) { /* (diag of A)*x */
629       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
630       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
631       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
632       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
633       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
634       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
635       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
636       v += 49;
637       jmin++;
638     }
639     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
640     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
641     for (j = jmin; j < n; j++) {
642       /* (strict lower triangular part of A)*x  */
643       cval = ib[j] * 7;
644       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
645       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
646       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
647       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
648       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
649       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
650       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
651       /* (strict upper triangular part of A)*x  */
652       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
653       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
654       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
655       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
656       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
657       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
658       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
659       v += 49;
660     }
661     xb += 7;
662     ai++;
663   }
664   PetscCall(VecRestoreArrayRead(xx, &x));
665   PetscCall(VecRestoreArray(zz, &z));
666   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
667   PetscFunctionReturn(PETSC_SUCCESS);
668 }
669 
670 /*
671     This will not work with MatScalar == float because it calls the BLAS
672 */
MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)673 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
674 {
675   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
676   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
677   const PetscScalar *x, *x_ptr, *xb;
678   const MatScalar   *v;
679   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
680   const PetscInt    *idx, *aj, *ii;
681   PetscInt           nonzerorow = 0;
682 
683   PetscFunctionBegin;
684   PetscCall(VecSet(zz, zero));
685   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
686   PetscCall(VecGetArrayRead(xx, &x));
687   PetscCall(VecGetArray(zz, &z));
688 
689   x_ptr = x;
690   z_ptr = z;
691 
692   aj = a->j;
693   v  = a->a;
694   ii = a->i;
695 
696   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
697   work = a->mult_work;
698 
699   for (i = 0; i < mbs; i++) {
700     n     = ii[1] - ii[0];
701     ncols = n * bs;
702     workt = work;
703     idx   = aj + ii[0];
704     nonzerorow += (n > 0);
705 
706     /* upper triangular part */
707     for (j = 0; j < n; j++) {
708       xb = x_ptr + bs * (*idx++);
709       for (k = 0; k < bs; k++) workt[k] = xb[k];
710       workt += bs;
711     }
712     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
713     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
714 
715     /* strict lower triangular part */
716     idx = aj + ii[0];
717     if (n && *idx == i) {
718       ncols -= bs;
719       v += bs2;
720       idx++;
721       n--;
722     }
723 
724     if (ncols > 0) {
725       workt = work;
726       PetscCall(PetscArrayzero(workt, ncols));
727       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
728       for (j = 0; j < n; j++) {
729         zb = z_ptr + bs * (*idx++);
730         for (k = 0; k < bs; k++) zb[k] += workt[k];
731         workt += bs;
732       }
733     }
734     x += bs;
735     v += n * bs2;
736     z += bs;
737     ii++;
738   }
739 
740   PetscCall(VecRestoreArrayRead(xx, &x));
741   PetscCall(VecRestoreArray(zz, &z));
742   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
743   PetscFunctionReturn(PETSC_SUCCESS);
744 }
745 
MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)746 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
747 {
748   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
749   PetscScalar       *z, x1;
750   const PetscScalar *x, *xb;
751   const MatScalar   *v;
752   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
753   const PetscInt    *aj = a->j, *ai = a->i, *ib;
754   PetscInt           nonzerorow = 0;
755   const int          aconj      = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0;
756 
757   PetscFunctionBegin;
758   PetscCall(VecCopy(yy, zz));
759   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
760   PetscCall(VecGetArrayRead(xx, &x));
761   PetscCall(VecGetArray(zz, &z));
762   v  = a->a;
763   xb = x;
764 
765   for (i = 0; i < mbs; i++) {
766     n    = ai[1] - ai[0]; /* length of i_th row of A */
767     x1   = xb[0];
768     ib   = aj + *ai;
769     jmin = 0;
770     nonzerorow += (n > 0);
771     if (n && *ib == i) { /* (diag of A)*x */
772       z[i] += *v++ * x[*ib++];
773       jmin++;
774     }
775     if (aconj) {
776       for (j = jmin; j < n; j++) {
777         cval = *ib;
778         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
779         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
780       }
781     } else {
782       for (j = jmin; j < n; j++) {
783         cval = *ib;
784         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
785         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
786       }
787     }
788     xb++;
789     ai++;
790   }
791 
792   PetscCall(VecRestoreArrayRead(xx, &x));
793   PetscCall(VecRestoreArray(zz, &z));
794 
795   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
796   PetscFunctionReturn(PETSC_SUCCESS);
797 }
798 
MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)799 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
800 {
801   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
802   PetscScalar       *z, x1, x2;
803   const PetscScalar *x, *xb;
804   const MatScalar   *v;
805   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
806   const PetscInt    *aj = a->j, *ai = a->i, *ib;
807   PetscInt           nonzerorow = 0;
808 
809   PetscFunctionBegin;
810   PetscCall(VecCopy(yy, zz));
811   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
812   PetscCall(VecGetArrayRead(xx, &x));
813   PetscCall(VecGetArray(zz, &z));
814 
815   v  = a->a;
816   xb = x;
817 
818   for (i = 0; i < mbs; i++) {
819     n    = ai[1] - ai[0]; /* length of i_th block row of A */
820     x1   = xb[0];
821     x2   = xb[1];
822     ib   = aj + *ai;
823     jmin = 0;
824     nonzerorow += (n > 0);
825     if (n && *ib == i) { /* (diag of A)*x */
826       z[2 * i] += v[0] * x1 + v[2] * x2;
827       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
828       v += 4;
829       jmin++;
830     }
831     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
832     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
833     for (j = jmin; j < n; j++) {
834       /* (strict lower triangular part of A)*x  */
835       cval = ib[j] * 2;
836       z[cval] += v[0] * x1 + v[1] * x2;
837       z[cval + 1] += v[2] * x1 + v[3] * x2;
838       /* (strict upper triangular part of A)*x  */
839       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
840       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
841       v += 4;
842     }
843     xb += 2;
844     ai++;
845   }
846   PetscCall(VecRestoreArrayRead(xx, &x));
847   PetscCall(VecRestoreArray(zz, &z));
848 
849   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
850   PetscFunctionReturn(PETSC_SUCCESS);
851 }
852 
MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)853 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
854 {
855   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
856   PetscScalar       *z, x1, x2, x3;
857   const PetscScalar *x, *xb;
858   const MatScalar   *v;
859   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
860   const PetscInt    *aj = a->j, *ai = a->i, *ib;
861   PetscInt           nonzerorow = 0;
862 
863   PetscFunctionBegin;
864   PetscCall(VecCopy(yy, zz));
865   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
866   PetscCall(VecGetArrayRead(xx, &x));
867   PetscCall(VecGetArray(zz, &z));
868 
869   v  = a->a;
870   xb = x;
871 
872   for (i = 0; i < mbs; i++) {
873     n    = ai[1] - ai[0]; /* length of i_th block row of A */
874     x1   = xb[0];
875     x2   = xb[1];
876     x3   = xb[2];
877     ib   = aj + *ai;
878     jmin = 0;
879     nonzerorow += (n > 0);
880     if (n && *ib == i) { /* (diag of A)*x */
881       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
882       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
883       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
884       v += 9;
885       jmin++;
886     }
887     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
888     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
889     for (j = jmin; j < n; j++) {
890       /* (strict lower triangular part of A)*x  */
891       cval = ib[j] * 3;
892       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
893       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
894       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
895       /* (strict upper triangular part of A)*x  */
896       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
897       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
898       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
899       v += 9;
900     }
901     xb += 3;
902     ai++;
903   }
904 
905   PetscCall(VecRestoreArrayRead(xx, &x));
906   PetscCall(VecRestoreArray(zz, &z));
907 
908   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
909   PetscFunctionReturn(PETSC_SUCCESS);
910 }
911 
MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)912 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
913 {
914   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
915   PetscScalar       *z, x1, x2, x3, x4;
916   const PetscScalar *x, *xb;
917   const MatScalar   *v;
918   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
919   const PetscInt    *aj = a->j, *ai = a->i, *ib;
920   PetscInt           nonzerorow = 0;
921 
922   PetscFunctionBegin;
923   PetscCall(VecCopy(yy, zz));
924   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
925   PetscCall(VecGetArrayRead(xx, &x));
926   PetscCall(VecGetArray(zz, &z));
927 
928   v  = a->a;
929   xb = x;
930 
931   for (i = 0; i < mbs; i++) {
932     n    = ai[1] - ai[0]; /* length of i_th block row of A */
933     x1   = xb[0];
934     x2   = xb[1];
935     x3   = xb[2];
936     x4   = xb[3];
937     ib   = aj + *ai;
938     jmin = 0;
939     nonzerorow += (n > 0);
940     if (n && *ib == i) { /* (diag of A)*x */
941       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
942       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
943       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
944       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
945       v += 16;
946       jmin++;
947     }
948     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
949     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
950     for (j = jmin; j < n; j++) {
951       /* (strict lower triangular part of A)*x  */
952       cval = ib[j] * 4;
953       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
954       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
955       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
956       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
957       /* (strict upper triangular part of A)*x  */
958       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
959       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
960       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
961       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
962       v += 16;
963     }
964     xb += 4;
965     ai++;
966   }
967 
968   PetscCall(VecRestoreArrayRead(xx, &x));
969   PetscCall(VecRestoreArray(zz, &z));
970 
971   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
972   PetscFunctionReturn(PETSC_SUCCESS);
973 }
974 
MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)975 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
976 {
977   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
978   PetscScalar       *z, x1, x2, x3, x4, x5;
979   const PetscScalar *x, *xb;
980   const MatScalar   *v;
981   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
982   const PetscInt    *aj = a->j, *ai = a->i, *ib;
983   PetscInt           nonzerorow = 0;
984 
985   PetscFunctionBegin;
986   PetscCall(VecCopy(yy, zz));
987   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
988   PetscCall(VecGetArrayRead(xx, &x));
989   PetscCall(VecGetArray(zz, &z));
990 
991   v  = a->a;
992   xb = x;
993 
994   for (i = 0; i < mbs; i++) {
995     n    = ai[1] - ai[0]; /* length of i_th block row of A */
996     x1   = xb[0];
997     x2   = xb[1];
998     x3   = xb[2];
999     x4   = xb[3];
1000     x5   = xb[4];
1001     ib   = aj + *ai;
1002     jmin = 0;
1003     nonzerorow += (n > 0);
1004     if (n && *ib == i) { /* (diag of A)*x */
1005       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1006       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1007       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1008       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
1009       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1010       v += 25;
1011       jmin++;
1012     }
1013     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1014     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1015     for (j = jmin; j < n; j++) {
1016       /* (strict lower triangular part of A)*x  */
1017       cval = ib[j] * 5;
1018       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1019       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1020       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1021       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1022       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1023       /* (strict upper triangular part of A)*x  */
1024       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1025       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1026       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1027       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1028       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1029       v += 25;
1030     }
1031     xb += 5;
1032     ai++;
1033   }
1034 
1035   PetscCall(VecRestoreArrayRead(xx, &x));
1036   PetscCall(VecRestoreArray(zz, &z));
1037 
1038   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1039   PetscFunctionReturn(PETSC_SUCCESS);
1040 }
1041 
MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)1042 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1043 {
1044   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1045   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1046   const PetscScalar *x, *xb;
1047   const MatScalar   *v;
1048   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1049   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1050   PetscInt           nonzerorow = 0;
1051 
1052   PetscFunctionBegin;
1053   PetscCall(VecCopy(yy, zz));
1054   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1055   PetscCall(VecGetArrayRead(xx, &x));
1056   PetscCall(VecGetArray(zz, &z));
1057 
1058   v  = a->a;
1059   xb = x;
1060 
1061   for (i = 0; i < mbs; i++) {
1062     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1063     x1   = xb[0];
1064     x2   = xb[1];
1065     x3   = xb[2];
1066     x4   = xb[3];
1067     x5   = xb[4];
1068     x6   = xb[5];
1069     ib   = aj + *ai;
1070     jmin = 0;
1071     nonzerorow += (n > 0);
1072     if (n && *ib == i) { /* (diag of A)*x */
1073       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1074       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1075       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1076       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1077       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1078       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1079       v += 36;
1080       jmin++;
1081     }
1082     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1083     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1084     for (j = jmin; j < n; j++) {
1085       /* (strict lower triangular part of A)*x  */
1086       cval = ib[j] * 6;
1087       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1088       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1089       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1090       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1091       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1092       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1093       /* (strict upper triangular part of A)*x  */
1094       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1095       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1096       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1097       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1098       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1099       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1100       v += 36;
1101     }
1102     xb += 6;
1103     ai++;
1104   }
1105 
1106   PetscCall(VecRestoreArrayRead(xx, &x));
1107   PetscCall(VecRestoreArray(zz, &z));
1108 
1109   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1110   PetscFunctionReturn(PETSC_SUCCESS);
1111 }
1112 
MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)1113 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1114 {
1115   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1116   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1117   const PetscScalar *x, *xb;
1118   const MatScalar   *v;
1119   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1120   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1121   PetscInt           nonzerorow = 0;
1122 
1123   PetscFunctionBegin;
1124   PetscCall(VecCopy(yy, zz));
1125   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1126   PetscCall(VecGetArrayRead(xx, &x));
1127   PetscCall(VecGetArray(zz, &z));
1128 
1129   v  = a->a;
1130   xb = x;
1131 
1132   for (i = 0; i < mbs; i++) {
1133     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1134     x1   = xb[0];
1135     x2   = xb[1];
1136     x3   = xb[2];
1137     x4   = xb[3];
1138     x5   = xb[4];
1139     x6   = xb[5];
1140     x7   = xb[6];
1141     ib   = aj + *ai;
1142     jmin = 0;
1143     nonzerorow += (n > 0);
1144     if (n && *ib == i) { /* (diag of A)*x */
1145       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1146       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1147       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1148       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1149       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1150       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1151       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1152       v += 49;
1153       jmin++;
1154     }
1155     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1156     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1157     for (j = jmin; j < n; j++) {
1158       /* (strict lower triangular part of A)*x  */
1159       cval = ib[j] * 7;
1160       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1161       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1162       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1163       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1164       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1165       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1166       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1167       /* (strict upper triangular part of A)*x  */
1168       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1169       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1170       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1171       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1172       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1173       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1174       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1175       v += 49;
1176     }
1177     xb += 7;
1178     ai++;
1179   }
1180 
1181   PetscCall(VecRestoreArrayRead(xx, &x));
1182   PetscCall(VecRestoreArray(zz, &z));
1183 
1184   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1185   PetscFunctionReturn(PETSC_SUCCESS);
1186 }
1187 
MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)1188 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1189 {
1190   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1191   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1192   const PetscScalar *x, *x_ptr, *xb;
1193   const MatScalar   *v;
1194   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1195   const PetscInt    *idx, *aj, *ii;
1196   PetscInt           nonzerorow = 0;
1197 
1198   PetscFunctionBegin;
1199   PetscCall(VecCopy(yy, zz));
1200   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1201   PetscCall(VecGetArrayRead(xx, &x));
1202   x_ptr = x;
1203   PetscCall(VecGetArray(zz, &z));
1204   z_ptr = z;
1205 
1206   aj = a->j;
1207   v  = a->a;
1208   ii = a->i;
1209 
1210   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1211   work = a->mult_work;
1212 
1213   for (i = 0; i < mbs; i++) {
1214     n     = ii[1] - ii[0];
1215     ncols = n * bs;
1216     workt = work;
1217     idx   = aj + ii[0];
1218     nonzerorow += (n > 0);
1219 
1220     /* upper triangular part */
1221     for (j = 0; j < n; j++) {
1222       xb = x_ptr + bs * (*idx++);
1223       for (k = 0; k < bs; k++) workt[k] = xb[k];
1224       workt += bs;
1225     }
1226     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1227     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1228 
1229     /* strict lower triangular part */
1230     idx = aj + ii[0];
1231     if (n && *idx == i) {
1232       ncols -= bs;
1233       v += bs2;
1234       idx++;
1235       n--;
1236     }
1237     if (ncols > 0) {
1238       workt = work;
1239       PetscCall(PetscArrayzero(workt, ncols));
1240       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1241       for (j = 0; j < n; j++) {
1242         zb = z_ptr + bs * (*idx++);
1243         for (k = 0; k < bs; k++) zb[k] += workt[k];
1244         workt += bs;
1245       }
1246     }
1247 
1248     x += bs;
1249     v += n * bs2;
1250     z += bs;
1251     ii++;
1252   }
1253 
1254   PetscCall(VecRestoreArrayRead(xx, &x));
1255   PetscCall(VecRestoreArray(zz, &z));
1256 
1257   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1258   PetscFunctionReturn(PETSC_SUCCESS);
1259 }
1260 
MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)1261 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1262 {
1263   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1264   PetscScalar   oalpha = alpha;
1265   PetscBLASInt  one    = 1, totalnz;
1266 
1267   PetscFunctionBegin;
1268   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1269   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1270   PetscCall(PetscLogFlops(totalnz));
1271   PetscFunctionReturn(PETSC_SUCCESS);
1272 }
1273 
MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal * norm)1274 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1275 {
1276   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1277   const MatScalar *v        = a->a;
1278   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1279   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1280   const PetscInt  *aj = a->j, *col;
1281 
1282   PetscFunctionBegin;
1283   if (!a->nz) {
1284     *norm = 0.0;
1285     PetscFunctionReturn(PETSC_SUCCESS);
1286   }
1287   if (type == NORM_FROBENIUS) {
1288     for (k = 0; k < mbs; k++) {
1289       jmin = a->i[k];
1290       jmax = a->i[k + 1];
1291       col  = aj + jmin;
1292       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1293         for (i = 0; i < bs2; i++) {
1294           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1295           v++;
1296         }
1297         jmin++;
1298       }
1299       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1300         for (i = 0; i < bs2; i++) {
1301           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1302           v++;
1303         }
1304       }
1305     }
1306     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1307     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1308   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1309     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1310     for (i = 0; i < mbs; i++) jl[i] = mbs;
1311     il[0] = 0;
1312 
1313     *norm = 0.0;
1314     for (k = 0; k < mbs; k++) { /* k_th block row */
1315       for (j = 0; j < bs; j++) sum[j] = 0.0;
1316       /*-- col sum --*/
1317       i = jl[k]; /* first |A(i,k)| to be added */
1318       /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1319                   at step k */
1320       while (i < mbs) {
1321         nexti = jl[i]; /* next block row to be added */
1322         ik    = il[i]; /* block index of A(i,k) in the array a */
1323         for (j = 0; j < bs; j++) {
1324           v = a->a + ik * bs2 + j * bs;
1325           for (k1 = 0; k1 < bs; k1++) {
1326             sum[j] += PetscAbsScalar(*v);
1327             v++;
1328           }
1329         }
1330         /* update il, jl */
1331         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1332         jmax = a->i[i + 1];
1333         if (jmin < jmax) {
1334           il[i] = jmin;
1335           j     = a->j[jmin];
1336           jl[i] = jl[j];
1337           jl[j] = i;
1338         }
1339         i = nexti;
1340       }
1341       /*-- row sum --*/
1342       jmin = a->i[k];
1343       jmax = a->i[k + 1];
1344       for (i = jmin; i < jmax; i++) {
1345         for (j = 0; j < bs; j++) {
1346           v = a->a + i * bs2 + j;
1347           for (k1 = 0; k1 < bs; k1++) {
1348             sum[j] += PetscAbsScalar(*v);
1349             v += bs;
1350           }
1351         }
1352       }
1353       /* add k_th block row to il, jl */
1354       col = aj + jmin;
1355       if (jmax - jmin > 0 && *col == k) jmin++;
1356       if (jmin < jmax) {
1357         il[k] = jmin;
1358         j     = a->j[jmin];
1359         jl[k] = jl[j];
1360         jl[j] = k;
1361       }
1362       for (j = 0; j < bs; j++) {
1363         if (sum[j] > *norm) *norm = sum[j];
1364       }
1365     }
1366     PetscCall(PetscFree3(sum, il, jl));
1367     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1368   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1369   PetscFunctionReturn(PETSC_SUCCESS);
1370 }
1371 
MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)1372 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1373 {
1374   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1375 
1376   PetscFunctionBegin;
1377   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1378   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1379     *flg = PETSC_FALSE;
1380     PetscFunctionReturn(PETSC_SUCCESS);
1381   }
1382 
1383   /* if the a->i are the same */
1384   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1385   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1386 
1387   /* if a->j are the same */
1388   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1389   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1390 
1391   /* if a->a are the same */
1392   PetscCall(PetscArraycmp(a->a, b->a, a->nz * A->rmap->bs * A->rmap->bs, flg));
1393   PetscFunctionReturn(PETSC_SUCCESS);
1394 }
1395 
MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)1396 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1397 {
1398   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1399   PetscInt         n;
1400   const PetscInt   bs = A->rmap->bs, ambs = a->mbs, bs2 = a->bs2;
1401   PetscScalar     *x;
1402   const MatScalar *aa = a->a, *aa_j;
1403   const PetscInt  *ai = a->i, *adiag;
1404   PetscBool        diagDense;
1405 
1406   PetscFunctionBegin;
1407   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1408   PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, &adiag, &diagDense));
1409   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1410     PetscCall(VecGetArrayWrite(v, &x));
1411     for (PetscInt i = 0; i < ambs; i++) x[i] = 1.0 / aa[adiag[i]];
1412     PetscCall(VecRestoreArrayWrite(v, &x));
1413     PetscFunctionReturn(PETSC_SUCCESS);
1414   }
1415 
1416   PetscCall(VecGetLocalSize(v, &n));
1417   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1418   PetscCall(VecGetArrayWrite(v, &x));
1419 
1420   if (diagDense) {
1421     for (PetscInt i = 0, row = 0; i < ambs; i++) {
1422       aa_j = aa + adiag[i] * bs2;
1423       for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
1424     }
1425   } else {
1426     for (PetscInt i = 0, row = 0; i < ambs; i++) {
1427       const PetscInt j = adiag[i];
1428 
1429       if (j != ai[i + 1]) {
1430         aa_j = aa + j * bs2;
1431         for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
1432       } else {
1433         for (PetscInt k = 0; k < bs; k++) x[row++] = 0.0;
1434       }
1435     }
1436   }
1437   PetscCall(VecRestoreArrayWrite(v, &x));
1438   PetscFunctionReturn(PETSC_SUCCESS);
1439 }
1440 
MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)1441 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1442 {
1443   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1444   PetscScalar        x;
1445   const PetscScalar *l, *li, *ri;
1446   MatScalar         *aa, *v;
1447   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1448   const PetscInt    *ai, *aj;
1449 
1450   PetscFunctionBegin;
1451   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1452   ai  = a->i;
1453   aj  = a->j;
1454   aa  = a->a;
1455   m   = A->rmap->N;
1456   bs  = A->rmap->bs;
1457   mbs = a->mbs;
1458   bs2 = a->bs2;
1459 
1460   PetscCall(VecGetArrayRead(ll, &l));
1461   PetscCall(VecGetLocalSize(ll, &lm));
1462   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1463   for (i = 0; i < mbs; i++) { /* for each block row */
1464     M  = ai[i + 1] - ai[i];
1465     li = l + i * bs;
1466     v  = aa + bs2 * ai[i];
1467     for (j = 0; j < M; j++) { /* for each block */
1468       ri = l + bs * aj[ai[i] + j];
1469       for (k = 0; k < bs; k++) {
1470         x = ri[k];
1471         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1472       }
1473     }
1474   }
1475   PetscCall(VecRestoreArrayRead(ll, &l));
1476   PetscCall(PetscLogFlops(2.0 * a->nz));
1477   PetscFunctionReturn(PETSC_SUCCESS);
1478 }
1479 
MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo * info)1480 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1481 {
1482   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1483 
1484   PetscFunctionBegin;
1485   info->block_size   = a->bs2;
1486   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1487   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1488   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1489   info->assemblies   = A->num_ass;
1490   info->mallocs      = A->info.mallocs;
1491   info->memory       = 0; /* REVIEW ME */
1492   if (A->factortype) {
1493     info->fill_ratio_given  = A->info.fill_ratio_given;
1494     info->fill_ratio_needed = A->info.fill_ratio_needed;
1495     info->factor_mallocs    = A->info.factor_mallocs;
1496   } else {
1497     info->fill_ratio_given  = 0;
1498     info->fill_ratio_needed = 0;
1499     info->factor_mallocs    = 0;
1500   }
1501   PetscFunctionReturn(PETSC_SUCCESS);
1502 }
1503 
MatZeroEntries_SeqSBAIJ(Mat A)1504 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1505 {
1506   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1507 
1508   PetscFunctionBegin;
1509   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1510   PetscFunctionReturn(PETSC_SUCCESS);
1511 }
1512 
MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])1513 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1514 {
1515   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1516   PetscInt         i, j, n, row, col, bs, mbs;
1517   const PetscInt  *ai, *aj;
1518   PetscReal        atmp;
1519   const MatScalar *aa;
1520   PetscScalar     *x;
1521   PetscInt         ncols, brow, bcol, krow, kcol;
1522 
1523   PetscFunctionBegin;
1524   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1525   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1526   bs  = A->rmap->bs;
1527   aa  = a->a;
1528   ai  = a->i;
1529   aj  = a->j;
1530   mbs = a->mbs;
1531 
1532   PetscCall(VecSet(v, 0.0));
1533   PetscCall(VecGetArray(v, &x));
1534   PetscCall(VecGetLocalSize(v, &n));
1535   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1536   for (i = 0; i < mbs; i++) {
1537     ncols = ai[1] - ai[0];
1538     ai++;
1539     brow = bs * i;
1540     for (j = 0; j < ncols; j++) {
1541       bcol = bs * (*aj);
1542       for (kcol = 0; kcol < bs; kcol++) {
1543         col = bcol + kcol; /* col index */
1544         for (krow = 0; krow < bs; krow++) {
1545           atmp = PetscAbsScalar(*aa);
1546           aa++;
1547           row = brow + krow; /* row index */
1548           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1549           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1550         }
1551       }
1552       aj++;
1553     }
1554   }
1555   PetscCall(VecRestoreArray(v, &x));
1556   PetscFunctionReturn(PETSC_SUCCESS);
1557 }
1558 
MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)1559 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1560 {
1561   PetscFunctionBegin;
1562   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1563   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1564   PetscFunctionReturn(PETSC_SUCCESS);
1565 }
1566 
MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1567 static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1568 {
1569   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1570   PetscScalar       *z = c;
1571   const PetscScalar *xb;
1572   PetscScalar        x1;
1573   const MatScalar   *v   = a->a, *vv;
1574   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1575   const int          aconj = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0;
1576 
1577   PetscFunctionBegin;
1578   for (i = 0; i < mbs; i++) {
1579     n = ii[1] - ii[0];
1580     ii++;
1581     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1582     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1583     jj = idx;
1584     vv = v;
1585     for (k = 0; k < cn; k++) {
1586       idx = jj;
1587       v   = vv;
1588       for (j = 0; j < n; j++) {
1589         xb = b + (*idx);
1590         x1 = xb[0 + k * bm];
1591         z[0 + k * cm] += v[0] * x1;
1592         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1593         v += 1;
1594         ++idx;
1595       }
1596     }
1597     z += 1;
1598   }
1599   PetscFunctionReturn(PETSC_SUCCESS);
1600 }
1601 
MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1602 static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1603 {
1604   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1605   PetscScalar       *z = c;
1606   const PetscScalar *xb;
1607   PetscScalar        x1, x2;
1608   const MatScalar   *v   = a->a, *vv;
1609   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1610 
1611   PetscFunctionBegin;
1612   for (i = 0; i < mbs; i++) {
1613     n = ii[1] - ii[0];
1614     ii++;
1615     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1616     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1617     jj = idx;
1618     vv = v;
1619     for (k = 0; k < cn; k++) {
1620       idx = jj;
1621       v   = vv;
1622       for (j = 0; j < n; j++) {
1623         xb = b + 2 * (*idx);
1624         x1 = xb[0 + k * bm];
1625         x2 = xb[1 + k * bm];
1626         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1627         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1628         if (*idx != i) {
1629           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1630           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1631         }
1632         v += 4;
1633         ++idx;
1634       }
1635     }
1636     z += 2;
1637   }
1638   PetscFunctionReturn(PETSC_SUCCESS);
1639 }
1640 
MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1641 static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1642 {
1643   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1644   PetscScalar       *z = c;
1645   const PetscScalar *xb;
1646   PetscScalar        x1, x2, x3;
1647   const MatScalar   *v   = a->a, *vv;
1648   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1649 
1650   PetscFunctionBegin;
1651   for (i = 0; i < mbs; i++) {
1652     n = ii[1] - ii[0];
1653     ii++;
1654     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1655     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1656     jj = idx;
1657     vv = v;
1658     for (k = 0; k < cn; k++) {
1659       idx = jj;
1660       v   = vv;
1661       for (j = 0; j < n; j++) {
1662         xb = b + 3 * (*idx);
1663         x1 = xb[0 + k * bm];
1664         x2 = xb[1 + k * bm];
1665         x3 = xb[2 + k * bm];
1666         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1667         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1668         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1669         if (*idx != i) {
1670           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1671           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1672           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1673         }
1674         v += 9;
1675         ++idx;
1676       }
1677     }
1678     z += 3;
1679   }
1680   PetscFunctionReturn(PETSC_SUCCESS);
1681 }
1682 
MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1683 static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1684 {
1685   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1686   PetscScalar       *z = c;
1687   const PetscScalar *xb;
1688   PetscScalar        x1, x2, x3, x4;
1689   const MatScalar   *v   = a->a, *vv;
1690   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1691 
1692   PetscFunctionBegin;
1693   for (i = 0; i < mbs; i++) {
1694     n = ii[1] - ii[0];
1695     ii++;
1696     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1697     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1698     jj = idx;
1699     vv = v;
1700     for (k = 0; k < cn; k++) {
1701       idx = jj;
1702       v   = vv;
1703       for (j = 0; j < n; j++) {
1704         xb = b + 4 * (*idx);
1705         x1 = xb[0 + k * bm];
1706         x2 = xb[1 + k * bm];
1707         x3 = xb[2 + k * bm];
1708         x4 = xb[3 + k * bm];
1709         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1710         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1711         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1712         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1713         if (*idx != i) {
1714           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1715           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1716           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1717           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1718         }
1719         v += 16;
1720         ++idx;
1721       }
1722     }
1723     z += 4;
1724   }
1725   PetscFunctionReturn(PETSC_SUCCESS);
1726 }
1727 
MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1728 static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1729 {
1730   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1731   PetscScalar       *z = c;
1732   const PetscScalar *xb;
1733   PetscScalar        x1, x2, x3, x4, x5;
1734   const MatScalar   *v   = a->a, *vv;
1735   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1736 
1737   PetscFunctionBegin;
1738   for (i = 0; i < mbs; i++) {
1739     n = ii[1] - ii[0];
1740     ii++;
1741     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1742     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1743     jj = idx;
1744     vv = v;
1745     for (k = 0; k < cn; k++) {
1746       idx = jj;
1747       v   = vv;
1748       for (j = 0; j < n; j++) {
1749         xb = b + 5 * (*idx);
1750         x1 = xb[0 + k * bm];
1751         x2 = xb[1 + k * bm];
1752         x3 = xb[2 + k * bm];
1753         x4 = xb[3 + k * bm];
1754         x5 = xb[4 + k * cm];
1755         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1756         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1757         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1758         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1759         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1760         if (*idx != i) {
1761           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1762           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1763           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1764           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1765           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1766         }
1767         v += 25;
1768         ++idx;
1769       }
1770     }
1771     z += 5;
1772   }
1773   PetscFunctionReturn(PETSC_SUCCESS);
1774 }
1775 
MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)1776 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1777 {
1778   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1779   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1780   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1781   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1782   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1783   PetscBLASInt     bbs, bcn, bbm, bcm;
1784   PetscScalar     *z = NULL;
1785   PetscScalar     *c, *b;
1786   const MatScalar *v;
1787   const PetscInt  *idx, *ii;
1788   PetscScalar      _DOne = 1.0;
1789 
1790   PetscFunctionBegin;
1791   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1792   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1793   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1794   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1795   b = bd->v;
1796   PetscCall(MatZeroEntries(C));
1797   PetscCall(MatDenseGetArray(C, &c));
1798   switch (bs) {
1799   case 1:
1800     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1801     break;
1802   case 2:
1803     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1804     break;
1805   case 3:
1806     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1807     break;
1808   case 4:
1809     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1810     break;
1811   case 5:
1812     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1813     break;
1814   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1815     PetscCall(PetscBLASIntCast(bs, &bbs));
1816     PetscCall(PetscBLASIntCast(cn, &bcn));
1817     PetscCall(PetscBLASIntCast(bm, &bbm));
1818     PetscCall(PetscBLASIntCast(cm, &bcm));
1819     idx = a->j;
1820     v   = a->a;
1821     mbs = a->mbs;
1822     ii  = a->i;
1823     z   = c;
1824     for (i = 0; i < mbs; i++) {
1825       n = ii[1] - ii[0];
1826       ii++;
1827       for (j = 0; j < n; j++) {
1828         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1829         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1830         v += bs2;
1831       }
1832       z += bs;
1833     }
1834   }
1835   PetscCall(MatDenseRestoreArray(C, &c));
1836   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1837   PetscFunctionReturn(PETSC_SUCCESS);
1838 }
1839