xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision ae1ee55146a7ad071171b861759b85940c7e5c67)
1c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
2c2916339SPierre Jolivet #include <../src/mat/impls/dense/seq/dense.h>
3c2916339SPierre Jolivet #include <../src/mat/impls/sbaij/seq/sbaij.h>
4af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
5c6db04a5SJed Brown #include <petscbt.h>
6c6db04a5SJed Brown #include <petscblaslapack.h>
749b5e25fSSatish Balay 
MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)8d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
9d71ae5a4SJacob Faibussowitsch {
105eee224dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
117bede89fSBarry Smith   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
125d0c19d7SBarry Smith   const PetscInt *idx;
13db41cccfSHong Zhang   PetscBT         table_out, table_in;
14d94109b8SHong Zhang 
15d94109b8SHong Zhang   PetscFunctionBegin;
1608401ef6SPierre Jolivet   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
17d94109b8SHong Zhang   mbs = a->mbs;
18d94109b8SHong Zhang   ai  = a->i;
19d94109b8SHong Zhang   aj  = a->j;
20d0f46423SBarry Smith   bs  = A->rmap->bs;
219566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs, &table_out));
229566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(mbs + 1, &nidx));
239566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(mbs, &table_in));
24d94109b8SHong Zhang 
25d94109b8SHong Zhang   for (i = 0; i < is_max; i++) { /* for each is */
26d94109b8SHong Zhang     isz = 0;
279566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(mbs, table_out));
28d94109b8SHong Zhang 
29d94109b8SHong Zhang     /* Extract the indices, assume there can be duplicate entries */
309566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i], &idx));
319566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i], &n));
32d94109b8SHong Zhang 
33db41cccfSHong Zhang     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
34dbe03f88SHong Zhang     bcol_max = 0;
35d94109b8SHong Zhang     for (j = 0; j < n; ++j) {
36d94109b8SHong Zhang       brow = idx[j] / bs; /* convert the indices into block indices */
3708401ef6SPierre Jolivet       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
38db41cccfSHong Zhang       if (!PetscBTLookupSet(table_out, brow)) {
39dbe03f88SHong Zhang         nidx[isz++] = brow;
40dbe03f88SHong Zhang         if (bcol_max < brow) bcol_max = brow;
41dbe03f88SHong Zhang       }
42d94109b8SHong Zhang     }
439566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i], &idx));
449566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&is[i]));
45d94109b8SHong Zhang 
46d94109b8SHong Zhang     k = 0;
47d94109b8SHong Zhang     for (j = 0; j < ov; j++) { /* for each overlap */
48db41cccfSHong Zhang       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
499566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(mbs, table_in));
509566063dSJacob Faibussowitsch       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
51d94109b8SHong Zhang 
52d94109b8SHong Zhang       n = isz; /* length of the updated is[i] */
53d94109b8SHong Zhang       for (brow = 0; brow < mbs; brow++) {
549371c9d4SSatish Balay         start = ai[brow];
559371c9d4SSatish Balay         end   = ai[brow + 1];
56db41cccfSHong Zhang         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
57d94109b8SHong Zhang           for (l = start; l < end; l++) {
58d94109b8SHong Zhang             bcol = aj[l];
59d7b97159SHong Zhang             if (!PetscBTLookupSet(table_out, bcol)) {
60d7b97159SHong Zhang               nidx[isz++] = bcol;
61d7b97159SHong Zhang               if (bcol_max < bcol) bcol_max = bcol;
62d7b97159SHong Zhang             }
63d94109b8SHong Zhang           }
64d94109b8SHong Zhang           k++;
65d94109b8SHong Zhang           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66da81f932SPierre Jolivet         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
67d94109b8SHong Zhang           for (l = start; l < end; l++) {
68d94109b8SHong Zhang             bcol = aj[l];
69dbe03f88SHong Zhang             if (bcol > bcol_max) break;
70db41cccfSHong Zhang             if (PetscBTLookup(table_in, bcol)) {
7126fbe8dcSKarl Rupp               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
72d94109b8SHong Zhang               break; /* for l = start; l<end ; l++) */
73d94109b8SHong Zhang             }
74d94109b8SHong Zhang           }
75d94109b8SHong Zhang         }
76d94109b8SHong Zhang       }
77d94109b8SHong Zhang     } /* for each overlap */
787bede89fSBarry Smith     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
797bede89fSBarry Smith   } /* for each is */
809566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_out));
819566063dSJacob Faibussowitsch   PetscCall(PetscFree(nidx));
829566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table_in));
833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
8449b5e25fSSatish Balay }
8549b5e25fSSatish Balay 
86847daeccSHong Zhang /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
877cf5f706SPierre Jolivet         Zero some ops' to avoid invalid use */
MatSeqSBAIJZeroOps_Private(Mat Bseq)88d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
89d71ae5a4SJacob Faibussowitsch {
9049b5e25fSSatish Balay   PetscFunctionBegin;
919566063dSJacob Faibussowitsch   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
92f4259b30SLisandro Dalcin   Bseq->ops->mult                   = NULL;
93f4259b30SLisandro Dalcin   Bseq->ops->multadd                = NULL;
94f4259b30SLisandro Dalcin   Bseq->ops->multtranspose          = NULL;
95f4259b30SLisandro Dalcin   Bseq->ops->multtransposeadd       = NULL;
96f4259b30SLisandro Dalcin   Bseq->ops->lufactor               = NULL;
97f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactor         = NULL;
98f4259b30SLisandro Dalcin   Bseq->ops->lufactorsymbolic       = NULL;
99f4259b30SLisandro Dalcin   Bseq->ops->choleskyfactorsymbolic = NULL;
100f4259b30SLisandro Dalcin   Bseq->ops->getinertia             = NULL;
1013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
10249b5e25fSSatish Balay }
10349b5e25fSSatish Balay 
1047dae84e0SHong Zhang /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B,PetscBool sym)105fdfbdca6SPierre Jolivet static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
106d71ae5a4SJacob Faibussowitsch {
107fdfbdca6SPierre Jolivet   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
108fdfbdca6SPierre Jolivet   Mat_SeqBAIJ    *d = NULL;
109e50a8114SHong Zhang   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110e50a8114SHong Zhang   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111e50a8114SHong Zhang   const PetscInt *irow, *icol;
112e50a8114SHong Zhang   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113e50a8114SHong Zhang   PetscInt       *aj = a->j, *ai = a->i;
114e50a8114SHong Zhang   MatScalar      *mat_a;
115e50a8114SHong Zhang   Mat             C;
116e50a8114SHong Zhang   PetscBool       flag;
117e50a8114SHong Zhang 
118e50a8114SHong Zhang   PetscFunctionBegin;
1199566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &irow));
1209566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &icol));
1219566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(isrow, &nrows));
1229566063dSJacob Faibussowitsch   PetscCall(ISGetLocalSize(iscol, &ncols));
123e50a8114SHong Zhang 
1249566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(1 + oldcols, &smap));
125e50a8114SHong Zhang   ssmap = smap;
1269566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1 + nrows, &lens));
127e50a8114SHong Zhang   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
128e50a8114SHong Zhang   /* determine lens of each row */
129e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
130e50a8114SHong Zhang     kstart  = ai[irow[i]];
131e50a8114SHong Zhang     kend    = kstart + a->ilen[irow[i]];
132e50a8114SHong Zhang     lens[i] = 0;
133e50a8114SHong Zhang     for (k = kstart; k < kend; k++) {
134e50a8114SHong Zhang       if (ssmap[aj[k]]) lens[i]++;
135e50a8114SHong Zhang     }
136e50a8114SHong Zhang   }
137e50a8114SHong Zhang   /* Create and fill new matrix */
138e50a8114SHong Zhang   if (scall == MAT_REUSE_MATRIX) {
139fdfbdca6SPierre Jolivet     if (sym) {
140e50a8114SHong Zhang       c = (Mat_SeqSBAIJ *)((*B)->data);
141e50a8114SHong Zhang 
142aed4548fSBarry Smith       PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
1439566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144fdfbdca6SPierre Jolivet       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
1459566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(c->ilen, c->mbs));
146fdfbdca6SPierre Jolivet     } else {
147fdfbdca6SPierre Jolivet       d = (Mat_SeqBAIJ *)((*B)->data);
148fdfbdca6SPierre Jolivet 
149fdfbdca6SPierre Jolivet       PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
150fdfbdca6SPierre Jolivet       PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
151fdfbdca6SPierre Jolivet       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
152fdfbdca6SPierre Jolivet       PetscCall(PetscArrayzero(d->ilen, d->mbs));
153fdfbdca6SPierre Jolivet     }
154e50a8114SHong Zhang     C = *B;
155e50a8114SHong Zhang   } else {
1569566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1579566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
158fdfbdca6SPierre Jolivet     if (sym) {
1599566063dSJacob Faibussowitsch       PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1609566063dSJacob Faibussowitsch       PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
161fdfbdca6SPierre Jolivet     } else {
162fdfbdca6SPierre Jolivet       PetscCall(MatSetType(C, MATSEQBAIJ));
163fdfbdca6SPierre Jolivet       PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
164e50a8114SHong Zhang     }
165fdfbdca6SPierre Jolivet   }
166f4f49eeaSPierre Jolivet   if (sym) c = (Mat_SeqSBAIJ *)C->data;
167f4f49eeaSPierre Jolivet   else d = (Mat_SeqBAIJ *)C->data;
168e50a8114SHong Zhang   for (i = 0; i < nrows; i++) {
169e50a8114SHong Zhang     row    = irow[i];
170e50a8114SHong Zhang     kstart = ai[row];
171e50a8114SHong Zhang     kend   = kstart + a->ilen[row];
172fdfbdca6SPierre Jolivet     if (sym) {
173e50a8114SHong Zhang       mat_i    = c->i[i];
1748e3a54c0SPierre Jolivet       mat_j    = PetscSafePointerPlusOffset(c->j, mat_i);
1758e3a54c0SPierre Jolivet       mat_a    = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
176e50a8114SHong Zhang       mat_ilen = c->ilen + i;
177fdfbdca6SPierre Jolivet     } else {
178fdfbdca6SPierre Jolivet       mat_i    = d->i[i];
1798e3a54c0SPierre Jolivet       mat_j    = PetscSafePointerPlusOffset(d->j, mat_i);
1808e3a54c0SPierre Jolivet       mat_a    = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
181fdfbdca6SPierre Jolivet       mat_ilen = d->ilen + i;
182fdfbdca6SPierre Jolivet     }
183e50a8114SHong Zhang     for (k = kstart; k < kend; k++) {
184e50a8114SHong Zhang       if ((tcol = ssmap[a->j[k]])) {
185e50a8114SHong Zhang         *mat_j++ = tcol - 1;
1869566063dSJacob Faibussowitsch         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
187e50a8114SHong Zhang         mat_a += bs2;
188e50a8114SHong Zhang         (*mat_ilen)++;
189e50a8114SHong Zhang       }
190e50a8114SHong Zhang     }
191e50a8114SHong Zhang   }
192e50a8114SHong Zhang   /* sort */
193e50a8114SHong Zhang   {
194e50a8114SHong Zhang     MatScalar *work;
195e50a8114SHong Zhang 
1969566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(bs2, &work));
197e50a8114SHong Zhang     for (i = 0; i < nrows; i++) {
198e50a8114SHong Zhang       PetscInt ilen;
199fdfbdca6SPierre Jolivet       if (sym) {
200e50a8114SHong Zhang         mat_i = c->i[i];
2018e3a54c0SPierre Jolivet         mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
2028e3a54c0SPierre Jolivet         mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
203e50a8114SHong Zhang         ilen  = c->ilen[i];
204fdfbdca6SPierre Jolivet       } else {
205fdfbdca6SPierre Jolivet         mat_i = d->i[i];
2068e3a54c0SPierre Jolivet         mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
2078e3a54c0SPierre Jolivet         mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
208fdfbdca6SPierre Jolivet         ilen  = d->ilen[i];
209fdfbdca6SPierre Jolivet       }
2109566063dSJacob Faibussowitsch       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
211e50a8114SHong Zhang     }
2129566063dSJacob Faibussowitsch     PetscCall(PetscFree(work));
213e50a8114SHong Zhang   }
214e50a8114SHong Zhang 
215e50a8114SHong Zhang   /* Free work space */
2169566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &icol));
2179566063dSJacob Faibussowitsch   PetscCall(PetscFree(smap));
2189566063dSJacob Faibussowitsch   PetscCall(PetscFree(lens));
2199566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2209566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
221e50a8114SHong Zhang 
2229566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &irow));
223e50a8114SHong Zhang   *B = C;
2243ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
225e50a8114SHong Zhang }
226e50a8114SHong Zhang 
MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)227d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
228d71ae5a4SJacob Faibussowitsch {
229fdfbdca6SPierre Jolivet   Mat       C[2];
230fdfbdca6SPierre Jolivet   IS        is1, is2, intersect = NULL;
231fdfbdca6SPierre Jolivet   PetscInt  n1, n2, ni;
232fdfbdca6SPierre Jolivet   PetscBool sym = PETSC_TRUE;
23349b5e25fSSatish Balay 
23449b5e25fSSatish Balay   PetscFunctionBegin;
235f9a48b90SPierre Jolivet   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
236f9a48b90SPierre Jolivet   if (isrow == iscol) {
237f9a48b90SPierre Jolivet     is2 = is1;
238f9a48b90SPierre Jolivet     PetscCall(PetscObjectReference((PetscObject)is2));
239fdfbdca6SPierre Jolivet   } else {
240fdfbdca6SPierre Jolivet     PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
241fdfbdca6SPierre Jolivet     PetscCall(ISIntersect(is1, is2, &intersect));
242fdfbdca6SPierre Jolivet     PetscCall(ISGetLocalSize(intersect, &ni));
243fdfbdca6SPierre Jolivet     PetscCall(ISDestroy(&intersect));
244fdfbdca6SPierre Jolivet     if (ni == 0) sym = PETSC_FALSE;
245fdfbdca6SPierre Jolivet     else if (PetscDefined(USE_DEBUG)) {
246fdfbdca6SPierre Jolivet       PetscCall(ISGetLocalSize(is1, &n1));
247fdfbdca6SPierre Jolivet       PetscCall(ISGetLocalSize(is2, &n2));
248fdfbdca6SPierre Jolivet       PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
249fdfbdca6SPierre Jolivet     }
250fdfbdca6SPierre Jolivet   }
251fdfbdca6SPierre Jolivet   if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
252fdfbdca6SPierre Jolivet   else {
253fdfbdca6SPierre Jolivet     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
254fdfbdca6SPierre Jolivet     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
255fdfbdca6SPierre Jolivet     PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
256fdfbdca6SPierre Jolivet     PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
257fdfbdca6SPierre Jolivet     PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
258fdfbdca6SPierre Jolivet     if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
259fdfbdca6SPierre Jolivet     else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
260fdfbdca6SPierre Jolivet     else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
261fdfbdca6SPierre Jolivet     PetscCall(MatDestroy(C));
262fdfbdca6SPierre Jolivet     PetscCall(MatDestroy(C + 1));
263fdfbdca6SPierre Jolivet   }
2649566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is1));
2659566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&is2));
266847daeccSHong Zhang 
267fdfbdca6SPierre Jolivet   if (sym && isrow != iscol) {
2688f46ffcaSHong Zhang     PetscBool isequal;
2699566063dSJacob Faibussowitsch     PetscCall(ISEqual(isrow, iscol, &isequal));
27048a46eb9SPierre Jolivet     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
27149b5e25fSSatish Balay   }
2723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
27349b5e25fSSatish Balay }
27449b5e25fSSatish Balay 
MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat * B[])275d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
276d71ae5a4SJacob Faibussowitsch {
27713f74950SBarry Smith   PetscInt i;
27849b5e25fSSatish Balay 
27949b5e25fSSatish Balay   PetscFunctionBegin;
280314f503fSPierre Jolivet   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
281e50a8114SHong Zhang 
28248a46eb9SPierre Jolivet   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
2833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
28449b5e25fSSatish Balay }
28549b5e25fSSatish Balay 
28649b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)287d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
288d71ae5a4SJacob Faibussowitsch {
28949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
290d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, zero = 0.0;
291d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
292d9ca1df4SBarry Smith   const MatScalar   *v;
293d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
294d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
29598c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
29649b5e25fSSatish Balay 
29749b5e25fSSatish Balay   PetscFunctionBegin;
2989566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
2993ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
3009566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3019566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
30249b5e25fSSatish Balay 
30349b5e25fSSatish Balay   v  = a->a;
30449b5e25fSSatish Balay   xb = x;
30549b5e25fSSatish Balay 
30649b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
30749b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
3089371c9d4SSatish Balay     x1   = xb[0];
3099371c9d4SSatish Balay     x2   = xb[1];
31049b5e25fSSatish Balay     ib   = aj + *ai;
311831a3094SHong Zhang     jmin = 0;
31298c9bda7SSatish Balay     nonzerorow += (n > 0);
3137fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
31449b5e25fSSatish Balay       z[2 * i] += v[0] * x1 + v[2] * x2;
31549b5e25fSSatish Balay       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
3169371c9d4SSatish Balay       v += 4;
3179371c9d4SSatish Balay       jmin++;
3187fbae186SHong Zhang     }
319444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
321831a3094SHong Zhang     for (j = jmin; j < n; j++) {
32249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
32349b5e25fSSatish Balay       cval = ib[j] * 2;
32449b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2;
32549b5e25fSSatish Balay       z[cval + 1] += v[2] * x1 + v[3] * x2;
32649b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
32749b5e25fSSatish Balay       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
32849b5e25fSSatish Balay       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
32949b5e25fSSatish Balay       v += 4;
33049b5e25fSSatish Balay     }
3319371c9d4SSatish Balay     xb += 2;
3329371c9d4SSatish Balay     ai++;
33349b5e25fSSatish Balay   }
33449b5e25fSSatish Balay 
3359566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3369566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
3379566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
3383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
33949b5e25fSSatish Balay }
34049b5e25fSSatish Balay 
MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)341d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
342d71ae5a4SJacob Faibussowitsch {
34349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
344d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, zero = 0.0;
345d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
346d9ca1df4SBarry Smith   const MatScalar   *v;
347d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
348d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
34998c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
35049b5e25fSSatish Balay 
35149b5e25fSSatish Balay   PetscFunctionBegin;
3529566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
3533ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
3549566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3559566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
35649b5e25fSSatish Balay 
35749b5e25fSSatish Balay   v  = a->a;
35849b5e25fSSatish Balay   xb = x;
35949b5e25fSSatish Balay 
36049b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
36149b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
3629371c9d4SSatish Balay     x1   = xb[0];
3639371c9d4SSatish Balay     x2   = xb[1];
3649371c9d4SSatish Balay     x3   = xb[2];
36549b5e25fSSatish Balay     ib   = aj + *ai;
366831a3094SHong Zhang     jmin = 0;
36798c9bda7SSatish Balay     nonzerorow += (n > 0);
3687fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
36949b5e25fSSatish Balay       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
37049b5e25fSSatish Balay       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
37149b5e25fSSatish Balay       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3729371c9d4SSatish Balay       v += 9;
3739371c9d4SSatish Balay       jmin++;
3747fbae186SHong Zhang     }
375444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
376444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
377831a3094SHong Zhang     for (j = jmin; j < n; j++) {
37849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
37949b5e25fSSatish Balay       cval = ib[j] * 3;
38049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
38149b5e25fSSatish Balay       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
38249b5e25fSSatish Balay       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
38349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
38449b5e25fSSatish Balay       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
38549b5e25fSSatish Balay       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
38649b5e25fSSatish Balay       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
38749b5e25fSSatish Balay       v += 9;
38849b5e25fSSatish Balay     }
3899371c9d4SSatish Balay     xb += 3;
3909371c9d4SSatish Balay     ai++;
39149b5e25fSSatish Balay   }
39249b5e25fSSatish Balay 
3939566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
3949566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
3959566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
3963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
39749b5e25fSSatish Balay }
39849b5e25fSSatish Balay 
MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)399d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
400d71ae5a4SJacob Faibussowitsch {
40149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
402d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
403d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
404d9ca1df4SBarry Smith   const MatScalar   *v;
405d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
406d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
40798c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
40849b5e25fSSatish Balay 
40949b5e25fSSatish Balay   PetscFunctionBegin;
4109566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
4113ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
4129566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4139566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
41449b5e25fSSatish Balay 
41549b5e25fSSatish Balay   v  = a->a;
41649b5e25fSSatish Balay   xb = x;
41749b5e25fSSatish Balay 
41849b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
41949b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
4209371c9d4SSatish Balay     x1   = xb[0];
4219371c9d4SSatish Balay     x2   = xb[1];
4229371c9d4SSatish Balay     x3   = xb[2];
4239371c9d4SSatish Balay     x4   = xb[3];
42449b5e25fSSatish Balay     ib   = aj + *ai;
425831a3094SHong Zhang     jmin = 0;
42698c9bda7SSatish Balay     nonzerorow += (n > 0);
4277fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
42849b5e25fSSatish Balay       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
42949b5e25fSSatish Balay       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
43049b5e25fSSatish Balay       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
43149b5e25fSSatish Balay       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
4329371c9d4SSatish Balay       v += 16;
4339371c9d4SSatish Balay       jmin++;
4347fbae186SHong Zhang     }
435444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
436444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
437831a3094SHong Zhang     for (j = jmin; j < n; j++) {
43849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
43949b5e25fSSatish Balay       cval = ib[j] * 4;
44049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
44149b5e25fSSatish Balay       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
44249b5e25fSSatish Balay       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
44349b5e25fSSatish Balay       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
44449b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
44549b5e25fSSatish Balay       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
44649b5e25fSSatish Balay       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
44749b5e25fSSatish Balay       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
44849b5e25fSSatish Balay       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
44949b5e25fSSatish Balay       v += 16;
45049b5e25fSSatish Balay     }
4519371c9d4SSatish Balay     xb += 4;
4529371c9d4SSatish Balay     ai++;
45349b5e25fSSatish Balay   }
45449b5e25fSSatish Balay 
4559566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4569566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
4579566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
4583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
45949b5e25fSSatish Balay }
46049b5e25fSSatish Balay 
MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)461d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
462d71ae5a4SJacob Faibussowitsch {
46349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
464d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
465d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
466d9ca1df4SBarry Smith   const MatScalar   *v;
467d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
468d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
46998c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
47049b5e25fSSatish Balay 
47149b5e25fSSatish Balay   PetscFunctionBegin;
4729566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
4733ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
4749566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4759566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
47649b5e25fSSatish Balay 
47749b5e25fSSatish Balay   v  = a->a;
47849b5e25fSSatish Balay   xb = x;
47949b5e25fSSatish Balay 
48049b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
48149b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
4829371c9d4SSatish Balay     x1   = xb[0];
4839371c9d4SSatish Balay     x2   = xb[1];
4849371c9d4SSatish Balay     x3   = xb[2];
4859371c9d4SSatish Balay     x4   = xb[3];
4869371c9d4SSatish Balay     x5   = xb[4];
48749b5e25fSSatish Balay     ib   = aj + *ai;
488831a3094SHong Zhang     jmin = 0;
48998c9bda7SSatish Balay     nonzerorow += (n > 0);
4907fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
49149b5e25fSSatish Balay       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
49249b5e25fSSatish Balay       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
49349b5e25fSSatish Balay       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
49449b5e25fSSatish Balay       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
49549b5e25fSSatish Balay       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
4969371c9d4SSatish Balay       v += 25;
4979371c9d4SSatish Balay       jmin++;
4987fbae186SHong Zhang     }
499444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
500444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
501831a3094SHong Zhang     for (j = jmin; j < n; j++) {
50249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
50349b5e25fSSatish Balay       cval = ib[j] * 5;
50449b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
50549b5e25fSSatish Balay       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
50649b5e25fSSatish Balay       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
50749b5e25fSSatish Balay       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
50849b5e25fSSatish Balay       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
50949b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
51049b5e25fSSatish Balay       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
51149b5e25fSSatish Balay       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
51249b5e25fSSatish Balay       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
51349b5e25fSSatish Balay       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
51449b5e25fSSatish Balay       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
51549b5e25fSSatish Balay       v += 25;
51649b5e25fSSatish Balay     }
5179371c9d4SSatish Balay     xb += 5;
5189371c9d4SSatish Balay     ai++;
51949b5e25fSSatish Balay   }
52049b5e25fSSatish Balay 
5219566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5229566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
5239566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
5243ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
52549b5e25fSSatish Balay }
52649b5e25fSSatish Balay 
MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)527d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
528d71ae5a4SJacob Faibussowitsch {
52949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
530d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
531d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
532d9ca1df4SBarry Smith   const MatScalar   *v;
533d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
534d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
53598c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
53649b5e25fSSatish Balay 
53749b5e25fSSatish Balay   PetscFunctionBegin;
5389566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
5393ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
5409566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5419566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
54249b5e25fSSatish Balay 
54349b5e25fSSatish Balay   v  = a->a;
54449b5e25fSSatish Balay   xb = x;
54549b5e25fSSatish Balay 
54649b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
54749b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
5489371c9d4SSatish Balay     x1   = xb[0];
5499371c9d4SSatish Balay     x2   = xb[1];
5509371c9d4SSatish Balay     x3   = xb[2];
5519371c9d4SSatish Balay     x4   = xb[3];
5529371c9d4SSatish Balay     x5   = xb[4];
5539371c9d4SSatish Balay     x6   = xb[5];
55449b5e25fSSatish Balay     ib   = aj + *ai;
555831a3094SHong Zhang     jmin = 0;
55698c9bda7SSatish Balay     nonzerorow += (n > 0);
5577fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
55849b5e25fSSatish Balay       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
55949b5e25fSSatish Balay       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
56049b5e25fSSatish Balay       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
56149b5e25fSSatish Balay       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
56249b5e25fSSatish Balay       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
56349b5e25fSSatish Balay       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
5649371c9d4SSatish Balay       v += 36;
5659371c9d4SSatish Balay       jmin++;
5667fbae186SHong Zhang     }
567444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
568444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
569831a3094SHong Zhang     for (j = jmin; j < n; j++) {
57049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
57149b5e25fSSatish Balay       cval = ib[j] * 6;
57249b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
57349b5e25fSSatish Balay       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
57449b5e25fSSatish Balay       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
57549b5e25fSSatish Balay       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
57649b5e25fSSatish Balay       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
57749b5e25fSSatish Balay       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
57849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
57949b5e25fSSatish Balay       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
58049b5e25fSSatish Balay       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
58149b5e25fSSatish Balay       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
58249b5e25fSSatish Balay       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
58349b5e25fSSatish Balay       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
58449b5e25fSSatish Balay       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
58549b5e25fSSatish Balay       v += 36;
58649b5e25fSSatish Balay     }
5879371c9d4SSatish Balay     xb += 6;
5889371c9d4SSatish Balay     ai++;
58949b5e25fSSatish Balay   }
59049b5e25fSSatish Balay 
5919566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5929566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
5939566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
5943ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
59549b5e25fSSatish Balay }
596c2916339SPierre Jolivet 
MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)597d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
598d71ae5a4SJacob Faibussowitsch {
59949b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
600d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
601d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
602d9ca1df4SBarry Smith   const MatScalar   *v;
603d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
604d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
60598c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
60649b5e25fSSatish Balay 
60749b5e25fSSatish Balay   PetscFunctionBegin;
6089566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
6093ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
6109566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6119566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
61249b5e25fSSatish Balay 
61349b5e25fSSatish Balay   v  = a->a;
61449b5e25fSSatish Balay   xb = x;
61549b5e25fSSatish Balay 
61649b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
61749b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
6189371c9d4SSatish Balay     x1   = xb[0];
6199371c9d4SSatish Balay     x2   = xb[1];
6209371c9d4SSatish Balay     x3   = xb[2];
6219371c9d4SSatish Balay     x4   = xb[3];
6229371c9d4SSatish Balay     x5   = xb[4];
6239371c9d4SSatish Balay     x6   = xb[5];
6249371c9d4SSatish Balay     x7   = xb[6];
62549b5e25fSSatish Balay     ib   = aj + *ai;
626831a3094SHong Zhang     jmin = 0;
62798c9bda7SSatish Balay     nonzerorow += (n > 0);
6287fbae186SHong Zhang     if (*ib == i) { /* (diag of A)*x */
62949b5e25fSSatish Balay       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
63049b5e25fSSatish Balay       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
63149b5e25fSSatish Balay       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
63249b5e25fSSatish Balay       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
63349b5e25fSSatish Balay       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
63449b5e25fSSatish Balay       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
63549b5e25fSSatish Balay       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
6369371c9d4SSatish Balay       v += 49;
6379371c9d4SSatish Balay       jmin++;
6387fbae186SHong Zhang     }
639444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
640444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
641831a3094SHong Zhang     for (j = jmin; j < n; j++) {
64249b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
64349b5e25fSSatish Balay       cval = ib[j] * 7;
64449b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
64549b5e25fSSatish Balay       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
64649b5e25fSSatish Balay       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
64749b5e25fSSatish Balay       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
64849b5e25fSSatish Balay       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
64949b5e25fSSatish Balay       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
65049b5e25fSSatish Balay       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
65149b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
65249b5e25fSSatish Balay       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
65349b5e25fSSatish Balay       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
65449b5e25fSSatish Balay       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
65549b5e25fSSatish Balay       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
65649b5e25fSSatish Balay       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
65749b5e25fSSatish Balay       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
65849b5e25fSSatish Balay       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
65949b5e25fSSatish Balay       v += 49;
66049b5e25fSSatish Balay     }
6619371c9d4SSatish Balay     xb += 7;
6629371c9d4SSatish Balay     ai++;
66349b5e25fSSatish Balay   }
6649566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6659566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
6669566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
6673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
66849b5e25fSSatish Balay }
66949b5e25fSSatish Balay 
67049b5e25fSSatish Balay /*
67149b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
67249b5e25fSSatish Balay */
MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)673d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
674d71ae5a4SJacob Faibussowitsch {
67549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
676d9ca1df4SBarry Smith   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
677d9ca1df4SBarry Smith   const PetscScalar *x, *x_ptr, *xb;
678d9ca1df4SBarry Smith   const MatScalar   *v;
679d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
680d9ca1df4SBarry Smith   const PetscInt    *idx, *aj, *ii;
68198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
68249b5e25fSSatish Balay 
68349b5e25fSSatish Balay   PetscFunctionBegin;
6849566063dSJacob Faibussowitsch   PetscCall(VecSet(zz, zero));
6853ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
6869566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
6879566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
68859689ffbSStefano Zampini 
68959689ffbSStefano Zampini   x_ptr = x;
69059689ffbSStefano Zampini   z_ptr = z;
69149b5e25fSSatish Balay 
69249b5e25fSSatish Balay   aj = a->j;
69349b5e25fSSatish Balay   v  = a->a;
69449b5e25fSSatish Balay   ii = a->i;
69549b5e25fSSatish Balay 
69648a46eb9SPierre Jolivet   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
69749b5e25fSSatish Balay   work = a->mult_work;
69849b5e25fSSatish Balay 
69949b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
7009371c9d4SSatish Balay     n     = ii[1] - ii[0];
7019371c9d4SSatish Balay     ncols = n * bs;
7029371c9d4SSatish Balay     workt = work;
7039371c9d4SSatish Balay     idx   = aj + ii[0];
70498c9bda7SSatish Balay     nonzerorow += (n > 0);
70549b5e25fSSatish Balay 
70649b5e25fSSatish Balay     /* upper triangular part */
70749b5e25fSSatish Balay     for (j = 0; j < n; j++) {
70849b5e25fSSatish Balay       xb = x_ptr + bs * (*idx++);
70949b5e25fSSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
71049b5e25fSSatish Balay       workt += bs;
71149b5e25fSSatish Balay     }
71249b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
71396b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
71449b5e25fSSatish Balay 
71549b5e25fSSatish Balay     /* strict lower triangular part */
716831a3094SHong Zhang     idx = aj + ii[0];
71759689ffbSStefano Zampini     if (n && *idx == i) {
7189371c9d4SSatish Balay       ncols -= bs;
7199371c9d4SSatish Balay       v += bs2;
7209371c9d4SSatish Balay       idx++;
7219371c9d4SSatish Balay       n--;
722831a3094SHong Zhang     }
72396b9376eSHong Zhang 
72449b5e25fSSatish Balay     if (ncols > 0) {
72549b5e25fSSatish Balay       workt = work;
7269566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt, ncols));
72796b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
728831a3094SHong Zhang       for (j = 0; j < n; j++) {
729831a3094SHong Zhang         zb = z_ptr + bs * (*idx++);
73049b5e25fSSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
73149b5e25fSSatish Balay         workt += bs;
73249b5e25fSSatish Balay       }
73349b5e25fSSatish Balay     }
7349371c9d4SSatish Balay     x += bs;
7359371c9d4SSatish Balay     v += n * bs2;
7369371c9d4SSatish Balay     z += bs;
7379371c9d4SSatish Balay     ii++;
73849b5e25fSSatish Balay   }
73949b5e25fSSatish Balay 
7409566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7419566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
7429566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
7433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
74449b5e25fSSatish Balay }
74549b5e25fSSatish Balay 
MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)746d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
747d71ae5a4SJacob Faibussowitsch {
74849b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
749d9ca1df4SBarry Smith   PetscScalar       *z, x1;
750d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
751d9ca1df4SBarry Smith   const MatScalar   *v;
752d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
753d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
75498c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
755*fc2fb351SPierre Jolivet   const int          aconj      = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0;
75649b5e25fSSatish Balay 
75749b5e25fSSatish Balay   PetscFunctionBegin;
7589566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
7593ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
7609566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
7619566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
76249b5e25fSSatish Balay   v  = a->a;
76349b5e25fSSatish Balay   xb = x;
76449b5e25fSSatish Balay 
76549b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
76649b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th row of A */
76749b5e25fSSatish Balay     x1   = xb[0];
76849b5e25fSSatish Balay     ib   = aj + *ai;
769831a3094SHong Zhang     jmin = 0;
77098c9bda7SSatish Balay     nonzerorow += (n > 0);
7713d9ade75SStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
7729371c9d4SSatish Balay       z[i] += *v++ * x[*ib++];
7739371c9d4SSatish Balay       jmin++;
774831a3094SHong Zhang     }
775eb1ec7c1SStefano Zampini     if (aconj) {
776eb1ec7c1SStefano Zampini       for (j = jmin; j < n; j++) {
777eb1ec7c1SStefano Zampini         cval = *ib;
778eb1ec7c1SStefano Zampini         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
779eb1ec7c1SStefano Zampini         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
780eb1ec7c1SStefano Zampini       }
781eb1ec7c1SStefano Zampini     } else {
782831a3094SHong Zhang       for (j = jmin; j < n; j++) {
78349b5e25fSSatish Balay         cval = *ib;
78449b5e25fSSatish Balay         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
78549b5e25fSSatish Balay         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
78649b5e25fSSatish Balay       }
787eb1ec7c1SStefano Zampini     }
7889371c9d4SSatish Balay     xb++;
7899371c9d4SSatish Balay     ai++;
79049b5e25fSSatish Balay   }
79149b5e25fSSatish Balay 
7929566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7939566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
79449b5e25fSSatish Balay 
7959566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
7963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
79749b5e25fSSatish Balay }
79849b5e25fSSatish Balay 
MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)799d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
800d71ae5a4SJacob Faibussowitsch {
80149b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
802d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2;
803d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
804d9ca1df4SBarry Smith   const MatScalar   *v;
805d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
806d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
80798c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
80849b5e25fSSatish Balay 
80949b5e25fSSatish Balay   PetscFunctionBegin;
8109566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
8113ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
8129566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8139566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
81449b5e25fSSatish Balay 
81549b5e25fSSatish Balay   v  = a->a;
81649b5e25fSSatish Balay   xb = x;
81749b5e25fSSatish Balay 
81849b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
81949b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
8209371c9d4SSatish Balay     x1   = xb[0];
8219371c9d4SSatish Balay     x2   = xb[1];
82249b5e25fSSatish Balay     ib   = aj + *ai;
823831a3094SHong Zhang     jmin = 0;
82498c9bda7SSatish Balay     nonzerorow += (n > 0);
82559689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
82649b5e25fSSatish Balay       z[2 * i] += v[0] * x1 + v[2] * x2;
82749b5e25fSSatish Balay       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
8289371c9d4SSatish Balay       v += 4;
8299371c9d4SSatish Balay       jmin++;
8307fbae186SHong Zhang     }
831444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
832444d8c10SJed Brown     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
833831a3094SHong Zhang     for (j = jmin; j < n; j++) {
83449b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
83549b5e25fSSatish Balay       cval = ib[j] * 2;
83649b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2;
83749b5e25fSSatish Balay       z[cval + 1] += v[2] * x1 + v[3] * x2;
83849b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
83949b5e25fSSatish Balay       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
84049b5e25fSSatish Balay       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
84149b5e25fSSatish Balay       v += 4;
84249b5e25fSSatish Balay     }
8439371c9d4SSatish Balay     xb += 2;
8449371c9d4SSatish Balay     ai++;
84549b5e25fSSatish Balay   }
8469566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
8479566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
84849b5e25fSSatish Balay 
8499566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
8503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
85149b5e25fSSatish Balay }
85249b5e25fSSatish Balay 
MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)853d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
854d71ae5a4SJacob Faibussowitsch {
85549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
856d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3;
857d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
858d9ca1df4SBarry Smith   const MatScalar   *v;
859d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
860d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
86198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
86249b5e25fSSatish Balay 
86349b5e25fSSatish Balay   PetscFunctionBegin;
8649566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
8653ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
8669566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
8679566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
86849b5e25fSSatish Balay 
86949b5e25fSSatish Balay   v  = a->a;
87049b5e25fSSatish Balay   xb = x;
87149b5e25fSSatish Balay 
87249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
87349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
8749371c9d4SSatish Balay     x1   = xb[0];
8759371c9d4SSatish Balay     x2   = xb[1];
8769371c9d4SSatish Balay     x3   = xb[2];
87749b5e25fSSatish Balay     ib   = aj + *ai;
878831a3094SHong Zhang     jmin = 0;
87998c9bda7SSatish Balay     nonzerorow += (n > 0);
88059689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
88149b5e25fSSatish Balay       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
88249b5e25fSSatish Balay       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
88349b5e25fSSatish Balay       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
8849371c9d4SSatish Balay       v += 9;
8859371c9d4SSatish Balay       jmin++;
8867fbae186SHong Zhang     }
887444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
888444d8c10SJed Brown     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
889831a3094SHong Zhang     for (j = jmin; j < n; j++) {
89049b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
89149b5e25fSSatish Balay       cval = ib[j] * 3;
89249b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
89349b5e25fSSatish Balay       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
89449b5e25fSSatish Balay       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
89549b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
89649b5e25fSSatish Balay       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
89749b5e25fSSatish Balay       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
89849b5e25fSSatish Balay       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
89949b5e25fSSatish Balay       v += 9;
90049b5e25fSSatish Balay     }
9019371c9d4SSatish Balay     xb += 3;
9029371c9d4SSatish Balay     ai++;
90349b5e25fSSatish Balay   }
90449b5e25fSSatish Balay 
9059566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9069566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
90749b5e25fSSatish Balay 
9089566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
9093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
91049b5e25fSSatish Balay }
91149b5e25fSSatish Balay 
MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)912d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
913d71ae5a4SJacob Faibussowitsch {
91449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
915d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4;
916d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
917d9ca1df4SBarry Smith   const MatScalar   *v;
918d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
919d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
92098c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
92149b5e25fSSatish Balay 
92249b5e25fSSatish Balay   PetscFunctionBegin;
9239566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
9243ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
9259566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9269566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
92749b5e25fSSatish Balay 
92849b5e25fSSatish Balay   v  = a->a;
92949b5e25fSSatish Balay   xb = x;
93049b5e25fSSatish Balay 
93149b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
93249b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
9339371c9d4SSatish Balay     x1   = xb[0];
9349371c9d4SSatish Balay     x2   = xb[1];
9359371c9d4SSatish Balay     x3   = xb[2];
9369371c9d4SSatish Balay     x4   = xb[3];
93749b5e25fSSatish Balay     ib   = aj + *ai;
938831a3094SHong Zhang     jmin = 0;
93998c9bda7SSatish Balay     nonzerorow += (n > 0);
94059689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
94149b5e25fSSatish Balay       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
94249b5e25fSSatish Balay       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
94349b5e25fSSatish Balay       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
94449b5e25fSSatish Balay       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
9459371c9d4SSatish Balay       v += 16;
9469371c9d4SSatish Balay       jmin++;
9477fbae186SHong Zhang     }
948444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
949444d8c10SJed Brown     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
950831a3094SHong Zhang     for (j = jmin; j < n; j++) {
95149b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
95249b5e25fSSatish Balay       cval = ib[j] * 4;
95349b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
95449b5e25fSSatish Balay       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
95549b5e25fSSatish Balay       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
95649b5e25fSSatish Balay       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
95749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
95849b5e25fSSatish Balay       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
95949b5e25fSSatish Balay       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
96049b5e25fSSatish Balay       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
96149b5e25fSSatish Balay       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
96249b5e25fSSatish Balay       v += 16;
96349b5e25fSSatish Balay     }
9649371c9d4SSatish Balay     xb += 4;
9659371c9d4SSatish Balay     ai++;
96649b5e25fSSatish Balay   }
96749b5e25fSSatish Balay 
9689566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
9699566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
97049b5e25fSSatish Balay 
9719566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
9723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
97349b5e25fSSatish Balay }
97449b5e25fSSatish Balay 
MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)975d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
976d71ae5a4SJacob Faibussowitsch {
97749b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
978d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5;
979d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
980d9ca1df4SBarry Smith   const MatScalar   *v;
981d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
982d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
98398c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
98449b5e25fSSatish Balay 
98549b5e25fSSatish Balay   PetscFunctionBegin;
9869566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
9873ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
9889566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
9899566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
99049b5e25fSSatish Balay 
99149b5e25fSSatish Balay   v  = a->a;
99249b5e25fSSatish Balay   xb = x;
99349b5e25fSSatish Balay 
99449b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
99549b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
9969371c9d4SSatish Balay     x1   = xb[0];
9979371c9d4SSatish Balay     x2   = xb[1];
9989371c9d4SSatish Balay     x3   = xb[2];
9999371c9d4SSatish Balay     x4   = xb[3];
10009371c9d4SSatish Balay     x5   = xb[4];
100149b5e25fSSatish Balay     ib   = aj + *ai;
1002831a3094SHong Zhang     jmin = 0;
100398c9bda7SSatish Balay     nonzerorow += (n > 0);
100459689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
100549b5e25fSSatish Balay       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
100649b5e25fSSatish Balay       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
100749b5e25fSSatish Balay       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
100849b5e25fSSatish Balay       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
100949b5e25fSSatish Balay       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
10109371c9d4SSatish Balay       v += 25;
10119371c9d4SSatish Balay       jmin++;
10127fbae186SHong Zhang     }
1013444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1014444d8c10SJed Brown     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1015831a3094SHong Zhang     for (j = jmin; j < n; j++) {
101649b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
101749b5e25fSSatish Balay       cval = ib[j] * 5;
101849b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
101949b5e25fSSatish Balay       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
102049b5e25fSSatish Balay       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
102149b5e25fSSatish Balay       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
102249b5e25fSSatish Balay       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
102349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
102449b5e25fSSatish Balay       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
102549b5e25fSSatish Balay       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
102649b5e25fSSatish Balay       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
102749b5e25fSSatish Balay       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
102849b5e25fSSatish Balay       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
102949b5e25fSSatish Balay       v += 25;
103049b5e25fSSatish Balay     }
10319371c9d4SSatish Balay     xb += 5;
10329371c9d4SSatish Balay     ai++;
103349b5e25fSSatish Balay   }
103449b5e25fSSatish Balay 
10359566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
10369566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
103749b5e25fSSatish Balay 
10389566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
10393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
104049b5e25fSSatish Balay }
1041c2916339SPierre Jolivet 
MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)1042d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1043d71ae5a4SJacob Faibussowitsch {
104449b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1045d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1046d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1047d9ca1df4SBarry Smith   const MatScalar   *v;
1048d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1049d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
105098c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
105149b5e25fSSatish Balay 
105249b5e25fSSatish Balay   PetscFunctionBegin;
10539566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
10543ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
10559566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
10569566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
105749b5e25fSSatish Balay 
105849b5e25fSSatish Balay   v  = a->a;
105949b5e25fSSatish Balay   xb = x;
106049b5e25fSSatish Balay 
106149b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
106249b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
10639371c9d4SSatish Balay     x1   = xb[0];
10649371c9d4SSatish Balay     x2   = xb[1];
10659371c9d4SSatish Balay     x3   = xb[2];
10669371c9d4SSatish Balay     x4   = xb[3];
10679371c9d4SSatish Balay     x5   = xb[4];
10689371c9d4SSatish Balay     x6   = xb[5];
106949b5e25fSSatish Balay     ib   = aj + *ai;
1070831a3094SHong Zhang     jmin = 0;
107198c9bda7SSatish Balay     nonzerorow += (n > 0);
107259689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
107349b5e25fSSatish Balay       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
107449b5e25fSSatish Balay       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
107549b5e25fSSatish Balay       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
107649b5e25fSSatish Balay       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
107749b5e25fSSatish Balay       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
107849b5e25fSSatish Balay       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
10799371c9d4SSatish Balay       v += 36;
10809371c9d4SSatish Balay       jmin++;
10817fbae186SHong Zhang     }
1082444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1083444d8c10SJed Brown     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1084831a3094SHong Zhang     for (j = jmin; j < n; j++) {
108549b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
108649b5e25fSSatish Balay       cval = ib[j] * 6;
108749b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
108849b5e25fSSatish Balay       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
108949b5e25fSSatish Balay       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
109049b5e25fSSatish Balay       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
109149b5e25fSSatish Balay       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
109249b5e25fSSatish Balay       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
109349b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
109449b5e25fSSatish Balay       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
109549b5e25fSSatish Balay       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
109649b5e25fSSatish Balay       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
109749b5e25fSSatish Balay       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
109849b5e25fSSatish Balay       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
109949b5e25fSSatish Balay       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
110049b5e25fSSatish Balay       v += 36;
110149b5e25fSSatish Balay     }
11029371c9d4SSatish Balay     xb += 6;
11039371c9d4SSatish Balay     ai++;
110449b5e25fSSatish Balay   }
110549b5e25fSSatish Balay 
11069566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
11079566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
110849b5e25fSSatish Balay 
11099566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
11103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
111149b5e25fSSatish Balay }
111249b5e25fSSatish Balay 
MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)1113d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1114d71ae5a4SJacob Faibussowitsch {
111549b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1116d9ca1df4SBarry Smith   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1117d9ca1df4SBarry Smith   const PetscScalar *x, *xb;
1118d9ca1df4SBarry Smith   const MatScalar   *v;
1119d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1120d9ca1df4SBarry Smith   const PetscInt    *aj = a->j, *ai = a->i, *ib;
112198c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
112249b5e25fSSatish Balay 
112349b5e25fSSatish Balay   PetscFunctionBegin;
11249566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
11253ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
11269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
11279566063dSJacob Faibussowitsch   PetscCall(VecGetArray(zz, &z));
112849b5e25fSSatish Balay 
112949b5e25fSSatish Balay   v  = a->a;
113049b5e25fSSatish Balay   xb = x;
113149b5e25fSSatish Balay 
113249b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
113349b5e25fSSatish Balay     n    = ai[1] - ai[0]; /* length of i_th block row of A */
11349371c9d4SSatish Balay     x1   = xb[0];
11359371c9d4SSatish Balay     x2   = xb[1];
11369371c9d4SSatish Balay     x3   = xb[2];
11379371c9d4SSatish Balay     x4   = xb[3];
11389371c9d4SSatish Balay     x5   = xb[4];
11399371c9d4SSatish Balay     x6   = xb[5];
11409371c9d4SSatish Balay     x7   = xb[6];
114149b5e25fSSatish Balay     ib   = aj + *ai;
1142831a3094SHong Zhang     jmin = 0;
114398c9bda7SSatish Balay     nonzerorow += (n > 0);
114459689ffbSStefano Zampini     if (n && *ib == i) { /* (diag of A)*x */
114549b5e25fSSatish Balay       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
114649b5e25fSSatish Balay       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
114749b5e25fSSatish Balay       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
114849b5e25fSSatish Balay       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
114949b5e25fSSatish Balay       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
115049b5e25fSSatish Balay       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
115149b5e25fSSatish Balay       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
11529371c9d4SSatish Balay       v += 49;
11539371c9d4SSatish Balay       jmin++;
11547fbae186SHong Zhang     }
1155444d8c10SJed Brown     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1156444d8c10SJed Brown     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1157831a3094SHong Zhang     for (j = jmin; j < n; j++) {
115849b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
115949b5e25fSSatish Balay       cval = ib[j] * 7;
116049b5e25fSSatish Balay       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
116149b5e25fSSatish Balay       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
116249b5e25fSSatish Balay       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
116349b5e25fSSatish Balay       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
116449b5e25fSSatish Balay       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
116549b5e25fSSatish Balay       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
116649b5e25fSSatish Balay       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
116749b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
116849b5e25fSSatish Balay       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
116949b5e25fSSatish Balay       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
117049b5e25fSSatish Balay       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
117149b5e25fSSatish Balay       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
117249b5e25fSSatish Balay       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
117349b5e25fSSatish Balay       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
117449b5e25fSSatish Balay       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
117549b5e25fSSatish Balay       v += 49;
117649b5e25fSSatish Balay     }
11779371c9d4SSatish Balay     xb += 7;
11789371c9d4SSatish Balay     ai++;
117949b5e25fSSatish Balay   }
118049b5e25fSSatish Balay 
11819566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
11829566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
118349b5e25fSSatish Balay 
11849566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
11853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
118649b5e25fSSatish Balay }
118749b5e25fSSatish Balay 
MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)1188d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1189d71ae5a4SJacob Faibussowitsch {
119049b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1191f4259b30SLisandro Dalcin   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1192d9ca1df4SBarry Smith   const PetscScalar *x, *x_ptr, *xb;
1193d9ca1df4SBarry Smith   const MatScalar   *v;
1194d9ca1df4SBarry Smith   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1195d9ca1df4SBarry Smith   const PetscInt    *idx, *aj, *ii;
119698c9bda7SSatish Balay   PetscInt           nonzerorow = 0;
119749b5e25fSSatish Balay 
119849b5e25fSSatish Balay   PetscFunctionBegin;
11999566063dSJacob Faibussowitsch   PetscCall(VecCopy(yy, zz));
12003ba16761SJacob Faibussowitsch   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
12019371c9d4SSatish Balay   PetscCall(VecGetArrayRead(xx, &x));
12029371c9d4SSatish Balay   x_ptr = x;
12039371c9d4SSatish Balay   PetscCall(VecGetArray(zz, &z));
12049371c9d4SSatish Balay   z_ptr = z;
120549b5e25fSSatish Balay 
120649b5e25fSSatish Balay   aj = a->j;
120749b5e25fSSatish Balay   v  = a->a;
120849b5e25fSSatish Balay   ii = a->i;
120949b5e25fSSatish Balay 
121048a46eb9SPierre Jolivet   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
121149b5e25fSSatish Balay   work = a->mult_work;
121249b5e25fSSatish Balay 
121349b5e25fSSatish Balay   for (i = 0; i < mbs; i++) {
12149371c9d4SSatish Balay     n     = ii[1] - ii[0];
12159371c9d4SSatish Balay     ncols = n * bs;
12169371c9d4SSatish Balay     workt = work;
12179371c9d4SSatish Balay     idx   = aj + ii[0];
121898c9bda7SSatish Balay     nonzerorow += (n > 0);
121949b5e25fSSatish Balay 
122049b5e25fSSatish Balay     /* upper triangular part */
122149b5e25fSSatish Balay     for (j = 0; j < n; j++) {
122249b5e25fSSatish Balay       xb = x_ptr + bs * (*idx++);
122349b5e25fSSatish Balay       for (k = 0; k < bs; k++) workt[k] = xb[k];
122449b5e25fSSatish Balay       workt += bs;
122549b5e25fSSatish Balay     }
122649b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
122796b95a6bSBarry Smith     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
122849b5e25fSSatish Balay 
122949b5e25fSSatish Balay     /* strict lower triangular part */
1230831a3094SHong Zhang     idx = aj + ii[0];
123159689ffbSStefano Zampini     if (n && *idx == i) {
12329371c9d4SSatish Balay       ncols -= bs;
12339371c9d4SSatish Balay       v += bs2;
12349371c9d4SSatish Balay       idx++;
12359371c9d4SSatish Balay       n--;
1236831a3094SHong Zhang     }
123749b5e25fSSatish Balay     if (ncols > 0) {
123849b5e25fSSatish Balay       workt = work;
12399566063dSJacob Faibussowitsch       PetscCall(PetscArrayzero(workt, ncols));
124096b95a6bSBarry Smith       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1241831a3094SHong Zhang       for (j = 0; j < n; j++) {
1242831a3094SHong Zhang         zb = z_ptr + bs * (*idx++);
124349b5e25fSSatish Balay         for (k = 0; k < bs; k++) zb[k] += workt[k];
124449b5e25fSSatish Balay         workt += bs;
124549b5e25fSSatish Balay       }
124649b5e25fSSatish Balay     }
124749b5e25fSSatish Balay 
12489371c9d4SSatish Balay     x += bs;
12499371c9d4SSatish Balay     v += n * bs2;
12509371c9d4SSatish Balay     z += bs;
12519371c9d4SSatish Balay     ii++;
125249b5e25fSSatish Balay   }
125349b5e25fSSatish Balay 
12549566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
12559566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(zz, &z));
125649b5e25fSSatish Balay 
12579566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
12583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
125949b5e25fSSatish Balay }
126049b5e25fSSatish Balay 
MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)1261d71ae5a4SJacob Faibussowitsch PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1262d71ae5a4SJacob Faibussowitsch {
126349b5e25fSSatish Balay   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1264f4df32b1SMatthew Knepley   PetscScalar   oalpha = alpha;
1265c5df96a5SBarry Smith   PetscBLASInt  one    = 1, totalnz;
126649b5e25fSSatish Balay 
126749b5e25fSSatish Balay   PetscFunctionBegin;
12689566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1269792fecdfSBarry Smith   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
12709566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(totalnz));
12713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
127249b5e25fSSatish Balay }
127349b5e25fSSatish Balay 
MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal * norm)1274d71ae5a4SJacob Faibussowitsch PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1275d71ae5a4SJacob Faibussowitsch {
127649b5e25fSSatish Balay   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1277d9ca1df4SBarry Smith   const MatScalar *v        = a->a;
127849b5e25fSSatish Balay   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1279d9ca1df4SBarry Smith   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1280d9ca1df4SBarry Smith   const PetscInt  *aj = a->j, *col;
128149b5e25fSSatish Balay 
128249b5e25fSSatish Balay   PetscFunctionBegin;
1283c40ae873SPierre Jolivet   if (!a->nz) {
1284c40ae873SPierre Jolivet     *norm = 0.0;
12853ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1286c40ae873SPierre Jolivet   }
128749b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
128849b5e25fSSatish Balay     for (k = 0; k < mbs; k++) {
128959689ffbSStefano Zampini       jmin = a->i[k];
129059689ffbSStefano Zampini       jmax = a->i[k + 1];
1291831a3094SHong Zhang       col  = aj + jmin;
129259689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
129349b5e25fSSatish Balay         for (i = 0; i < bs2; i++) {
12949371c9d4SSatish Balay           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
12959371c9d4SSatish Balay           v++;
129649b5e25fSSatish Balay         }
1297831a3094SHong Zhang         jmin++;
1298831a3094SHong Zhang       }
1299831a3094SHong Zhang       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
130049b5e25fSSatish Balay         for (i = 0; i < bs2; i++) {
13019371c9d4SSatish Balay           sum_off += PetscRealPart(PetscConj(*v) * (*v));
13029371c9d4SSatish Balay           v++;
130349b5e25fSSatish Balay         }
130449b5e25fSSatish Balay       }
130549b5e25fSSatish Balay     }
13068f1a2a5eSBarry Smith     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
13079566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
13080b8dc8d2SHong Zhang   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
13099566063dSJacob Faibussowitsch     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
13100b8dc8d2SHong Zhang     for (i = 0; i < mbs; i++) jl[i] = mbs;
13110b8dc8d2SHong Zhang     il[0] = 0;
131249b5e25fSSatish Balay 
131349b5e25fSSatish Balay     *norm = 0.0;
131449b5e25fSSatish Balay     for (k = 0; k < mbs; k++) { /* k_th block row */
131549b5e25fSSatish Balay       for (j = 0; j < bs; j++) sum[j] = 0.0;
131649b5e25fSSatish Balay       /*-- col sum --*/
131749b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1318bbea24aaSStefano Zampini       /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1319831a3094SHong Zhang                   at step k */
132049b5e25fSSatish Balay       while (i < mbs) {
132149b5e25fSSatish Balay         nexti = jl[i]; /* next block row to be added */
132249b5e25fSSatish Balay         ik    = il[i]; /* block index of A(i,k) in the array a */
132349b5e25fSSatish Balay         for (j = 0; j < bs; j++) {
132449b5e25fSSatish Balay           v = a->a + ik * bs2 + j * bs;
132549b5e25fSSatish Balay           for (k1 = 0; k1 < bs; k1++) {
13269371c9d4SSatish Balay             sum[j] += PetscAbsScalar(*v);
13279371c9d4SSatish Balay             v++;
132849b5e25fSSatish Balay           }
132949b5e25fSSatish Balay         }
133049b5e25fSSatish Balay         /* update il, jl */
1331831a3094SHong Zhang         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1332831a3094SHong Zhang         jmax = a->i[i + 1];
133349b5e25fSSatish Balay         if (jmin < jmax) {
133449b5e25fSSatish Balay           il[i] = jmin;
133549b5e25fSSatish Balay           j     = a->j[jmin];
13369371c9d4SSatish Balay           jl[i] = jl[j];
13379371c9d4SSatish Balay           jl[j] = i;
133849b5e25fSSatish Balay         }
133949b5e25fSSatish Balay         i = nexti;
134049b5e25fSSatish Balay       }
134149b5e25fSSatish Balay       /*-- row sum --*/
134259689ffbSStefano Zampini       jmin = a->i[k];
134359689ffbSStefano Zampini       jmax = a->i[k + 1];
134449b5e25fSSatish Balay       for (i = jmin; i < jmax; i++) {
134549b5e25fSSatish Balay         for (j = 0; j < bs; j++) {
134649b5e25fSSatish Balay           v = a->a + i * bs2 + j;
134749b5e25fSSatish Balay           for (k1 = 0; k1 < bs; k1++) {
13489371c9d4SSatish Balay             sum[j] += PetscAbsScalar(*v);
13499371c9d4SSatish Balay             v += bs;
135049b5e25fSSatish Balay           }
135149b5e25fSSatish Balay         }
135249b5e25fSSatish Balay       }
135349b5e25fSSatish Balay       /* add k_th block row to il, jl */
1354831a3094SHong Zhang       col = aj + jmin;
135559689ffbSStefano Zampini       if (jmax - jmin > 0 && *col == k) jmin++;
135649b5e25fSSatish Balay       if (jmin < jmax) {
135749b5e25fSSatish Balay         il[k] = jmin;
13589371c9d4SSatish Balay         j     = a->j[jmin];
13599371c9d4SSatish Balay         jl[k] = jl[j];
13609371c9d4SSatish Balay         jl[j] = k;
136149b5e25fSSatish Balay       }
136249b5e25fSSatish Balay       for (j = 0; j < bs; j++) {
136349b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
136449b5e25fSSatish Balay       }
136549b5e25fSSatish Balay     }
13669566063dSJacob Faibussowitsch     PetscCall(PetscFree3(sum, il, jl));
13679566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1368f23aa3ddSBarry Smith   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
13693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
137049b5e25fSSatish Balay }
137149b5e25fSSatish Balay 
MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)1372d71ae5a4SJacob Faibussowitsch PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1373d71ae5a4SJacob Faibussowitsch {
137449b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
137549b5e25fSSatish Balay 
137649b5e25fSSatish Balay   PetscFunctionBegin;
137749b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1378d0f46423SBarry Smith   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1379ef511fbeSHong Zhang     *flg = PETSC_FALSE;
13803ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
138149b5e25fSSatish Balay   }
138249b5e25fSSatish Balay 
138349b5e25fSSatish Balay   /* if the a->i are the same */
13849566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
13853ba16761SJacob Faibussowitsch   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
138649b5e25fSSatish Balay 
138749b5e25fSSatish Balay   /* if a->j are the same */
13889566063dSJacob Faibussowitsch   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
13893ba16761SJacob Faibussowitsch   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
139026fbe8dcSKarl Rupp 
139149b5e25fSSatish Balay   /* if a->a are the same */
1392418fb43bSPierre Jolivet   PetscCall(PetscArraycmp(a->a, b->a, a->nz * A->rmap->bs * A->rmap->bs, flg));
13933ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
139449b5e25fSSatish Balay }
139549b5e25fSSatish Balay 
MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)1396d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1397d71ae5a4SJacob Faibussowitsch {
139849b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
13999234b629SBarry Smith   PetscInt         n;
14009234b629SBarry Smith   const PetscInt   bs = A->rmap->bs, ambs = a->mbs, bs2 = a->bs2;
14019234b629SBarry Smith   PetscScalar     *x;
14029234b629SBarry Smith   const MatScalar *aa = a->a, *aa_j;
14039234b629SBarry Smith   const PetscInt  *ai = a->i, *adiag;
14049234b629SBarry Smith   PetscBool        diagDense;
140549b5e25fSSatish Balay 
140649b5e25fSSatish Balay   PetscFunctionBegin;
1407aed4548fSBarry Smith   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
14089234b629SBarry Smith   PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, &adiag, &diagDense));
14098a0c6efdSHong Zhang   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
14109234b629SBarry Smith     PetscCall(VecGetArrayWrite(v, &x));
14119234b629SBarry Smith     for (PetscInt i = 0; i < ambs; i++) x[i] = 1.0 / aa[adiag[i]];
14129234b629SBarry Smith     PetscCall(VecRestoreArrayWrite(v, &x));
14133ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14148a0c6efdSHong Zhang   }
14158a0c6efdSHong Zhang 
14169234b629SBarry Smith   PetscCall(VecGetLocalSize(v, &n));
14179234b629SBarry Smith   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
14189234b629SBarry Smith   PetscCall(VecGetArrayWrite(v, &x));
14199234b629SBarry Smith 
14209234b629SBarry Smith   if (diagDense) {
14219234b629SBarry Smith     for (PetscInt i = 0, row = 0; i < ambs; i++) {
14229234b629SBarry Smith       aa_j = aa + adiag[i] * bs2;
14239234b629SBarry Smith       for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
14249234b629SBarry Smith     }
14259234b629SBarry Smith   } else {
14269234b629SBarry Smith     for (PetscInt i = 0, row = 0; i < ambs; i++) {
14279234b629SBarry Smith       const PetscInt j = adiag[i];
14289234b629SBarry Smith 
14299234b629SBarry Smith       if (j != ai[i + 1]) {
143049b5e25fSSatish Balay         aa_j = aa + j * bs2;
14319234b629SBarry Smith         for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
14329234b629SBarry Smith       } else {
14339234b629SBarry Smith         for (PetscInt k = 0; k < bs; k++) x[row++] = 0.0;
143449b5e25fSSatish Balay       }
143549b5e25fSSatish Balay     }
14369234b629SBarry Smith   }
14379234b629SBarry Smith   PetscCall(VecRestoreArrayWrite(v, &x));
14383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
143949b5e25fSSatish Balay }
144049b5e25fSSatish Balay 
MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)1441d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1442d71ae5a4SJacob Faibussowitsch {
144349b5e25fSSatish Balay   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1444d9ca1df4SBarry Smith   PetscScalar        x;
1445d9ca1df4SBarry Smith   const PetscScalar *l, *li, *ri;
144649b5e25fSSatish Balay   MatScalar         *aa, *v;
1447fff8e43fSBarry Smith   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1448fff8e43fSBarry Smith   const PetscInt    *ai, *aj;
144949b5e25fSSatish Balay 
145049b5e25fSSatish Balay   PetscFunctionBegin;
14513ba16761SJacob Faibussowitsch   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
145249b5e25fSSatish Balay   ai  = a->i;
145349b5e25fSSatish Balay   aj  = a->j;
145449b5e25fSSatish Balay   aa  = a->a;
1455d0f46423SBarry Smith   m   = A->rmap->N;
1456d0f46423SBarry Smith   bs  = A->rmap->bs;
145749b5e25fSSatish Balay   mbs = a->mbs;
145849b5e25fSSatish Balay   bs2 = a->bs2;
145949b5e25fSSatish Balay 
14609566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(ll, &l));
14619566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(ll, &lm));
146208401ef6SPierre Jolivet   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
146349b5e25fSSatish Balay   for (i = 0; i < mbs; i++) { /* for each block row */
146449b5e25fSSatish Balay     M  = ai[i + 1] - ai[i];
146549b5e25fSSatish Balay     li = l + i * bs;
146649b5e25fSSatish Balay     v  = aa + bs2 * ai[i];
146749b5e25fSSatish Balay     for (j = 0; j < M; j++) { /* for each block */
146849b5e25fSSatish Balay       ri = l + bs * aj[ai[i] + j];
14695e90f9d9SHong Zhang       for (k = 0; k < bs; k++) {
147049b5e25fSSatish Balay         x = ri[k];
147149b5e25fSSatish Balay         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
147249b5e25fSSatish Balay       }
147349b5e25fSSatish Balay     }
147449b5e25fSSatish Balay   }
14759566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(ll, &l));
14769566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
14773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
147849b5e25fSSatish Balay }
147949b5e25fSSatish Balay 
MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo * info)1480d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1481d71ae5a4SJacob Faibussowitsch {
148249b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
148349b5e25fSSatish Balay 
148449b5e25fSSatish Balay   PetscFunctionBegin;
148549b5e25fSSatish Balay   info->block_size   = a->bs2;
1486ceed8ce5SJed Brown   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
14876c6c5352SBarry Smith   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
14883966268fSBarry Smith   info->nz_unneeded  = info->nz_allocated - info->nz_used;
148949b5e25fSSatish Balay   info->assemblies   = A->num_ass;
14908e58a170SBarry Smith   info->mallocs      = A->info.mallocs;
14914dfa11a4SJacob Faibussowitsch   info->memory       = 0; /* REVIEW ME */
1492d5f3da31SBarry Smith   if (A->factortype) {
149349b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
149449b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
149549b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
149649b5e25fSSatish Balay   } else {
149749b5e25fSSatish Balay     info->fill_ratio_given  = 0;
149849b5e25fSSatish Balay     info->fill_ratio_needed = 0;
149949b5e25fSSatish Balay     info->factor_mallocs    = 0;
150049b5e25fSSatish Balay   }
15013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
150249b5e25fSSatish Balay }
150349b5e25fSSatish Balay 
MatZeroEntries_SeqSBAIJ(Mat A)1504d71ae5a4SJacob Faibussowitsch PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1505d71ae5a4SJacob Faibussowitsch {
150649b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
150749b5e25fSSatish Balay 
150849b5e25fSSatish Balay   PetscFunctionBegin;
15099566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
15103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
151149b5e25fSSatish Balay }
1512dc354874SHong Zhang 
MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])1513d71ae5a4SJacob Faibussowitsch PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1514d71ae5a4SJacob Faibussowitsch {
1515dc354874SHong Zhang   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1516d9ca1df4SBarry Smith   PetscInt         i, j, n, row, col, bs, mbs;
1517d9ca1df4SBarry Smith   const PetscInt  *ai, *aj;
1518c3fca9a7SHong Zhang   PetscReal        atmp;
1519d9ca1df4SBarry Smith   const MatScalar *aa;
1520985db425SBarry Smith   PetscScalar     *x;
152113f74950SBarry Smith   PetscInt         ncols, brow, bcol, krow, kcol;
1522dc354874SHong Zhang 
1523dc354874SHong Zhang   PetscFunctionBegin;
152428b400f6SJacob Faibussowitsch   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
152528b400f6SJacob Faibussowitsch   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1526d0f46423SBarry Smith   bs  = A->rmap->bs;
1527dc354874SHong Zhang   aa  = a->a;
1528dc354874SHong Zhang   ai  = a->i;
1529dc354874SHong Zhang   aj  = a->j;
153044117c81SHong Zhang   mbs = a->mbs;
1531dc354874SHong Zhang 
15329566063dSJacob Faibussowitsch   PetscCall(VecSet(v, 0.0));
15339566063dSJacob Faibussowitsch   PetscCall(VecGetArray(v, &x));
15349566063dSJacob Faibussowitsch   PetscCall(VecGetLocalSize(v, &n));
153508401ef6SPierre Jolivet   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
153644117c81SHong Zhang   for (i = 0; i < mbs; i++) {
15379371c9d4SSatish Balay     ncols = ai[1] - ai[0];
15389371c9d4SSatish Balay     ai++;
1539d0f6400bSHong Zhang     brow = bs * i;
154044117c81SHong Zhang     for (j = 0; j < ncols; j++) {
1541d0f6400bSHong Zhang       bcol = bs * (*aj);
154244117c81SHong Zhang       for (kcol = 0; kcol < bs; kcol++) {
1543d0f6400bSHong Zhang         col = bcol + kcol; /* col index */
154444117c81SHong Zhang         for (krow = 0; krow < bs; krow++) {
15459371c9d4SSatish Balay           atmp = PetscAbsScalar(*aa);
15469371c9d4SSatish Balay           aa++;
1547d0f6400bSHong Zhang           row = brow + krow; /* row index */
1548c3fca9a7SHong Zhang           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1549c3fca9a7SHong Zhang           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
155044117c81SHong Zhang         }
155144117c81SHong Zhang       }
1552d0f6400bSHong Zhang       aj++;
1553dc354874SHong Zhang     }
1554dc354874SHong Zhang   }
15559566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(v, &x));
15563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1557dc354874SHong Zhang }
1558c2916339SPierre Jolivet 
MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)1559d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1560d71ae5a4SJacob Faibussowitsch {
1561c2916339SPierre Jolivet   PetscFunctionBegin;
15629566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
15634222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
15643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1565c2916339SPierre Jolivet }
1566c2916339SPierre Jolivet 
MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)156766976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1568d71ae5a4SJacob Faibussowitsch {
1569c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1570c2916339SPierre Jolivet   PetscScalar       *z = c;
1571c2916339SPierre Jolivet   const PetscScalar *xb;
1572c2916339SPierre Jolivet   PetscScalar        x1;
1573c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1574c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1575*fc2fb351SPierre Jolivet   const int          aconj = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0;
1576c2916339SPierre Jolivet 
1577c2916339SPierre Jolivet   PetscFunctionBegin;
1578c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
15799371c9d4SSatish Balay     n = ii[1] - ii[0];
15809371c9d4SSatish Balay     ii++;
1581c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1582c2916339SPierre Jolivet     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1583c2916339SPierre Jolivet     jj = idx;
1584c2916339SPierre Jolivet     vv = v;
1585c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1586c2916339SPierre Jolivet       idx = jj;
1587c2916339SPierre Jolivet       v   = vv;
1588c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
15899371c9d4SSatish Balay         xb = b + (*idx);
15909371c9d4SSatish Balay         x1 = xb[0 + k * bm];
1591c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1;
15923c2e41e1SStefano Zampini         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1593c2916339SPierre Jolivet         v += 1;
1594c2916339SPierre Jolivet         ++idx;
1595c2916339SPierre Jolivet       }
1596c2916339SPierre Jolivet     }
1597c2916339SPierre Jolivet     z += 1;
1598c2916339SPierre Jolivet   }
15993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1600c2916339SPierre Jolivet }
1601c2916339SPierre Jolivet 
MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)160266976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1603d71ae5a4SJacob Faibussowitsch {
1604c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1605c2916339SPierre Jolivet   PetscScalar       *z = c;
1606c2916339SPierre Jolivet   const PetscScalar *xb;
1607c2916339SPierre Jolivet   PetscScalar        x1, x2;
1608c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1609c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1610c2916339SPierre Jolivet 
1611c2916339SPierre Jolivet   PetscFunctionBegin;
1612c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16139371c9d4SSatish Balay     n = ii[1] - ii[0];
16149371c9d4SSatish Balay     ii++;
1615c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1616c2916339SPierre Jolivet     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1617c2916339SPierre Jolivet     jj = idx;
1618c2916339SPierre Jolivet     vv = v;
1619c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1620c2916339SPierre Jolivet       idx = jj;
1621c2916339SPierre Jolivet       v   = vv;
1622c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16239371c9d4SSatish Balay         xb = b + 2 * (*idx);
16249371c9d4SSatish Balay         x1 = xb[0 + k * bm];
16259371c9d4SSatish Balay         x2 = xb[1 + k * bm];
1626c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1627c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1628c2916339SPierre Jolivet         if (*idx != i) {
1629c2916339SPierre Jolivet           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1630c2916339SPierre Jolivet           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1631c2916339SPierre Jolivet         }
1632c2916339SPierre Jolivet         v += 4;
1633c2916339SPierre Jolivet         ++idx;
1634c2916339SPierre Jolivet       }
1635c2916339SPierre Jolivet     }
1636c2916339SPierre Jolivet     z += 2;
1637c2916339SPierre Jolivet   }
16383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1639c2916339SPierre Jolivet }
1640c2916339SPierre Jolivet 
MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)164166976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1642d71ae5a4SJacob Faibussowitsch {
1643c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1644c2916339SPierre Jolivet   PetscScalar       *z = c;
1645c2916339SPierre Jolivet   const PetscScalar *xb;
1646c2916339SPierre Jolivet   PetscScalar        x1, x2, x3;
1647c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1648c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1649c2916339SPierre Jolivet 
1650c2916339SPierre Jolivet   PetscFunctionBegin;
1651c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16529371c9d4SSatish Balay     n = ii[1] - ii[0];
16539371c9d4SSatish Balay     ii++;
1654c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1655c2916339SPierre Jolivet     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1656c2916339SPierre Jolivet     jj = idx;
1657c2916339SPierre Jolivet     vv = v;
1658c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1659c2916339SPierre Jolivet       idx = jj;
1660c2916339SPierre Jolivet       v   = vv;
1661c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
16629371c9d4SSatish Balay         xb = b + 3 * (*idx);
16639371c9d4SSatish Balay         x1 = xb[0 + k * bm];
16649371c9d4SSatish Balay         x2 = xb[1 + k * bm];
16659371c9d4SSatish Balay         x3 = xb[2 + k * bm];
1666c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1667c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1668c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1669c2916339SPierre Jolivet         if (*idx != i) {
1670c2916339SPierre Jolivet           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1671c2916339SPierre Jolivet           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1672c2916339SPierre Jolivet           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1673c2916339SPierre Jolivet         }
1674c2916339SPierre Jolivet         v += 9;
1675c2916339SPierre Jolivet         ++idx;
1676c2916339SPierre Jolivet       }
1677c2916339SPierre Jolivet     }
1678c2916339SPierre Jolivet     z += 3;
1679c2916339SPierre Jolivet   }
16803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1681c2916339SPierre Jolivet }
1682c2916339SPierre Jolivet 
MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)168366976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1684d71ae5a4SJacob Faibussowitsch {
1685c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1686c2916339SPierre Jolivet   PetscScalar       *z = c;
1687c2916339SPierre Jolivet   const PetscScalar *xb;
1688c2916339SPierre Jolivet   PetscScalar        x1, x2, x3, x4;
1689c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1690c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1691c2916339SPierre Jolivet 
1692c2916339SPierre Jolivet   PetscFunctionBegin;
1693c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
16949371c9d4SSatish Balay     n = ii[1] - ii[0];
16959371c9d4SSatish Balay     ii++;
1696c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1697c2916339SPierre Jolivet     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1698c2916339SPierre Jolivet     jj = idx;
1699c2916339SPierre Jolivet     vv = v;
1700c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1701c2916339SPierre Jolivet       idx = jj;
1702c2916339SPierre Jolivet       v   = vv;
1703c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
17049371c9d4SSatish Balay         xb = b + 4 * (*idx);
17059371c9d4SSatish Balay         x1 = xb[0 + k * bm];
17069371c9d4SSatish Balay         x2 = xb[1 + k * bm];
17079371c9d4SSatish Balay         x3 = xb[2 + k * bm];
17089371c9d4SSatish Balay         x4 = xb[3 + k * bm];
1709c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1710c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1711c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1712c2916339SPierre Jolivet         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1713c2916339SPierre Jolivet         if (*idx != i) {
1714c2916339SPierre Jolivet           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1715c2916339SPierre Jolivet           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1716c2916339SPierre Jolivet           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1717c2916339SPierre Jolivet           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1718c2916339SPierre Jolivet         }
1719c2916339SPierre Jolivet         v += 16;
1720c2916339SPierre Jolivet         ++idx;
1721c2916339SPierre Jolivet       }
1722c2916339SPierre Jolivet     }
1723c2916339SPierre Jolivet     z += 4;
1724c2916339SPierre Jolivet   }
17253ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1726c2916339SPierre Jolivet }
1727c2916339SPierre Jolivet 
MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)172866976f2fSJacob Faibussowitsch static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1729d71ae5a4SJacob Faibussowitsch {
1730c2916339SPierre Jolivet   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1731c2916339SPierre Jolivet   PetscScalar       *z = c;
1732c2916339SPierre Jolivet   const PetscScalar *xb;
1733c2916339SPierre Jolivet   PetscScalar        x1, x2, x3, x4, x5;
1734c2916339SPierre Jolivet   const MatScalar   *v   = a->a, *vv;
1735c2916339SPierre Jolivet   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1736c2916339SPierre Jolivet 
1737c2916339SPierre Jolivet   PetscFunctionBegin;
1738c2916339SPierre Jolivet   for (i = 0; i < mbs; i++) {
17399371c9d4SSatish Balay     n = ii[1] - ii[0];
17409371c9d4SSatish Balay     ii++;
1741c2916339SPierre Jolivet     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1742c2916339SPierre Jolivet     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1743c2916339SPierre Jolivet     jj = idx;
1744c2916339SPierre Jolivet     vv = v;
1745c2916339SPierre Jolivet     for (k = 0; k < cn; k++) {
1746c2916339SPierre Jolivet       idx = jj;
1747c2916339SPierre Jolivet       v   = vv;
1748c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
17499371c9d4SSatish Balay         xb = b + 5 * (*idx);
17509371c9d4SSatish Balay         x1 = xb[0 + k * bm];
17519371c9d4SSatish Balay         x2 = xb[1 + k * bm];
17529371c9d4SSatish Balay         x3 = xb[2 + k * bm];
17539371c9d4SSatish Balay         x4 = xb[3 + k * bm];
17549371c9d4SSatish Balay         x5 = xb[4 + k * cm];
1755c2916339SPierre Jolivet         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1756c2916339SPierre Jolivet         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1757c2916339SPierre Jolivet         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1758c2916339SPierre Jolivet         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1759c2916339SPierre Jolivet         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1760c2916339SPierre Jolivet         if (*idx != i) {
1761c2916339SPierre Jolivet           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1762c2916339SPierre Jolivet           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1763c2916339SPierre Jolivet           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1764c2916339SPierre Jolivet           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1765c2916339SPierre Jolivet           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1766c2916339SPierre Jolivet         }
1767c2916339SPierre Jolivet         v += 25;
1768c2916339SPierre Jolivet         ++idx;
1769c2916339SPierre Jolivet       }
1770c2916339SPierre Jolivet     }
1771c2916339SPierre Jolivet     z += 5;
1772c2916339SPierre Jolivet   }
17733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1774c2916339SPierre Jolivet }
1775c2916339SPierre Jolivet 
MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)1776d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1777d71ae5a4SJacob Faibussowitsch {
1778c2916339SPierre Jolivet   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1779c2916339SPierre Jolivet   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1780281439baSStefano Zampini   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1781c2916339SPierre Jolivet   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1782c2916339SPierre Jolivet   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1783c2916339SPierre Jolivet   PetscBLASInt     bbs, bcn, bbm, bcm;
1784f4259b30SLisandro Dalcin   PetscScalar     *z = NULL;
1785c2916339SPierre Jolivet   PetscScalar     *c, *b;
1786c2916339SPierre Jolivet   const MatScalar *v;
1787c2916339SPierre Jolivet   const PetscInt  *idx, *ii;
1788c2916339SPierre Jolivet   PetscScalar      _DOne = 1.0;
1789c2916339SPierre Jolivet 
1790c2916339SPierre Jolivet   PetscFunctionBegin;
17913ba16761SJacob Faibussowitsch   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
179208401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
179308401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
179408401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1795c2916339SPierre Jolivet   b = bd->v;
17969566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
17979566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArray(C, &c));
1798c2916339SPierre Jolivet   switch (bs) {
1799d71ae5a4SJacob Faibussowitsch   case 1:
1800d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1801d71ae5a4SJacob Faibussowitsch     break;
1802d71ae5a4SJacob Faibussowitsch   case 2:
1803d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1804d71ae5a4SJacob Faibussowitsch     break;
1805d71ae5a4SJacob Faibussowitsch   case 3:
1806d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1807d71ae5a4SJacob Faibussowitsch     break;
1808d71ae5a4SJacob Faibussowitsch   case 4:
1809d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1810d71ae5a4SJacob Faibussowitsch     break;
1811d71ae5a4SJacob Faibussowitsch   case 5:
1812d71ae5a4SJacob Faibussowitsch     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1813d71ae5a4SJacob Faibussowitsch     break;
1814c2916339SPierre Jolivet   default: /* block sizes larger than 5 by 5 are handled by BLAS */
18159566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bs, &bbs));
18169566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cn, &bcn));
18179566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(bm, &bbm));
18189566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(cm, &bcm));
1819c2916339SPierre Jolivet     idx = a->j;
1820c2916339SPierre Jolivet     v   = a->a;
1821c2916339SPierre Jolivet     mbs = a->mbs;
1822c2916339SPierre Jolivet     ii  = a->i;
1823c2916339SPierre Jolivet     z   = c;
1824c2916339SPierre Jolivet     for (i = 0; i < mbs; i++) {
18259371c9d4SSatish Balay       n = ii[1] - ii[0];
18269371c9d4SSatish Balay       ii++;
1827c2916339SPierre Jolivet       for (j = 0; j < n; j++) {
1828792fecdfSBarry Smith         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1829792fecdfSBarry Smith         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1830c2916339SPierre Jolivet         v += bs2;
1831c2916339SPierre Jolivet       }
1832c2916339SPierre Jolivet       z += bs;
1833c2916339SPierre Jolivet     }
1834c2916339SPierre Jolivet   }
18359566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArray(C, &c));
18369566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
18373ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1838c2916339SPierre Jolivet }
1839