1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <../src/mat/impls/sbaij/seq/sbaij.h>
4 #include <petsc/private/kernels/blockinvert.h>
5 #include <petscbt.h>
6 #include <petscblaslapack.h>
7
MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)8 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
9 {
10 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
11 PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
12 const PetscInt *idx;
13 PetscBT table_out, table_in;
14
15 PetscFunctionBegin;
16 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
17 mbs = a->mbs;
18 ai = a->i;
19 aj = a->j;
20 bs = A->rmap->bs;
21 PetscCall(PetscBTCreate(mbs, &table_out));
22 PetscCall(PetscMalloc1(mbs + 1, &nidx));
23 PetscCall(PetscBTCreate(mbs, &table_in));
24
25 for (i = 0; i < is_max; i++) { /* for each is */
26 isz = 0;
27 PetscCall(PetscBTMemzero(mbs, table_out));
28
29 /* Extract the indices, assume there can be duplicate entries */
30 PetscCall(ISGetIndices(is[i], &idx));
31 PetscCall(ISGetLocalSize(is[i], &n));
32
33 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
34 bcol_max = 0;
35 for (j = 0; j < n; ++j) {
36 brow = idx[j] / bs; /* convert the indices into block indices */
37 PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
38 if (!PetscBTLookupSet(table_out, brow)) {
39 nidx[isz++] = brow;
40 if (bcol_max < brow) bcol_max = brow;
41 }
42 }
43 PetscCall(ISRestoreIndices(is[i], &idx));
44 PetscCall(ISDestroy(&is[i]));
45
46 k = 0;
47 for (j = 0; j < ov; j++) { /* for each overlap */
48 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
49 PetscCall(PetscBTMemzero(mbs, table_in));
50 for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
51
52 n = isz; /* length of the updated is[i] */
53 for (brow = 0; brow < mbs; brow++) {
54 start = ai[brow];
55 end = ai[brow + 1];
56 if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
57 for (l = start; l < end; l++) {
58 bcol = aj[l];
59 if (!PetscBTLookupSet(table_out, bcol)) {
60 nidx[isz++] = bcol;
61 if (bcol_max < bcol) bcol_max = bcol;
62 }
63 }
64 k++;
65 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66 } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
67 for (l = start; l < end; l++) {
68 bcol = aj[l];
69 if (bcol > bcol_max) break;
70 if (PetscBTLookup(table_in, bcol)) {
71 if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
72 break; /* for l = start; l<end ; l++) */
73 }
74 }
75 }
76 }
77 } /* for each overlap */
78 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
79 } /* for each is */
80 PetscCall(PetscBTDestroy(&table_out));
81 PetscCall(PetscFree(nidx));
82 PetscCall(PetscBTDestroy(&table_in));
83 PetscFunctionReturn(PETSC_SUCCESS);
84 }
85
86 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
87 Zero some ops' to avoid invalid use */
MatSeqSBAIJZeroOps_Private(Mat Bseq)88 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
89 {
90 PetscFunctionBegin;
91 PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
92 Bseq->ops->mult = NULL;
93 Bseq->ops->multadd = NULL;
94 Bseq->ops->multtranspose = NULL;
95 Bseq->ops->multtransposeadd = NULL;
96 Bseq->ops->lufactor = NULL;
97 Bseq->ops->choleskyfactor = NULL;
98 Bseq->ops->lufactorsymbolic = NULL;
99 Bseq->ops->choleskyfactorsymbolic = NULL;
100 Bseq->ops->getinertia = NULL;
101 PetscFunctionReturn(PETSC_SUCCESS);
102 }
103
104 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B,PetscBool sym)105 static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
106 {
107 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
108 Mat_SeqBAIJ *d = NULL;
109 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
111 const PetscInt *irow, *icol;
112 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113 PetscInt *aj = a->j, *ai = a->i;
114 MatScalar *mat_a;
115 Mat C;
116 PetscBool flag;
117
118 PetscFunctionBegin;
119 PetscCall(ISGetIndices(isrow, &irow));
120 PetscCall(ISGetIndices(iscol, &icol));
121 PetscCall(ISGetLocalSize(isrow, &nrows));
122 PetscCall(ISGetLocalSize(iscol, &ncols));
123
124 PetscCall(PetscCalloc1(1 + oldcols, &smap));
125 ssmap = smap;
126 PetscCall(PetscMalloc1(1 + nrows, &lens));
127 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
128 /* determine lens of each row */
129 for (i = 0; i < nrows; i++) {
130 kstart = ai[irow[i]];
131 kend = kstart + a->ilen[irow[i]];
132 lens[i] = 0;
133 for (k = kstart; k < kend; k++) {
134 if (ssmap[aj[k]]) lens[i]++;
135 }
136 }
137 /* Create and fill new matrix */
138 if (scall == MAT_REUSE_MATRIX) {
139 if (sym) {
140 c = (Mat_SeqSBAIJ *)((*B)->data);
141
142 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
145 PetscCall(PetscArrayzero(c->ilen, c->mbs));
146 } else {
147 d = (Mat_SeqBAIJ *)((*B)->data);
148
149 PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
150 PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
151 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
152 PetscCall(PetscArrayzero(d->ilen, d->mbs));
153 }
154 C = *B;
155 } else {
156 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
157 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
158 if (sym) {
159 PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
160 PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
161 } else {
162 PetscCall(MatSetType(C, MATSEQBAIJ));
163 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
164 }
165 }
166 if (sym) c = (Mat_SeqSBAIJ *)C->data;
167 else d = (Mat_SeqBAIJ *)C->data;
168 for (i = 0; i < nrows; i++) {
169 row = irow[i];
170 kstart = ai[row];
171 kend = kstart + a->ilen[row];
172 if (sym) {
173 mat_i = c->i[i];
174 mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
175 mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
176 mat_ilen = c->ilen + i;
177 } else {
178 mat_i = d->i[i];
179 mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
180 mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
181 mat_ilen = d->ilen + i;
182 }
183 for (k = kstart; k < kend; k++) {
184 if ((tcol = ssmap[a->j[k]])) {
185 *mat_j++ = tcol - 1;
186 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
187 mat_a += bs2;
188 (*mat_ilen)++;
189 }
190 }
191 }
192 /* sort */
193 {
194 MatScalar *work;
195
196 PetscCall(PetscMalloc1(bs2, &work));
197 for (i = 0; i < nrows; i++) {
198 PetscInt ilen;
199 if (sym) {
200 mat_i = c->i[i];
201 mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
202 mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
203 ilen = c->ilen[i];
204 } else {
205 mat_i = d->i[i];
206 mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
207 mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
208 ilen = d->ilen[i];
209 }
210 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
211 }
212 PetscCall(PetscFree(work));
213 }
214
215 /* Free work space */
216 PetscCall(ISRestoreIndices(iscol, &icol));
217 PetscCall(PetscFree(smap));
218 PetscCall(PetscFree(lens));
219 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
220 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
221
222 PetscCall(ISRestoreIndices(isrow, &irow));
223 *B = C;
224 PetscFunctionReturn(PETSC_SUCCESS);
225 }
226
MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)227 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
228 {
229 Mat C[2];
230 IS is1, is2, intersect = NULL;
231 PetscInt n1, n2, ni;
232 PetscBool sym = PETSC_TRUE;
233
234 PetscFunctionBegin;
235 PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
236 if (isrow == iscol) {
237 is2 = is1;
238 PetscCall(PetscObjectReference((PetscObject)is2));
239 } else {
240 PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
241 PetscCall(ISIntersect(is1, is2, &intersect));
242 PetscCall(ISGetLocalSize(intersect, &ni));
243 PetscCall(ISDestroy(&intersect));
244 if (ni == 0) sym = PETSC_FALSE;
245 else if (PetscDefined(USE_DEBUG)) {
246 PetscCall(ISGetLocalSize(is1, &n1));
247 PetscCall(ISGetLocalSize(is2, &n2));
248 PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
249 }
250 }
251 if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
252 else {
253 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
254 PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
255 PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
256 PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
257 PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
258 if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
259 else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
260 else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
261 PetscCall(MatDestroy(C));
262 PetscCall(MatDestroy(C + 1));
263 }
264 PetscCall(ISDestroy(&is1));
265 PetscCall(ISDestroy(&is2));
266
267 if (sym && isrow != iscol) {
268 PetscBool isequal;
269 PetscCall(ISEqual(isrow, iscol, &isequal));
270 if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
271 }
272 PetscFunctionReturn(PETSC_SUCCESS);
273 }
274
MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat * B[])275 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
276 {
277 PetscInt i;
278
279 PetscFunctionBegin;
280 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
281
282 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
283 PetscFunctionReturn(PETSC_SUCCESS);
284 }
285
286 /* Should check that shapes of vectors and matrices match */
MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)287 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
288 {
289 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
290 PetscScalar *z, x1, x2, zero = 0.0;
291 const PetscScalar *x, *xb;
292 const MatScalar *v;
293 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
294 const PetscInt *aj = a->j, *ai = a->i, *ib;
295 PetscInt nonzerorow = 0;
296
297 PetscFunctionBegin;
298 PetscCall(VecSet(zz, zero));
299 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
300 PetscCall(VecGetArrayRead(xx, &x));
301 PetscCall(VecGetArray(zz, &z));
302
303 v = a->a;
304 xb = x;
305
306 for (i = 0; i < mbs; i++) {
307 n = ai[1] - ai[0]; /* length of i_th block row of A */
308 x1 = xb[0];
309 x2 = xb[1];
310 ib = aj + *ai;
311 jmin = 0;
312 nonzerorow += (n > 0);
313 if (*ib == i) { /* (diag of A)*x */
314 z[2 * i] += v[0] * x1 + v[2] * x2;
315 z[2 * i + 1] += v[2] * x1 + v[3] * x2;
316 v += 4;
317 jmin++;
318 }
319 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
321 for (j = jmin; j < n; j++) {
322 /* (strict lower triangular part of A)*x */
323 cval = ib[j] * 2;
324 z[cval] += v[0] * x1 + v[1] * x2;
325 z[cval + 1] += v[2] * x1 + v[3] * x2;
326 /* (strict upper triangular part of A)*x */
327 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
328 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
329 v += 4;
330 }
331 xb += 2;
332 ai++;
333 }
334
335 PetscCall(VecRestoreArrayRead(xx, &x));
336 PetscCall(VecRestoreArray(zz, &z));
337 PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
338 PetscFunctionReturn(PETSC_SUCCESS);
339 }
340
MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)341 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
342 {
343 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
344 PetscScalar *z, x1, x2, x3, zero = 0.0;
345 const PetscScalar *x, *xb;
346 const MatScalar *v;
347 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
348 const PetscInt *aj = a->j, *ai = a->i, *ib;
349 PetscInt nonzerorow = 0;
350
351 PetscFunctionBegin;
352 PetscCall(VecSet(zz, zero));
353 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
354 PetscCall(VecGetArrayRead(xx, &x));
355 PetscCall(VecGetArray(zz, &z));
356
357 v = a->a;
358 xb = x;
359
360 for (i = 0; i < mbs; i++) {
361 n = ai[1] - ai[0]; /* length of i_th block row of A */
362 x1 = xb[0];
363 x2 = xb[1];
364 x3 = xb[2];
365 ib = aj + *ai;
366 jmin = 0;
367 nonzerorow += (n > 0);
368 if (*ib == i) { /* (diag of A)*x */
369 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
370 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
371 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
372 v += 9;
373 jmin++;
374 }
375 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
376 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
377 for (j = jmin; j < n; j++) {
378 /* (strict lower triangular part of A)*x */
379 cval = ib[j] * 3;
380 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
381 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
382 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
383 /* (strict upper triangular part of A)*x */
384 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
385 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
386 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
387 v += 9;
388 }
389 xb += 3;
390 ai++;
391 }
392
393 PetscCall(VecRestoreArrayRead(xx, &x));
394 PetscCall(VecRestoreArray(zz, &z));
395 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
396 PetscFunctionReturn(PETSC_SUCCESS);
397 }
398
MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)399 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
400 {
401 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
402 PetscScalar *z, x1, x2, x3, x4, zero = 0.0;
403 const PetscScalar *x, *xb;
404 const MatScalar *v;
405 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
406 const PetscInt *aj = a->j, *ai = a->i, *ib;
407 PetscInt nonzerorow = 0;
408
409 PetscFunctionBegin;
410 PetscCall(VecSet(zz, zero));
411 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
412 PetscCall(VecGetArrayRead(xx, &x));
413 PetscCall(VecGetArray(zz, &z));
414
415 v = a->a;
416 xb = x;
417
418 for (i = 0; i < mbs; i++) {
419 n = ai[1] - ai[0]; /* length of i_th block row of A */
420 x1 = xb[0];
421 x2 = xb[1];
422 x3 = xb[2];
423 x4 = xb[3];
424 ib = aj + *ai;
425 jmin = 0;
426 nonzerorow += (n > 0);
427 if (*ib == i) { /* (diag of A)*x */
428 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
429 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
430 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
431 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
432 v += 16;
433 jmin++;
434 }
435 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
436 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
437 for (j = jmin; j < n; j++) {
438 /* (strict lower triangular part of A)*x */
439 cval = ib[j] * 4;
440 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
441 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
442 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
443 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
444 /* (strict upper triangular part of A)*x */
445 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
446 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
447 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
448 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
449 v += 16;
450 }
451 xb += 4;
452 ai++;
453 }
454
455 PetscCall(VecRestoreArrayRead(xx, &x));
456 PetscCall(VecRestoreArray(zz, &z));
457 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
458 PetscFunctionReturn(PETSC_SUCCESS);
459 }
460
MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)461 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
462 {
463 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
464 PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0;
465 const PetscScalar *x, *xb;
466 const MatScalar *v;
467 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
468 const PetscInt *aj = a->j, *ai = a->i, *ib;
469 PetscInt nonzerorow = 0;
470
471 PetscFunctionBegin;
472 PetscCall(VecSet(zz, zero));
473 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
474 PetscCall(VecGetArrayRead(xx, &x));
475 PetscCall(VecGetArray(zz, &z));
476
477 v = a->a;
478 xb = x;
479
480 for (i = 0; i < mbs; i++) {
481 n = ai[1] - ai[0]; /* length of i_th block row of A */
482 x1 = xb[0];
483 x2 = xb[1];
484 x3 = xb[2];
485 x4 = xb[3];
486 x5 = xb[4];
487 ib = aj + *ai;
488 jmin = 0;
489 nonzerorow += (n > 0);
490 if (*ib == i) { /* (diag of A)*x */
491 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
492 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
493 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
494 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
495 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
496 v += 25;
497 jmin++;
498 }
499 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
500 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
501 for (j = jmin; j < n; j++) {
502 /* (strict lower triangular part of A)*x */
503 cval = ib[j] * 5;
504 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
505 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
506 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
507 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
508 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
509 /* (strict upper triangular part of A)*x */
510 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
511 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
512 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
513 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
514 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
515 v += 25;
516 }
517 xb += 5;
518 ai++;
519 }
520
521 PetscCall(VecRestoreArrayRead(xx, &x));
522 PetscCall(VecRestoreArray(zz, &z));
523 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
524 PetscFunctionReturn(PETSC_SUCCESS);
525 }
526
MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)527 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
528 {
529 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
530 PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
531 const PetscScalar *x, *xb;
532 const MatScalar *v;
533 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
534 const PetscInt *aj = a->j, *ai = a->i, *ib;
535 PetscInt nonzerorow = 0;
536
537 PetscFunctionBegin;
538 PetscCall(VecSet(zz, zero));
539 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
540 PetscCall(VecGetArrayRead(xx, &x));
541 PetscCall(VecGetArray(zz, &z));
542
543 v = a->a;
544 xb = x;
545
546 for (i = 0; i < mbs; i++) {
547 n = ai[1] - ai[0]; /* length of i_th block row of A */
548 x1 = xb[0];
549 x2 = xb[1];
550 x3 = xb[2];
551 x4 = xb[3];
552 x5 = xb[4];
553 x6 = xb[5];
554 ib = aj + *ai;
555 jmin = 0;
556 nonzerorow += (n > 0);
557 if (*ib == i) { /* (diag of A)*x */
558 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
559 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
560 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
561 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
562 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
563 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
564 v += 36;
565 jmin++;
566 }
567 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
568 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
569 for (j = jmin; j < n; j++) {
570 /* (strict lower triangular part of A)*x */
571 cval = ib[j] * 6;
572 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
573 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
574 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
575 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
576 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
577 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
578 /* (strict upper triangular part of A)*x */
579 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
580 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
581 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
582 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
583 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
584 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
585 v += 36;
586 }
587 xb += 6;
588 ai++;
589 }
590
591 PetscCall(VecRestoreArrayRead(xx, &x));
592 PetscCall(VecRestoreArray(zz, &z));
593 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
594 PetscFunctionReturn(PETSC_SUCCESS);
595 }
596
MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)597 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
598 {
599 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
600 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
601 const PetscScalar *x, *xb;
602 const MatScalar *v;
603 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
604 const PetscInt *aj = a->j, *ai = a->i, *ib;
605 PetscInt nonzerorow = 0;
606
607 PetscFunctionBegin;
608 PetscCall(VecSet(zz, zero));
609 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
610 PetscCall(VecGetArrayRead(xx, &x));
611 PetscCall(VecGetArray(zz, &z));
612
613 v = a->a;
614 xb = x;
615
616 for (i = 0; i < mbs; i++) {
617 n = ai[1] - ai[0]; /* length of i_th block row of A */
618 x1 = xb[0];
619 x2 = xb[1];
620 x3 = xb[2];
621 x4 = xb[3];
622 x5 = xb[4];
623 x6 = xb[5];
624 x7 = xb[6];
625 ib = aj + *ai;
626 jmin = 0;
627 nonzerorow += (n > 0);
628 if (*ib == i) { /* (diag of A)*x */
629 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
630 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
631 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
632 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
633 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
634 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
635 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
636 v += 49;
637 jmin++;
638 }
639 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
640 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
641 for (j = jmin; j < n; j++) {
642 /* (strict lower triangular part of A)*x */
643 cval = ib[j] * 7;
644 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
645 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
646 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
647 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
648 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
649 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
650 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
651 /* (strict upper triangular part of A)*x */
652 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
653 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
654 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
655 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
656 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
657 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
658 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
659 v += 49;
660 }
661 xb += 7;
662 ai++;
663 }
664 PetscCall(VecRestoreArrayRead(xx, &x));
665 PetscCall(VecRestoreArray(zz, &z));
666 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
667 PetscFunctionReturn(PETSC_SUCCESS);
668 }
669
670 /*
671 This will not work with MatScalar == float because it calls the BLAS
672 */
MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)673 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
674 {
675 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
676 PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
677 const PetscScalar *x, *x_ptr, *xb;
678 const MatScalar *v;
679 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
680 const PetscInt *idx, *aj, *ii;
681 PetscInt nonzerorow = 0;
682
683 PetscFunctionBegin;
684 PetscCall(VecSet(zz, zero));
685 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
686 PetscCall(VecGetArrayRead(xx, &x));
687 PetscCall(VecGetArray(zz, &z));
688
689 x_ptr = x;
690 z_ptr = z;
691
692 aj = a->j;
693 v = a->a;
694 ii = a->i;
695
696 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
697 work = a->mult_work;
698
699 for (i = 0; i < mbs; i++) {
700 n = ii[1] - ii[0];
701 ncols = n * bs;
702 workt = work;
703 idx = aj + ii[0];
704 nonzerorow += (n > 0);
705
706 /* upper triangular part */
707 for (j = 0; j < n; j++) {
708 xb = x_ptr + bs * (*idx++);
709 for (k = 0; k < bs; k++) workt[k] = xb[k];
710 workt += bs;
711 }
712 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
713 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
714
715 /* strict lower triangular part */
716 idx = aj + ii[0];
717 if (n && *idx == i) {
718 ncols -= bs;
719 v += bs2;
720 idx++;
721 n--;
722 }
723
724 if (ncols > 0) {
725 workt = work;
726 PetscCall(PetscArrayzero(workt, ncols));
727 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
728 for (j = 0; j < n; j++) {
729 zb = z_ptr + bs * (*idx++);
730 for (k = 0; k < bs; k++) zb[k] += workt[k];
731 workt += bs;
732 }
733 }
734 x += bs;
735 v += n * bs2;
736 z += bs;
737 ii++;
738 }
739
740 PetscCall(VecRestoreArrayRead(xx, &x));
741 PetscCall(VecRestoreArray(zz, &z));
742 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
743 PetscFunctionReturn(PETSC_SUCCESS);
744 }
745
MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)746 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
747 {
748 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
749 PetscScalar *z, x1;
750 const PetscScalar *x, *xb;
751 const MatScalar *v;
752 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
753 const PetscInt *aj = a->j, *ai = a->i, *ib;
754 PetscInt nonzerorow = 0;
755 const int aconj = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0;
756
757 PetscFunctionBegin;
758 PetscCall(VecCopy(yy, zz));
759 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
760 PetscCall(VecGetArrayRead(xx, &x));
761 PetscCall(VecGetArray(zz, &z));
762 v = a->a;
763 xb = x;
764
765 for (i = 0; i < mbs; i++) {
766 n = ai[1] - ai[0]; /* length of i_th row of A */
767 x1 = xb[0];
768 ib = aj + *ai;
769 jmin = 0;
770 nonzerorow += (n > 0);
771 if (n && *ib == i) { /* (diag of A)*x */
772 z[i] += *v++ * x[*ib++];
773 jmin++;
774 }
775 if (aconj) {
776 for (j = jmin; j < n; j++) {
777 cval = *ib;
778 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
779 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
780 }
781 } else {
782 for (j = jmin; j < n; j++) {
783 cval = *ib;
784 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
785 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
786 }
787 }
788 xb++;
789 ai++;
790 }
791
792 PetscCall(VecRestoreArrayRead(xx, &x));
793 PetscCall(VecRestoreArray(zz, &z));
794
795 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
796 PetscFunctionReturn(PETSC_SUCCESS);
797 }
798
MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)799 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
800 {
801 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
802 PetscScalar *z, x1, x2;
803 const PetscScalar *x, *xb;
804 const MatScalar *v;
805 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
806 const PetscInt *aj = a->j, *ai = a->i, *ib;
807 PetscInt nonzerorow = 0;
808
809 PetscFunctionBegin;
810 PetscCall(VecCopy(yy, zz));
811 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
812 PetscCall(VecGetArrayRead(xx, &x));
813 PetscCall(VecGetArray(zz, &z));
814
815 v = a->a;
816 xb = x;
817
818 for (i = 0; i < mbs; i++) {
819 n = ai[1] - ai[0]; /* length of i_th block row of A */
820 x1 = xb[0];
821 x2 = xb[1];
822 ib = aj + *ai;
823 jmin = 0;
824 nonzerorow += (n > 0);
825 if (n && *ib == i) { /* (diag of A)*x */
826 z[2 * i] += v[0] * x1 + v[2] * x2;
827 z[2 * i + 1] += v[2] * x1 + v[3] * x2;
828 v += 4;
829 jmin++;
830 }
831 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
832 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
833 for (j = jmin; j < n; j++) {
834 /* (strict lower triangular part of A)*x */
835 cval = ib[j] * 2;
836 z[cval] += v[0] * x1 + v[1] * x2;
837 z[cval + 1] += v[2] * x1 + v[3] * x2;
838 /* (strict upper triangular part of A)*x */
839 z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
840 z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
841 v += 4;
842 }
843 xb += 2;
844 ai++;
845 }
846 PetscCall(VecRestoreArrayRead(xx, &x));
847 PetscCall(VecRestoreArray(zz, &z));
848
849 PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
850 PetscFunctionReturn(PETSC_SUCCESS);
851 }
852
MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)853 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
854 {
855 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
856 PetscScalar *z, x1, x2, x3;
857 const PetscScalar *x, *xb;
858 const MatScalar *v;
859 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
860 const PetscInt *aj = a->j, *ai = a->i, *ib;
861 PetscInt nonzerorow = 0;
862
863 PetscFunctionBegin;
864 PetscCall(VecCopy(yy, zz));
865 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
866 PetscCall(VecGetArrayRead(xx, &x));
867 PetscCall(VecGetArray(zz, &z));
868
869 v = a->a;
870 xb = x;
871
872 for (i = 0; i < mbs; i++) {
873 n = ai[1] - ai[0]; /* length of i_th block row of A */
874 x1 = xb[0];
875 x2 = xb[1];
876 x3 = xb[2];
877 ib = aj + *ai;
878 jmin = 0;
879 nonzerorow += (n > 0);
880 if (n && *ib == i) { /* (diag of A)*x */
881 z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
882 z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
883 z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
884 v += 9;
885 jmin++;
886 }
887 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
888 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
889 for (j = jmin; j < n; j++) {
890 /* (strict lower triangular part of A)*x */
891 cval = ib[j] * 3;
892 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
893 z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
894 z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
895 /* (strict upper triangular part of A)*x */
896 z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
897 z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
898 z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
899 v += 9;
900 }
901 xb += 3;
902 ai++;
903 }
904
905 PetscCall(VecRestoreArrayRead(xx, &x));
906 PetscCall(VecRestoreArray(zz, &z));
907
908 PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
909 PetscFunctionReturn(PETSC_SUCCESS);
910 }
911
MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)912 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
913 {
914 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
915 PetscScalar *z, x1, x2, x3, x4;
916 const PetscScalar *x, *xb;
917 const MatScalar *v;
918 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
919 const PetscInt *aj = a->j, *ai = a->i, *ib;
920 PetscInt nonzerorow = 0;
921
922 PetscFunctionBegin;
923 PetscCall(VecCopy(yy, zz));
924 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
925 PetscCall(VecGetArrayRead(xx, &x));
926 PetscCall(VecGetArray(zz, &z));
927
928 v = a->a;
929 xb = x;
930
931 for (i = 0; i < mbs; i++) {
932 n = ai[1] - ai[0]; /* length of i_th block row of A */
933 x1 = xb[0];
934 x2 = xb[1];
935 x3 = xb[2];
936 x4 = xb[3];
937 ib = aj + *ai;
938 jmin = 0;
939 nonzerorow += (n > 0);
940 if (n && *ib == i) { /* (diag of A)*x */
941 z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
942 z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
943 z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
944 z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
945 v += 16;
946 jmin++;
947 }
948 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
949 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
950 for (j = jmin; j < n; j++) {
951 /* (strict lower triangular part of A)*x */
952 cval = ib[j] * 4;
953 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
954 z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
955 z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
956 z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
957 /* (strict upper triangular part of A)*x */
958 z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
959 z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
960 z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
961 z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
962 v += 16;
963 }
964 xb += 4;
965 ai++;
966 }
967
968 PetscCall(VecRestoreArrayRead(xx, &x));
969 PetscCall(VecRestoreArray(zz, &z));
970
971 PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
972 PetscFunctionReturn(PETSC_SUCCESS);
973 }
974
MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)975 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
976 {
977 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
978 PetscScalar *z, x1, x2, x3, x4, x5;
979 const PetscScalar *x, *xb;
980 const MatScalar *v;
981 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
982 const PetscInt *aj = a->j, *ai = a->i, *ib;
983 PetscInt nonzerorow = 0;
984
985 PetscFunctionBegin;
986 PetscCall(VecCopy(yy, zz));
987 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
988 PetscCall(VecGetArrayRead(xx, &x));
989 PetscCall(VecGetArray(zz, &z));
990
991 v = a->a;
992 xb = x;
993
994 for (i = 0; i < mbs; i++) {
995 n = ai[1] - ai[0]; /* length of i_th block row of A */
996 x1 = xb[0];
997 x2 = xb[1];
998 x3 = xb[2];
999 x4 = xb[3];
1000 x5 = xb[4];
1001 ib = aj + *ai;
1002 jmin = 0;
1003 nonzerorow += (n > 0);
1004 if (n && *ib == i) { /* (diag of A)*x */
1005 z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1006 z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1007 z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1008 z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
1009 z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1010 v += 25;
1011 jmin++;
1012 }
1013 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1014 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1015 for (j = jmin; j < n; j++) {
1016 /* (strict lower triangular part of A)*x */
1017 cval = ib[j] * 5;
1018 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1019 z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1020 z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1021 z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1022 z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1023 /* (strict upper triangular part of A)*x */
1024 z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1025 z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1026 z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1027 z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1028 z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1029 v += 25;
1030 }
1031 xb += 5;
1032 ai++;
1033 }
1034
1035 PetscCall(VecRestoreArrayRead(xx, &x));
1036 PetscCall(VecRestoreArray(zz, &z));
1037
1038 PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1039 PetscFunctionReturn(PETSC_SUCCESS);
1040 }
1041
MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)1042 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1043 {
1044 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1045 PetscScalar *z, x1, x2, x3, x4, x5, x6;
1046 const PetscScalar *x, *xb;
1047 const MatScalar *v;
1048 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1049 const PetscInt *aj = a->j, *ai = a->i, *ib;
1050 PetscInt nonzerorow = 0;
1051
1052 PetscFunctionBegin;
1053 PetscCall(VecCopy(yy, zz));
1054 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1055 PetscCall(VecGetArrayRead(xx, &x));
1056 PetscCall(VecGetArray(zz, &z));
1057
1058 v = a->a;
1059 xb = x;
1060
1061 for (i = 0; i < mbs; i++) {
1062 n = ai[1] - ai[0]; /* length of i_th block row of A */
1063 x1 = xb[0];
1064 x2 = xb[1];
1065 x3 = xb[2];
1066 x4 = xb[3];
1067 x5 = xb[4];
1068 x6 = xb[5];
1069 ib = aj + *ai;
1070 jmin = 0;
1071 nonzerorow += (n > 0);
1072 if (n && *ib == i) { /* (diag of A)*x */
1073 z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1074 z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1075 z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1076 z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1077 z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1078 z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1079 v += 36;
1080 jmin++;
1081 }
1082 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1083 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1084 for (j = jmin; j < n; j++) {
1085 /* (strict lower triangular part of A)*x */
1086 cval = ib[j] * 6;
1087 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1088 z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1089 z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1090 z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1091 z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1092 z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1093 /* (strict upper triangular part of A)*x */
1094 z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1095 z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1096 z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1097 z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1098 z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1099 z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1100 v += 36;
1101 }
1102 xb += 6;
1103 ai++;
1104 }
1105
1106 PetscCall(VecRestoreArrayRead(xx, &x));
1107 PetscCall(VecRestoreArray(zz, &z));
1108
1109 PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1110 PetscFunctionReturn(PETSC_SUCCESS);
1111 }
1112
MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)1113 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1114 {
1115 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1116 PetscScalar *z, x1, x2, x3, x4, x5, x6, x7;
1117 const PetscScalar *x, *xb;
1118 const MatScalar *v;
1119 PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1120 const PetscInt *aj = a->j, *ai = a->i, *ib;
1121 PetscInt nonzerorow = 0;
1122
1123 PetscFunctionBegin;
1124 PetscCall(VecCopy(yy, zz));
1125 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1126 PetscCall(VecGetArrayRead(xx, &x));
1127 PetscCall(VecGetArray(zz, &z));
1128
1129 v = a->a;
1130 xb = x;
1131
1132 for (i = 0; i < mbs; i++) {
1133 n = ai[1] - ai[0]; /* length of i_th block row of A */
1134 x1 = xb[0];
1135 x2 = xb[1];
1136 x3 = xb[2];
1137 x4 = xb[3];
1138 x5 = xb[4];
1139 x6 = xb[5];
1140 x7 = xb[6];
1141 ib = aj + *ai;
1142 jmin = 0;
1143 nonzerorow += (n > 0);
1144 if (n && *ib == i) { /* (diag of A)*x */
1145 z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1146 z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1147 z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1148 z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1149 z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1150 z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1151 z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1152 v += 49;
1153 jmin++;
1154 }
1155 PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1156 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1157 for (j = jmin; j < n; j++) {
1158 /* (strict lower triangular part of A)*x */
1159 cval = ib[j] * 7;
1160 z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1161 z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1162 z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1163 z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1164 z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1165 z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1166 z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1167 /* (strict upper triangular part of A)*x */
1168 z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1169 z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1170 z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1171 z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1172 z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1173 z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1174 z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1175 v += 49;
1176 }
1177 xb += 7;
1178 ai++;
1179 }
1180
1181 PetscCall(VecRestoreArrayRead(xx, &x));
1182 PetscCall(VecRestoreArray(zz, &z));
1183
1184 PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1185 PetscFunctionReturn(PETSC_SUCCESS);
1186 }
1187
MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)1188 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1189 {
1190 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1191 PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt;
1192 const PetscScalar *x, *x_ptr, *xb;
1193 const MatScalar *v;
1194 PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1195 const PetscInt *idx, *aj, *ii;
1196 PetscInt nonzerorow = 0;
1197
1198 PetscFunctionBegin;
1199 PetscCall(VecCopy(yy, zz));
1200 if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1201 PetscCall(VecGetArrayRead(xx, &x));
1202 x_ptr = x;
1203 PetscCall(VecGetArray(zz, &z));
1204 z_ptr = z;
1205
1206 aj = a->j;
1207 v = a->a;
1208 ii = a->i;
1209
1210 if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1211 work = a->mult_work;
1212
1213 for (i = 0; i < mbs; i++) {
1214 n = ii[1] - ii[0];
1215 ncols = n * bs;
1216 workt = work;
1217 idx = aj + ii[0];
1218 nonzerorow += (n > 0);
1219
1220 /* upper triangular part */
1221 for (j = 0; j < n; j++) {
1222 xb = x_ptr + bs * (*idx++);
1223 for (k = 0; k < bs; k++) workt[k] = xb[k];
1224 workt += bs;
1225 }
1226 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1227 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1228
1229 /* strict lower triangular part */
1230 idx = aj + ii[0];
1231 if (n && *idx == i) {
1232 ncols -= bs;
1233 v += bs2;
1234 idx++;
1235 n--;
1236 }
1237 if (ncols > 0) {
1238 workt = work;
1239 PetscCall(PetscArrayzero(workt, ncols));
1240 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1241 for (j = 0; j < n; j++) {
1242 zb = z_ptr + bs * (*idx++);
1243 for (k = 0; k < bs; k++) zb[k] += workt[k];
1244 workt += bs;
1245 }
1246 }
1247
1248 x += bs;
1249 v += n * bs2;
1250 z += bs;
1251 ii++;
1252 }
1253
1254 PetscCall(VecRestoreArrayRead(xx, &x));
1255 PetscCall(VecRestoreArray(zz, &z));
1256
1257 PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1258 PetscFunctionReturn(PETSC_SUCCESS);
1259 }
1260
MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)1261 PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1262 {
1263 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data;
1264 PetscScalar oalpha = alpha;
1265 PetscBLASInt one = 1, totalnz;
1266
1267 PetscFunctionBegin;
1268 PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1269 PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1270 PetscCall(PetscLogFlops(totalnz));
1271 PetscFunctionReturn(PETSC_SUCCESS);
1272 }
1273
MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal * norm)1274 PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1275 {
1276 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1277 const MatScalar *v = a->a;
1278 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1279 PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1280 const PetscInt *aj = a->j, *col;
1281
1282 PetscFunctionBegin;
1283 if (!a->nz) {
1284 *norm = 0.0;
1285 PetscFunctionReturn(PETSC_SUCCESS);
1286 }
1287 if (type == NORM_FROBENIUS) {
1288 for (k = 0; k < mbs; k++) {
1289 jmin = a->i[k];
1290 jmax = a->i[k + 1];
1291 col = aj + jmin;
1292 if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1293 for (i = 0; i < bs2; i++) {
1294 sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1295 v++;
1296 }
1297 jmin++;
1298 }
1299 for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1300 for (i = 0; i < bs2; i++) {
1301 sum_off += PetscRealPart(PetscConj(*v) * (*v));
1302 v++;
1303 }
1304 }
1305 }
1306 *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1307 PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1308 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1309 PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1310 for (i = 0; i < mbs; i++) jl[i] = mbs;
1311 il[0] = 0;
1312
1313 *norm = 0.0;
1314 for (k = 0; k < mbs; k++) { /* k_th block row */
1315 for (j = 0; j < bs; j++) sum[j] = 0.0;
1316 /*-- col sum --*/
1317 i = jl[k]; /* first |A(i,k)| to be added */
1318 /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1319 at step k */
1320 while (i < mbs) {
1321 nexti = jl[i]; /* next block row to be added */
1322 ik = il[i]; /* block index of A(i,k) in the array a */
1323 for (j = 0; j < bs; j++) {
1324 v = a->a + ik * bs2 + j * bs;
1325 for (k1 = 0; k1 < bs; k1++) {
1326 sum[j] += PetscAbsScalar(*v);
1327 v++;
1328 }
1329 }
1330 /* update il, jl */
1331 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1332 jmax = a->i[i + 1];
1333 if (jmin < jmax) {
1334 il[i] = jmin;
1335 j = a->j[jmin];
1336 jl[i] = jl[j];
1337 jl[j] = i;
1338 }
1339 i = nexti;
1340 }
1341 /*-- row sum --*/
1342 jmin = a->i[k];
1343 jmax = a->i[k + 1];
1344 for (i = jmin; i < jmax; i++) {
1345 for (j = 0; j < bs; j++) {
1346 v = a->a + i * bs2 + j;
1347 for (k1 = 0; k1 < bs; k1++) {
1348 sum[j] += PetscAbsScalar(*v);
1349 v += bs;
1350 }
1351 }
1352 }
1353 /* add k_th block row to il, jl */
1354 col = aj + jmin;
1355 if (jmax - jmin > 0 && *col == k) jmin++;
1356 if (jmin < jmax) {
1357 il[k] = jmin;
1358 j = a->j[jmin];
1359 jl[k] = jl[j];
1360 jl[j] = k;
1361 }
1362 for (j = 0; j < bs; j++) {
1363 if (sum[j] > *norm) *norm = sum[j];
1364 }
1365 }
1366 PetscCall(PetscFree3(sum, il, jl));
1367 PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1368 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1369 PetscFunctionReturn(PETSC_SUCCESS);
1370 }
1371
MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)1372 PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1373 {
1374 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1375
1376 PetscFunctionBegin;
1377 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1378 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1379 *flg = PETSC_FALSE;
1380 PetscFunctionReturn(PETSC_SUCCESS);
1381 }
1382
1383 /* if the a->i are the same */
1384 PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1385 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1386
1387 /* if a->j are the same */
1388 PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1389 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1390
1391 /* if a->a are the same */
1392 PetscCall(PetscArraycmp(a->a, b->a, a->nz * A->rmap->bs * A->rmap->bs, flg));
1393 PetscFunctionReturn(PETSC_SUCCESS);
1394 }
1395
MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)1396 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1397 {
1398 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1399 PetscInt n;
1400 const PetscInt bs = A->rmap->bs, ambs = a->mbs, bs2 = a->bs2;
1401 PetscScalar *x;
1402 const MatScalar *aa = a->a, *aa_j;
1403 const PetscInt *ai = a->i, *adiag;
1404 PetscBool diagDense;
1405
1406 PetscFunctionBegin;
1407 PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1408 PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, &adiag, &diagDense));
1409 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1410 PetscCall(VecGetArrayWrite(v, &x));
1411 for (PetscInt i = 0; i < ambs; i++) x[i] = 1.0 / aa[adiag[i]];
1412 PetscCall(VecRestoreArrayWrite(v, &x));
1413 PetscFunctionReturn(PETSC_SUCCESS);
1414 }
1415
1416 PetscCall(VecGetLocalSize(v, &n));
1417 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1418 PetscCall(VecGetArrayWrite(v, &x));
1419
1420 if (diagDense) {
1421 for (PetscInt i = 0, row = 0; i < ambs; i++) {
1422 aa_j = aa + adiag[i] * bs2;
1423 for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
1424 }
1425 } else {
1426 for (PetscInt i = 0, row = 0; i < ambs; i++) {
1427 const PetscInt j = adiag[i];
1428
1429 if (j != ai[i + 1]) {
1430 aa_j = aa + j * bs2;
1431 for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
1432 } else {
1433 for (PetscInt k = 0; k < bs; k++) x[row++] = 0.0;
1434 }
1435 }
1436 }
1437 PetscCall(VecRestoreArrayWrite(v, &x));
1438 PetscFunctionReturn(PETSC_SUCCESS);
1439 }
1440
MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)1441 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1442 {
1443 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1444 PetscScalar x;
1445 const PetscScalar *l, *li, *ri;
1446 MatScalar *aa, *v;
1447 PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1448 const PetscInt *ai, *aj;
1449
1450 PetscFunctionBegin;
1451 if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1452 ai = a->i;
1453 aj = a->j;
1454 aa = a->a;
1455 m = A->rmap->N;
1456 bs = A->rmap->bs;
1457 mbs = a->mbs;
1458 bs2 = a->bs2;
1459
1460 PetscCall(VecGetArrayRead(ll, &l));
1461 PetscCall(VecGetLocalSize(ll, &lm));
1462 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1463 for (i = 0; i < mbs; i++) { /* for each block row */
1464 M = ai[i + 1] - ai[i];
1465 li = l + i * bs;
1466 v = aa + bs2 * ai[i];
1467 for (j = 0; j < M; j++) { /* for each block */
1468 ri = l + bs * aj[ai[i] + j];
1469 for (k = 0; k < bs; k++) {
1470 x = ri[k];
1471 for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1472 }
1473 }
1474 }
1475 PetscCall(VecRestoreArrayRead(ll, &l));
1476 PetscCall(PetscLogFlops(2.0 * a->nz));
1477 PetscFunctionReturn(PETSC_SUCCESS);
1478 }
1479
MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo * info)1480 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1481 {
1482 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1483
1484 PetscFunctionBegin;
1485 info->block_size = a->bs2;
1486 info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1487 info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */
1488 info->nz_unneeded = info->nz_allocated - info->nz_used;
1489 info->assemblies = A->num_ass;
1490 info->mallocs = A->info.mallocs;
1491 info->memory = 0; /* REVIEW ME */
1492 if (A->factortype) {
1493 info->fill_ratio_given = A->info.fill_ratio_given;
1494 info->fill_ratio_needed = A->info.fill_ratio_needed;
1495 info->factor_mallocs = A->info.factor_mallocs;
1496 } else {
1497 info->fill_ratio_given = 0;
1498 info->fill_ratio_needed = 0;
1499 info->factor_mallocs = 0;
1500 }
1501 PetscFunctionReturn(PETSC_SUCCESS);
1502 }
1503
MatZeroEntries_SeqSBAIJ(Mat A)1504 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1505 {
1506 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1507
1508 PetscFunctionBegin;
1509 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1510 PetscFunctionReturn(PETSC_SUCCESS);
1511 }
1512
MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])1513 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1514 {
1515 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1516 PetscInt i, j, n, row, col, bs, mbs;
1517 const PetscInt *ai, *aj;
1518 PetscReal atmp;
1519 const MatScalar *aa;
1520 PetscScalar *x;
1521 PetscInt ncols, brow, bcol, krow, kcol;
1522
1523 PetscFunctionBegin;
1524 PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1525 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1526 bs = A->rmap->bs;
1527 aa = a->a;
1528 ai = a->i;
1529 aj = a->j;
1530 mbs = a->mbs;
1531
1532 PetscCall(VecSet(v, 0.0));
1533 PetscCall(VecGetArray(v, &x));
1534 PetscCall(VecGetLocalSize(v, &n));
1535 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1536 for (i = 0; i < mbs; i++) {
1537 ncols = ai[1] - ai[0];
1538 ai++;
1539 brow = bs * i;
1540 for (j = 0; j < ncols; j++) {
1541 bcol = bs * (*aj);
1542 for (kcol = 0; kcol < bs; kcol++) {
1543 col = bcol + kcol; /* col index */
1544 for (krow = 0; krow < bs; krow++) {
1545 atmp = PetscAbsScalar(*aa);
1546 aa++;
1547 row = brow + krow; /* row index */
1548 if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1549 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1550 }
1551 }
1552 aj++;
1553 }
1554 }
1555 PetscCall(VecRestoreArray(v, &x));
1556 PetscFunctionReturn(PETSC_SUCCESS);
1557 }
1558
MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)1559 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1560 {
1561 PetscFunctionBegin;
1562 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1563 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1564 PetscFunctionReturn(PETSC_SUCCESS);
1565 }
1566
MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1567 static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1568 {
1569 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1570 PetscScalar *z = c;
1571 const PetscScalar *xb;
1572 PetscScalar x1;
1573 const MatScalar *v = a->a, *vv;
1574 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1575 const int aconj = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0;
1576
1577 PetscFunctionBegin;
1578 for (i = 0; i < mbs; i++) {
1579 n = ii[1] - ii[0];
1580 ii++;
1581 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1582 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1583 jj = idx;
1584 vv = v;
1585 for (k = 0; k < cn; k++) {
1586 idx = jj;
1587 v = vv;
1588 for (j = 0; j < n; j++) {
1589 xb = b + (*idx);
1590 x1 = xb[0 + k * bm];
1591 z[0 + k * cm] += v[0] * x1;
1592 if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1593 v += 1;
1594 ++idx;
1595 }
1596 }
1597 z += 1;
1598 }
1599 PetscFunctionReturn(PETSC_SUCCESS);
1600 }
1601
MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1602 static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1603 {
1604 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1605 PetscScalar *z = c;
1606 const PetscScalar *xb;
1607 PetscScalar x1, x2;
1608 const MatScalar *v = a->a, *vv;
1609 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1610
1611 PetscFunctionBegin;
1612 for (i = 0; i < mbs; i++) {
1613 n = ii[1] - ii[0];
1614 ii++;
1615 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1616 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1617 jj = idx;
1618 vv = v;
1619 for (k = 0; k < cn; k++) {
1620 idx = jj;
1621 v = vv;
1622 for (j = 0; j < n; j++) {
1623 xb = b + 2 * (*idx);
1624 x1 = xb[0 + k * bm];
1625 x2 = xb[1 + k * bm];
1626 z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1627 z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1628 if (*idx != i) {
1629 c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1630 c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1631 }
1632 v += 4;
1633 ++idx;
1634 }
1635 }
1636 z += 2;
1637 }
1638 PetscFunctionReturn(PETSC_SUCCESS);
1639 }
1640
MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1641 static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1642 {
1643 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1644 PetscScalar *z = c;
1645 const PetscScalar *xb;
1646 PetscScalar x1, x2, x3;
1647 const MatScalar *v = a->a, *vv;
1648 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1649
1650 PetscFunctionBegin;
1651 for (i = 0; i < mbs; i++) {
1652 n = ii[1] - ii[0];
1653 ii++;
1654 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1655 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1656 jj = idx;
1657 vv = v;
1658 for (k = 0; k < cn; k++) {
1659 idx = jj;
1660 v = vv;
1661 for (j = 0; j < n; j++) {
1662 xb = b + 3 * (*idx);
1663 x1 = xb[0 + k * bm];
1664 x2 = xb[1 + k * bm];
1665 x3 = xb[2 + k * bm];
1666 z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1667 z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1668 z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1669 if (*idx != i) {
1670 c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1671 c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1672 c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1673 }
1674 v += 9;
1675 ++idx;
1676 }
1677 }
1678 z += 3;
1679 }
1680 PetscFunctionReturn(PETSC_SUCCESS);
1681 }
1682
MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1683 static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1684 {
1685 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1686 PetscScalar *z = c;
1687 const PetscScalar *xb;
1688 PetscScalar x1, x2, x3, x4;
1689 const MatScalar *v = a->a, *vv;
1690 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1691
1692 PetscFunctionBegin;
1693 for (i = 0; i < mbs; i++) {
1694 n = ii[1] - ii[0];
1695 ii++;
1696 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1697 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1698 jj = idx;
1699 vv = v;
1700 for (k = 0; k < cn; k++) {
1701 idx = jj;
1702 v = vv;
1703 for (j = 0; j < n; j++) {
1704 xb = b + 4 * (*idx);
1705 x1 = xb[0 + k * bm];
1706 x2 = xb[1 + k * bm];
1707 x3 = xb[2 + k * bm];
1708 x4 = xb[3 + k * bm];
1709 z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1710 z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1711 z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1712 z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1713 if (*idx != i) {
1714 c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1715 c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1716 c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1717 c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1718 }
1719 v += 16;
1720 ++idx;
1721 }
1722 }
1723 z += 4;
1724 }
1725 PetscFunctionReturn(PETSC_SUCCESS);
1726 }
1727
MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1728 static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1729 {
1730 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1731 PetscScalar *z = c;
1732 const PetscScalar *xb;
1733 PetscScalar x1, x2, x3, x4, x5;
1734 const MatScalar *v = a->a, *vv;
1735 PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1736
1737 PetscFunctionBegin;
1738 for (i = 0; i < mbs; i++) {
1739 n = ii[1] - ii[0];
1740 ii++;
1741 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1742 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1743 jj = idx;
1744 vv = v;
1745 for (k = 0; k < cn; k++) {
1746 idx = jj;
1747 v = vv;
1748 for (j = 0; j < n; j++) {
1749 xb = b + 5 * (*idx);
1750 x1 = xb[0 + k * bm];
1751 x2 = xb[1 + k * bm];
1752 x3 = xb[2 + k * bm];
1753 x4 = xb[3 + k * bm];
1754 x5 = xb[4 + k * cm];
1755 z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1756 z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1757 z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1758 z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1759 z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1760 if (*idx != i) {
1761 c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1762 c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1763 c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1764 c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1765 c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1766 }
1767 v += 25;
1768 ++idx;
1769 }
1770 }
1771 z += 5;
1772 }
1773 PetscFunctionReturn(PETSC_SUCCESS);
1774 }
1775
MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)1776 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1777 {
1778 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1779 Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
1780 Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
1781 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1782 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1783 PetscBLASInt bbs, bcn, bbm, bcm;
1784 PetscScalar *z = NULL;
1785 PetscScalar *c, *b;
1786 const MatScalar *v;
1787 const PetscInt *idx, *ii;
1788 PetscScalar _DOne = 1.0;
1789
1790 PetscFunctionBegin;
1791 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1792 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1793 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1794 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1795 b = bd->v;
1796 PetscCall(MatZeroEntries(C));
1797 PetscCall(MatDenseGetArray(C, &c));
1798 switch (bs) {
1799 case 1:
1800 PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1801 break;
1802 case 2:
1803 PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1804 break;
1805 case 3:
1806 PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1807 break;
1808 case 4:
1809 PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1810 break;
1811 case 5:
1812 PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1813 break;
1814 default: /* block sizes larger than 5 by 5 are handled by BLAS */
1815 PetscCall(PetscBLASIntCast(bs, &bbs));
1816 PetscCall(PetscBLASIntCast(cn, &bcn));
1817 PetscCall(PetscBLASIntCast(bm, &bbm));
1818 PetscCall(PetscBLASIntCast(cm, &bcm));
1819 idx = a->j;
1820 v = a->a;
1821 mbs = a->mbs;
1822 ii = a->i;
1823 z = c;
1824 for (i = 0; i < mbs; i++) {
1825 n = ii[1] - ii[0];
1826 ii++;
1827 for (j = 0; j < n; j++) {
1828 if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1829 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1830 v += bs2;
1831 }
1832 z += bs;
1833 }
1834 }
1835 PetscCall(MatDenseRestoreArray(C, &c));
1836 PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1837 PetscFunctionReturn(PETSC_SUCCESS);
1838 }
1839