1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8 #include <immintrin.h>
9 #elif defined(PETSC_HAVE_XMMINTRIN_H)
10 #include <xmmintrin.h>
11 #endif
12
MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)13 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
14 {
15 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
16 PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival;
17 const PetscInt *idx;
18 PetscInt start, end, *ai, *aj, bs;
19 PetscBT table;
20
21 PetscFunctionBegin;
22 m = a->mbs;
23 ai = a->i;
24 aj = a->j;
25 bs = A->rmap->bs;
26
27 PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
28
29 PetscCall(PetscBTCreate(m, &table));
30 PetscCall(PetscMalloc1(m + 1, &nidx));
31
32 for (i = 0; i < is_max; i++) {
33 /* Initialise the two local arrays */
34 isz = 0;
35 PetscCall(PetscBTMemzero(m, table));
36
37 /* Extract the indices, assume there can be duplicate entries */
38 PetscCall(ISGetIndices(is[i], &idx));
39 PetscCall(ISGetLocalSize(is[i], &n));
40
41 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42 for (j = 0; j < n; ++j) {
43 ival = idx[j] / bs; /* convert the indices into block indices */
44 PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
45 if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
46 }
47 PetscCall(ISRestoreIndices(is[i], &idx));
48 PetscCall(ISDestroy(&is[i]));
49
50 k = 0;
51 for (j = 0; j < ov; j++) { /* for each overlap*/
52 n = isz;
53 for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
54 row = nidx[k];
55 start = ai[row];
56 end = ai[row + 1];
57 for (l = start; l < end; l++) {
58 val = aj[l];
59 if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
60 }
61 }
62 }
63 PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
64 }
65 PetscCall(PetscBTDestroy(&table));
66 PetscCall(PetscFree(nidx));
67 PetscFunctionReturn(PETSC_SUCCESS);
68 }
69
MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)70 static PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
71 {
72 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c;
73 PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
74 PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
75 const PetscInt *irow, *icol;
76 PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
77 PetscInt *aj = a->j, *ai = a->i;
78 MatScalar *mat_a;
79 Mat C;
80 PetscBool flag;
81
82 PetscFunctionBegin;
83 PetscCall(ISGetIndices(isrow, &irow));
84 PetscCall(ISGetIndices(iscol, &icol));
85 PetscCall(ISGetLocalSize(isrow, &nrows));
86 PetscCall(ISGetLocalSize(iscol, &ncols));
87
88 PetscCall(PetscCalloc1(1 + oldcols, &smap));
89 ssmap = smap;
90 PetscCall(PetscMalloc1(1 + nrows, &lens));
91 for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
92 /* determine lens of each row */
93 for (i = 0; i < nrows; i++) {
94 kstart = ai[irow[i]];
95 kend = kstart + a->ilen[irow[i]];
96 lens[i] = 0;
97 for (k = kstart; k < kend; k++) {
98 if (ssmap[aj[k]]) lens[i]++;
99 }
100 }
101 /* Create and fill new matrix */
102 if (scall == MAT_REUSE_MATRIX) {
103 c = (Mat_SeqBAIJ *)((*B)->data);
104
105 PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
106 PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
107 PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
108 PetscCall(PetscArrayzero(c->ilen, c->mbs));
109 C = *B;
110 } else {
111 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
112 PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
113 PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
114 PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
115 }
116 c = (Mat_SeqBAIJ *)C->data;
117 for (i = 0; i < nrows; i++) {
118 row = irow[i];
119 kstart = ai[row];
120 kend = kstart + a->ilen[row];
121 mat_i = c->i[i];
122 mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
123 mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
124 mat_ilen = c->ilen + i;
125 for (k = kstart; k < kend; k++) {
126 if ((tcol = ssmap[a->j[k]])) {
127 *mat_j++ = tcol - 1;
128 PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
129 mat_a += bs2;
130 (*mat_ilen)++;
131 }
132 }
133 }
134 /* sort */
135 if (c->j && c->a) {
136 MatScalar *work;
137 PetscCall(PetscMalloc1(bs2, &work));
138 for (i = 0; i < nrows; i++) {
139 PetscInt ilen;
140 mat_i = c->i[i];
141 mat_j = c->j + mat_i;
142 mat_a = c->a + mat_i * bs2;
143 ilen = c->ilen[i];
144 PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
145 }
146 PetscCall(PetscFree(work));
147 }
148
149 /* Free work space */
150 PetscCall(ISRestoreIndices(iscol, &icol));
151 PetscCall(PetscFree(smap));
152 PetscCall(PetscFree(lens));
153 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
154 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
155
156 PetscCall(ISRestoreIndices(isrow, &irow));
157 *B = C;
158 PetscFunctionReturn(PETSC_SUCCESS);
159 }
160
MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)161 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
162 {
163 IS is1, is2;
164
165 PetscFunctionBegin;
166 PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
167 if (isrow == iscol) {
168 is2 = is1;
169 PetscCall(PetscObjectReference((PetscObject)is2));
170 } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
171 PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
172 PetscCall(ISDestroy(&is1));
173 PetscCall(ISDestroy(&is2));
174 PetscFunctionReturn(PETSC_SUCCESS);
175 }
176
MatDestroySubMatrix_SeqBAIJ(Mat C)177 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
178 {
179 Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data;
180 Mat_SubSppt *submatj = c->submatis1;
181
182 PetscFunctionBegin;
183 PetscCall((*submatj->destroy)(C));
184 PetscCall(MatDestroySubMatrix_Private(submatj));
185 PetscFunctionReturn(PETSC_SUCCESS);
186 }
187
188 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat * mat[])189 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
190 {
191 PetscInt i;
192 Mat C;
193 Mat_SeqBAIJ *c;
194 Mat_SubSppt *submatj;
195
196 PetscFunctionBegin;
197 for (i = 0; i < n; i++) {
198 C = (*mat)[i];
199 c = (Mat_SeqBAIJ *)C->data;
200 submatj = c->submatis1;
201 if (submatj) {
202 if (--((PetscObject)C)->refct <= 0) {
203 PetscCall(PetscFree(C->factorprefix));
204 PetscCall((*submatj->destroy)(C));
205 PetscCall(MatDestroySubMatrix_Private(submatj));
206 PetscCall(PetscFree(C->defaultvectype));
207 PetscCall(PetscFree(C->defaultrandtype));
208 PetscCall(PetscLayoutDestroy(&C->rmap));
209 PetscCall(PetscLayoutDestroy(&C->cmap));
210 PetscCall(PetscHeaderDestroy(&C));
211 }
212 } else {
213 PetscCall(MatDestroy(&C));
214 }
215 }
216
217 /* Destroy Dummy submatrices created for reuse */
218 PetscCall(MatDestroySubMatrices_Dummy(n, mat));
219
220 PetscCall(PetscFree(*mat));
221 PetscFunctionReturn(PETSC_SUCCESS);
222 }
223
MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat * B[])224 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
225 {
226 PetscInt i;
227
228 PetscFunctionBegin;
229 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
230
231 for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
232 PetscFunctionReturn(PETSC_SUCCESS);
233 }
234
235 /* Should check that shapes of vectors and matrices match */
MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)236 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
237 {
238 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
239 PetscScalar *z, sum;
240 const PetscScalar *x;
241 const MatScalar *v;
242 PetscInt mbs, i, n;
243 const PetscInt *idx, *ii, *ridx = NULL;
244 PetscBool usecprow = a->compressedrow.use;
245
246 PetscFunctionBegin;
247 PetscCall(VecGetArrayRead(xx, &x));
248 PetscCall(VecGetArrayWrite(zz, &z));
249
250 if (usecprow) {
251 mbs = a->compressedrow.nrows;
252 ii = a->compressedrow.i;
253 ridx = a->compressedrow.rindex;
254 PetscCall(PetscArrayzero(z, a->mbs));
255 } else {
256 mbs = a->mbs;
257 ii = a->i;
258 }
259
260 for (i = 0; i < mbs; i++) {
261 n = ii[1] - ii[0];
262 v = a->a + ii[0];
263 idx = a->j + ii[0];
264 ii++;
265 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
266 PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
267 sum = 0.0;
268 PetscSparseDensePlusDot(sum, x, v, idx, n);
269 if (usecprow) {
270 z[ridx[i]] = sum;
271 } else {
272 z[i] = sum;
273 }
274 }
275 PetscCall(VecRestoreArrayRead(xx, &x));
276 PetscCall(VecRestoreArrayWrite(zz, &z));
277 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
278 PetscFunctionReturn(PETSC_SUCCESS);
279 }
280
MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
282 {
283 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
284 PetscScalar *z = NULL, sum1, sum2, *zarray;
285 const PetscScalar *x, *xb;
286 PetscScalar x1, x2;
287 const MatScalar *v;
288 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
289 PetscBool usecprow = a->compressedrow.use;
290
291 PetscFunctionBegin;
292 PetscCall(VecGetArrayRead(xx, &x));
293 PetscCall(VecGetArrayWrite(zz, &zarray));
294
295 idx = a->j;
296 v = a->a;
297 if (usecprow) {
298 mbs = a->compressedrow.nrows;
299 ii = a->compressedrow.i;
300 ridx = a->compressedrow.rindex;
301 PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
302 } else {
303 mbs = a->mbs;
304 ii = a->i;
305 z = zarray;
306 }
307
308 for (i = 0; i < mbs; i++) {
309 n = ii[1] - ii[0];
310 ii++;
311 sum1 = 0.0;
312 sum2 = 0.0;
313 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
314 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
315 for (j = 0; j < n; j++) {
316 xb = x + 2 * (*idx++);
317 x1 = xb[0];
318 x2 = xb[1];
319 sum1 += v[0] * x1 + v[2] * x2;
320 sum2 += v[1] * x1 + v[3] * x2;
321 v += 4;
322 }
323 if (usecprow) z = zarray + 2 * ridx[i];
324 z[0] = sum1;
325 z[1] = sum2;
326 if (!usecprow) z += 2;
327 }
328 PetscCall(VecRestoreArrayRead(xx, &x));
329 PetscCall(VecRestoreArrayWrite(zz, &zarray));
330 PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
331 PetscFunctionReturn(PETSC_SUCCESS);
332 }
333
MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)334 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
335 {
336 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
337 PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
338 const PetscScalar *x, *xb;
339 const MatScalar *v;
340 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
341 PetscBool usecprow = a->compressedrow.use;
342
343 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
344 #pragma disjoint(*v, *z, *xb)
345 #endif
346
347 PetscFunctionBegin;
348 PetscCall(VecGetArrayRead(xx, &x));
349 PetscCall(VecGetArrayWrite(zz, &zarray));
350
351 idx = a->j;
352 v = a->a;
353 if (usecprow) {
354 mbs = a->compressedrow.nrows;
355 ii = a->compressedrow.i;
356 ridx = a->compressedrow.rindex;
357 PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
358 } else {
359 mbs = a->mbs;
360 ii = a->i;
361 z = zarray;
362 }
363
364 for (i = 0; i < mbs; i++) {
365 n = ii[1] - ii[0];
366 ii++;
367 sum1 = 0.0;
368 sum2 = 0.0;
369 sum3 = 0.0;
370 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
371 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
372 for (j = 0; j < n; j++) {
373 xb = x + 3 * (*idx++);
374 x1 = xb[0];
375 x2 = xb[1];
376 x3 = xb[2];
377
378 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
379 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
380 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
381 v += 9;
382 }
383 if (usecprow) z = zarray + 3 * ridx[i];
384 z[0] = sum1;
385 z[1] = sum2;
386 z[2] = sum3;
387 if (!usecprow) z += 3;
388 }
389 PetscCall(VecRestoreArrayRead(xx, &x));
390 PetscCall(VecRestoreArrayWrite(zz, &zarray));
391 PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
392 PetscFunctionReturn(PETSC_SUCCESS);
393 }
394
MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)395 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
396 {
397 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
398 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
399 const PetscScalar *x, *xb;
400 const MatScalar *v;
401 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
402 PetscBool usecprow = a->compressedrow.use;
403
404 PetscFunctionBegin;
405 PetscCall(VecGetArrayRead(xx, &x));
406 PetscCall(VecGetArrayWrite(zz, &zarray));
407
408 idx = a->j;
409 v = a->a;
410 if (usecprow) {
411 mbs = a->compressedrow.nrows;
412 ii = a->compressedrow.i;
413 ridx = a->compressedrow.rindex;
414 PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
415 } else {
416 mbs = a->mbs;
417 ii = a->i;
418 z = zarray;
419 }
420
421 for (i = 0; i < mbs; i++) {
422 n = ii[1] - ii[0];
423 ii++;
424 sum1 = 0.0;
425 sum2 = 0.0;
426 sum3 = 0.0;
427 sum4 = 0.0;
428
429 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
430 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
431 for (j = 0; j < n; j++) {
432 xb = x + 4 * (*idx++);
433 x1 = xb[0];
434 x2 = xb[1];
435 x3 = xb[2];
436 x4 = xb[3];
437 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
438 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
439 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
440 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
441 v += 16;
442 }
443 if (usecprow) z = zarray + 4 * ridx[i];
444 z[0] = sum1;
445 z[1] = sum2;
446 z[2] = sum3;
447 z[3] = sum4;
448 if (!usecprow) z += 4;
449 }
450 PetscCall(VecRestoreArrayRead(xx, &x));
451 PetscCall(VecRestoreArrayWrite(zz, &zarray));
452 PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
453 PetscFunctionReturn(PETSC_SUCCESS);
454 }
455
MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)456 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
457 {
458 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
459 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
460 const PetscScalar *xb, *x;
461 const MatScalar *v;
462 const PetscInt *idx, *ii, *ridx = NULL;
463 PetscInt mbs, i, j, n;
464 PetscBool usecprow = a->compressedrow.use;
465
466 PetscFunctionBegin;
467 PetscCall(VecGetArrayRead(xx, &x));
468 PetscCall(VecGetArrayWrite(zz, &zarray));
469
470 idx = a->j;
471 v = a->a;
472 if (usecprow) {
473 mbs = a->compressedrow.nrows;
474 ii = a->compressedrow.i;
475 ridx = a->compressedrow.rindex;
476 PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
477 } else {
478 mbs = a->mbs;
479 ii = a->i;
480 z = zarray;
481 }
482
483 for (i = 0; i < mbs; i++) {
484 n = ii[1] - ii[0];
485 ii++;
486 sum1 = 0.0;
487 sum2 = 0.0;
488 sum3 = 0.0;
489 sum4 = 0.0;
490 sum5 = 0.0;
491 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
492 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
493 for (j = 0; j < n; j++) {
494 xb = x + 5 * (*idx++);
495 x1 = xb[0];
496 x2 = xb[1];
497 x3 = xb[2];
498 x4 = xb[3];
499 x5 = xb[4];
500 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
501 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
502 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
503 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
504 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
505 v += 25;
506 }
507 if (usecprow) z = zarray + 5 * ridx[i];
508 z[0] = sum1;
509 z[1] = sum2;
510 z[2] = sum3;
511 z[3] = sum4;
512 z[4] = sum5;
513 if (!usecprow) z += 5;
514 }
515 PetscCall(VecRestoreArrayRead(xx, &x));
516 PetscCall(VecRestoreArrayWrite(zz, &zarray));
517 PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
518 PetscFunctionReturn(PETSC_SUCCESS);
519 }
520
MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)521 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
522 {
523 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
524 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
525 const PetscScalar *x, *xb;
526 PetscScalar x1, x2, x3, x4, x5, x6, *zarray;
527 const MatScalar *v;
528 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
529 PetscBool usecprow = a->compressedrow.use;
530
531 PetscFunctionBegin;
532 PetscCall(VecGetArrayRead(xx, &x));
533 PetscCall(VecGetArrayWrite(zz, &zarray));
534
535 idx = a->j;
536 v = a->a;
537 if (usecprow) {
538 mbs = a->compressedrow.nrows;
539 ii = a->compressedrow.i;
540 ridx = a->compressedrow.rindex;
541 PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
542 } else {
543 mbs = a->mbs;
544 ii = a->i;
545 z = zarray;
546 }
547
548 for (i = 0; i < mbs; i++) {
549 n = ii[1] - ii[0];
550 ii++;
551 sum1 = 0.0;
552 sum2 = 0.0;
553 sum3 = 0.0;
554 sum4 = 0.0;
555 sum5 = 0.0;
556 sum6 = 0.0;
557
558 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
559 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
560 for (j = 0; j < n; j++) {
561 xb = x + 6 * (*idx++);
562 x1 = xb[0];
563 x2 = xb[1];
564 x3 = xb[2];
565 x4 = xb[3];
566 x5 = xb[4];
567 x6 = xb[5];
568 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
569 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
570 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
571 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
572 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
573 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
574 v += 36;
575 }
576 if (usecprow) z = zarray + 6 * ridx[i];
577 z[0] = sum1;
578 z[1] = sum2;
579 z[2] = sum3;
580 z[3] = sum4;
581 z[4] = sum5;
582 z[5] = sum6;
583 if (!usecprow) z += 6;
584 }
585
586 PetscCall(VecRestoreArrayRead(xx, &x));
587 PetscCall(VecRestoreArrayWrite(zz, &zarray));
588 PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
589 PetscFunctionReturn(PETSC_SUCCESS);
590 }
591
MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)592 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
593 {
594 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
595 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
596 const PetscScalar *x, *xb;
597 PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray;
598 const MatScalar *v;
599 PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
600 PetscBool usecprow = a->compressedrow.use;
601
602 PetscFunctionBegin;
603 PetscCall(VecGetArrayRead(xx, &x));
604 PetscCall(VecGetArrayWrite(zz, &zarray));
605
606 idx = a->j;
607 v = a->a;
608 if (usecprow) {
609 mbs = a->compressedrow.nrows;
610 ii = a->compressedrow.i;
611 ridx = a->compressedrow.rindex;
612 PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
613 } else {
614 mbs = a->mbs;
615 ii = a->i;
616 z = zarray;
617 }
618
619 for (i = 0; i < mbs; i++) {
620 n = ii[1] - ii[0];
621 ii++;
622 sum1 = 0.0;
623 sum2 = 0.0;
624 sum3 = 0.0;
625 sum4 = 0.0;
626 sum5 = 0.0;
627 sum6 = 0.0;
628 sum7 = 0.0;
629
630 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
631 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
632 for (j = 0; j < n; j++) {
633 xb = x + 7 * (*idx++);
634 x1 = xb[0];
635 x2 = xb[1];
636 x3 = xb[2];
637 x4 = xb[3];
638 x5 = xb[4];
639 x6 = xb[5];
640 x7 = xb[6];
641 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
642 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
643 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
644 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
645 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
646 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
647 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
648 v += 49;
649 }
650 if (usecprow) z = zarray + 7 * ridx[i];
651 z[0] = sum1;
652 z[1] = sum2;
653 z[2] = sum3;
654 z[3] = sum4;
655 z[4] = sum5;
656 z[5] = sum6;
657 z[6] = sum7;
658 if (!usecprow) z += 7;
659 }
660
661 PetscCall(VecRestoreArrayRead(xx, &x));
662 PetscCall(VecRestoreArrayWrite(zz, &zarray));
663 PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
664 PetscFunctionReturn(PETSC_SUCCESS);
665 }
666
667 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)668 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
669 {
670 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
671 PetscScalar *z = NULL, *work, *workt, *zarray;
672 const PetscScalar *x, *xb;
673 const MatScalar *v;
674 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
675 const PetscInt *idx, *ii, *ridx = NULL;
676 PetscInt k;
677 PetscBool usecprow = a->compressedrow.use;
678
679 __m256d a0, a1, a2, a3, a4, a5;
680 __m256d w0, w1, w2, w3;
681 __m256d z0, z1, z2;
682 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
683
684 PetscFunctionBegin;
685 PetscCall(VecGetArrayRead(xx, &x));
686 PetscCall(VecGetArrayWrite(zz, &zarray));
687
688 idx = a->j;
689 v = a->a;
690 if (usecprow) {
691 mbs = a->compressedrow.nrows;
692 ii = a->compressedrow.i;
693 ridx = a->compressedrow.rindex;
694 PetscCall(PetscArrayzero(zarray, bs * a->mbs));
695 } else {
696 mbs = a->mbs;
697 ii = a->i;
698 z = zarray;
699 }
700
701 if (!a->mult_work) {
702 k = PetscMax(A->rmap->n, A->cmap->n);
703 PetscCall(PetscMalloc1(k + 1, &a->mult_work));
704 }
705
706 work = a->mult_work;
707 for (i = 0; i < mbs; i++) {
708 n = ii[1] - ii[0];
709 ii++;
710 workt = work;
711 for (j = 0; j < n; j++) {
712 xb = x + bs * (*idx++);
713 for (k = 0; k < bs; k++) workt[k] = xb[k];
714 workt += bs;
715 }
716 if (usecprow) z = zarray + bs * ridx[i];
717
718 z0 = _mm256_setzero_pd();
719 z1 = _mm256_setzero_pd();
720 z2 = _mm256_setzero_pd();
721
722 for (j = 0; j < n; j++) {
723 /* first column of a */
724 w0 = _mm256_set1_pd(work[j * 9]);
725 a0 = _mm256_loadu_pd(&v[j * 81]);
726 z0 = _mm256_fmadd_pd(a0, w0, z0);
727 a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
728 z1 = _mm256_fmadd_pd(a1, w0, z1);
729 a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
730 z2 = _mm256_fmadd_pd(a2, w0, z2);
731
732 /* second column of a */
733 w1 = _mm256_set1_pd(work[j * 9 + 1]);
734 a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
735 z0 = _mm256_fmadd_pd(a0, w1, z0);
736 a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
737 z1 = _mm256_fmadd_pd(a1, w1, z1);
738 a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
739 z2 = _mm256_fmadd_pd(a2, w1, z2);
740
741 /* third column of a */
742 w2 = _mm256_set1_pd(work[j * 9 + 2]);
743 a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
744 z0 = _mm256_fmadd_pd(a3, w2, z0);
745 a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
746 z1 = _mm256_fmadd_pd(a4, w2, z1);
747 a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
748 z2 = _mm256_fmadd_pd(a5, w2, z2);
749
750 /* fourth column of a */
751 w3 = _mm256_set1_pd(work[j * 9 + 3]);
752 a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
753 z0 = _mm256_fmadd_pd(a0, w3, z0);
754 a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
755 z1 = _mm256_fmadd_pd(a1, w3, z1);
756 a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
757 z2 = _mm256_fmadd_pd(a2, w3, z2);
758
759 /* fifth column of a */
760 w0 = _mm256_set1_pd(work[j * 9 + 4]);
761 a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
762 z0 = _mm256_fmadd_pd(a3, w0, z0);
763 a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
764 z1 = _mm256_fmadd_pd(a4, w0, z1);
765 a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
766 z2 = _mm256_fmadd_pd(a5, w0, z2);
767
768 /* sixth column of a */
769 w1 = _mm256_set1_pd(work[j * 9 + 5]);
770 a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
771 z0 = _mm256_fmadd_pd(a0, w1, z0);
772 a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
773 z1 = _mm256_fmadd_pd(a1, w1, z1);
774 a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
775 z2 = _mm256_fmadd_pd(a2, w1, z2);
776
777 /* seventh column of a */
778 w2 = _mm256_set1_pd(work[j * 9 + 6]);
779 a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
780 z0 = _mm256_fmadd_pd(a0, w2, z0);
781 a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
782 z1 = _mm256_fmadd_pd(a1, w2, z1);
783 a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
784 z2 = _mm256_fmadd_pd(a2, w2, z2);
785
786 /* eighth column of a */
787 w3 = _mm256_set1_pd(work[j * 9 + 7]);
788 a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
789 z0 = _mm256_fmadd_pd(a3, w3, z0);
790 a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
791 z1 = _mm256_fmadd_pd(a4, w3, z1);
792 a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
793 z2 = _mm256_fmadd_pd(a5, w3, z2);
794
795 /* ninth column of a */
796 w0 = _mm256_set1_pd(work[j * 9 + 8]);
797 a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
798 z0 = _mm256_fmadd_pd(a0, w0, z0);
799 a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
800 z1 = _mm256_fmadd_pd(a1, w0, z1);
801 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
802 z2 = _mm256_fmadd_pd(a2, w0, z2);
803 }
804
805 _mm256_storeu_pd(&z[0], z0);
806 _mm256_storeu_pd(&z[4], z1);
807 _mm256_maskstore_pd(&z[8], mask1, z2);
808
809 v += n * bs2;
810 if (!usecprow) z += bs;
811 }
812 PetscCall(VecRestoreArrayRead(xx, &x));
813 PetscCall(VecRestoreArrayWrite(zz, &zarray));
814 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
815 PetscFunctionReturn(PETSC_SUCCESS);
816 }
817 #endif
818
MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)819 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
820 {
821 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
822 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
823 const PetscScalar *x, *xb;
824 PetscScalar *zarray, xv;
825 const MatScalar *v;
826 const PetscInt *ii, *ij = a->j, *idx;
827 PetscInt mbs, i, j, k, n, *ridx = NULL;
828 PetscBool usecprow = a->compressedrow.use;
829
830 PetscFunctionBegin;
831 PetscCall(VecGetArrayRead(xx, &x));
832 PetscCall(VecGetArrayWrite(zz, &zarray));
833
834 v = a->a;
835 if (usecprow) {
836 mbs = a->compressedrow.nrows;
837 ii = a->compressedrow.i;
838 ridx = a->compressedrow.rindex;
839 PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
840 } else {
841 mbs = a->mbs;
842 ii = a->i;
843 z = zarray;
844 }
845
846 for (i = 0; i < mbs; i++) {
847 n = ii[i + 1] - ii[i];
848 idx = ij + ii[i];
849 sum1 = 0.0;
850 sum2 = 0.0;
851 sum3 = 0.0;
852 sum4 = 0.0;
853 sum5 = 0.0;
854 sum6 = 0.0;
855 sum7 = 0.0;
856 sum8 = 0.0;
857 sum9 = 0.0;
858 sum10 = 0.0;
859 sum11 = 0.0;
860
861 for (j = 0; j < n; j++) {
862 xb = x + 11 * (idx[j]);
863
864 for (k = 0; k < 11; k++) {
865 xv = xb[k];
866 sum1 += v[0] * xv;
867 sum2 += v[1] * xv;
868 sum3 += v[2] * xv;
869 sum4 += v[3] * xv;
870 sum5 += v[4] * xv;
871 sum6 += v[5] * xv;
872 sum7 += v[6] * xv;
873 sum8 += v[7] * xv;
874 sum9 += v[8] * xv;
875 sum10 += v[9] * xv;
876 sum11 += v[10] * xv;
877 v += 11;
878 }
879 }
880 if (usecprow) z = zarray + 11 * ridx[i];
881 z[0] = sum1;
882 z[1] = sum2;
883 z[2] = sum3;
884 z[3] = sum4;
885 z[4] = sum5;
886 z[5] = sum6;
887 z[6] = sum7;
888 z[7] = sum8;
889 z[8] = sum9;
890 z[9] = sum10;
891 z[10] = sum11;
892
893 if (!usecprow) z += 11;
894 }
895
896 PetscCall(VecRestoreArrayRead(xx, &x));
897 PetscCall(VecRestoreArrayWrite(zz, &zarray));
898 PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
899 PetscFunctionReturn(PETSC_SUCCESS);
900 }
901
902 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)903 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
904 {
905 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
906 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
907 const PetscScalar *x, *xb;
908 PetscScalar *zarray, xv;
909 const MatScalar *v;
910 const PetscInt *ii, *ij = a->j, *idx;
911 PetscInt mbs, i, j, k, n, *ridx = NULL;
912 PetscBool usecprow = a->compressedrow.use;
913
914 PetscFunctionBegin;
915 PetscCall(VecGetArrayRead(xx, &x));
916 PetscCall(VecGetArrayWrite(zz, &zarray));
917
918 v = a->a;
919 if (usecprow) {
920 mbs = a->compressedrow.nrows;
921 ii = a->compressedrow.i;
922 ridx = a->compressedrow.rindex;
923 PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
924 } else {
925 mbs = a->mbs;
926 ii = a->i;
927 z = zarray;
928 }
929
930 for (i = 0; i < mbs; i++) {
931 n = ii[i + 1] - ii[i];
932 idx = ij + ii[i];
933 sum1 = 0.0;
934 sum2 = 0.0;
935 sum3 = 0.0;
936 sum4 = 0.0;
937 sum5 = 0.0;
938 sum6 = 0.0;
939 sum7 = 0.0;
940 sum8 = 0.0;
941 sum9 = 0.0;
942 sum10 = 0.0;
943 sum11 = 0.0;
944 sum12 = 0.0;
945
946 for (j = 0; j < n; j++) {
947 xb = x + 12 * (idx[j]);
948
949 for (k = 0; k < 12; k++) {
950 xv = xb[k];
951 sum1 += v[0] * xv;
952 sum2 += v[1] * xv;
953 sum3 += v[2] * xv;
954 sum4 += v[3] * xv;
955 sum5 += v[4] * xv;
956 sum6 += v[5] * xv;
957 sum7 += v[6] * xv;
958 sum8 += v[7] * xv;
959 sum9 += v[8] * xv;
960 sum10 += v[9] * xv;
961 sum11 += v[10] * xv;
962 sum12 += v[11] * xv;
963 v += 12;
964 }
965 }
966 if (usecprow) z = zarray + 12 * ridx[i];
967 z[0] = sum1;
968 z[1] = sum2;
969 z[2] = sum3;
970 z[3] = sum4;
971 z[4] = sum5;
972 z[5] = sum6;
973 z[6] = sum7;
974 z[7] = sum8;
975 z[8] = sum9;
976 z[9] = sum10;
977 z[10] = sum11;
978 z[11] = sum12;
979 if (!usecprow) z += 12;
980 }
981 PetscCall(VecRestoreArrayRead(xx, &x));
982 PetscCall(VecRestoreArrayWrite(zz, &zarray));
983 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
984 PetscFunctionReturn(PETSC_SUCCESS);
985 }
986
MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)987 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
988 {
989 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
990 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
991 const PetscScalar *x, *xb;
992 PetscScalar *zarray, *yarray, xv;
993 const MatScalar *v;
994 const PetscInt *ii, *ij = a->j, *idx;
995 PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL;
996 PetscBool usecprow = a->compressedrow.use;
997
998 PetscFunctionBegin;
999 PetscCall(VecGetArrayRead(xx, &x));
1000 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1001
1002 v = a->a;
1003 if (usecprow) {
1004 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1005 mbs = a->compressedrow.nrows;
1006 ii = a->compressedrow.i;
1007 ridx = a->compressedrow.rindex;
1008 } else {
1009 ii = a->i;
1010 y = yarray;
1011 z = zarray;
1012 }
1013
1014 for (i = 0; i < mbs; i++) {
1015 n = ii[i + 1] - ii[i];
1016 idx = ij + ii[i];
1017
1018 if (usecprow) {
1019 y = yarray + 12 * ridx[i];
1020 z = zarray + 12 * ridx[i];
1021 }
1022 sum1 = y[0];
1023 sum2 = y[1];
1024 sum3 = y[2];
1025 sum4 = y[3];
1026 sum5 = y[4];
1027 sum6 = y[5];
1028 sum7 = y[6];
1029 sum8 = y[7];
1030 sum9 = y[8];
1031 sum10 = y[9];
1032 sum11 = y[10];
1033 sum12 = y[11];
1034
1035 for (j = 0; j < n; j++) {
1036 xb = x + 12 * (idx[j]);
1037
1038 for (k = 0; k < 12; k++) {
1039 xv = xb[k];
1040 sum1 += v[0] * xv;
1041 sum2 += v[1] * xv;
1042 sum3 += v[2] * xv;
1043 sum4 += v[3] * xv;
1044 sum5 += v[4] * xv;
1045 sum6 += v[5] * xv;
1046 sum7 += v[6] * xv;
1047 sum8 += v[7] * xv;
1048 sum9 += v[8] * xv;
1049 sum10 += v[9] * xv;
1050 sum11 += v[10] * xv;
1051 sum12 += v[11] * xv;
1052 v += 12;
1053 }
1054 }
1055
1056 z[0] = sum1;
1057 z[1] = sum2;
1058 z[2] = sum3;
1059 z[3] = sum4;
1060 z[4] = sum5;
1061 z[5] = sum6;
1062 z[6] = sum7;
1063 z[7] = sum8;
1064 z[8] = sum9;
1065 z[9] = sum10;
1066 z[10] = sum11;
1067 z[11] = sum12;
1068 if (!usecprow) {
1069 y += 12;
1070 z += 12;
1071 }
1072 }
1073 PetscCall(VecRestoreArrayRead(xx, &x));
1074 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1075 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1076 PetscFunctionReturn(PETSC_SUCCESS);
1077 }
1078
1079 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)1080 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1081 {
1082 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1083 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1084 const PetscScalar *x, *xb;
1085 PetscScalar x1, x2, x3, x4, *zarray;
1086 const MatScalar *v;
1087 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1088 PetscInt mbs, i, j, n;
1089 PetscBool usecprow = a->compressedrow.use;
1090
1091 PetscFunctionBegin;
1092 PetscCall(VecGetArrayRead(xx, &x));
1093 PetscCall(VecGetArrayWrite(zz, &zarray));
1094
1095 v = a->a;
1096 if (usecprow) {
1097 mbs = a->compressedrow.nrows;
1098 ii = a->compressedrow.i;
1099 ridx = a->compressedrow.rindex;
1100 PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1101 } else {
1102 mbs = a->mbs;
1103 ii = a->i;
1104 z = zarray;
1105 }
1106
1107 for (i = 0; i < mbs; i++) {
1108 n = ii[i + 1] - ii[i];
1109 idx = ij + ii[i];
1110
1111 sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1112 for (j = 0; j < n; j++) {
1113 xb = x + 12 * (idx[j]);
1114 x1 = xb[0];
1115 x2 = xb[1];
1116 x3 = xb[2];
1117 x4 = xb[3];
1118
1119 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1120 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1121 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1122 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1123 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1124 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1125 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1126 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1127 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1128 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1129 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1130 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1131 v += 48;
1132
1133 x1 = xb[4];
1134 x2 = xb[5];
1135 x3 = xb[6];
1136 x4 = xb[7];
1137
1138 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1139 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1140 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1141 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1142 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1143 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1144 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1145 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1146 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1147 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1148 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1149 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1150 v += 48;
1151
1152 x1 = xb[8];
1153 x2 = xb[9];
1154 x3 = xb[10];
1155 x4 = xb[11];
1156 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1157 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1158 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1159 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1160 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1161 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1162 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1163 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1164 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1165 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1166 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1167 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1168 v += 48;
1169 }
1170 if (usecprow) z = zarray + 12 * ridx[i];
1171 z[0] = sum1;
1172 z[1] = sum2;
1173 z[2] = sum3;
1174 z[3] = sum4;
1175 z[4] = sum5;
1176 z[5] = sum6;
1177 z[6] = sum7;
1178 z[7] = sum8;
1179 z[8] = sum9;
1180 z[9] = sum10;
1181 z[10] = sum11;
1182 z[11] = sum12;
1183 if (!usecprow) z += 12;
1184 }
1185 PetscCall(VecRestoreArrayRead(xx, &x));
1186 PetscCall(VecRestoreArrayWrite(zz, &zarray));
1187 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1188 PetscFunctionReturn(PETSC_SUCCESS);
1189 }
1190
1191 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)1192 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1193 {
1194 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1195 PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1196 const PetscScalar *x, *xb;
1197 PetscScalar x1, x2, x3, x4, *zarray, *yarray;
1198 const MatScalar *v;
1199 const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1200 PetscInt mbs = a->mbs, i, j, n;
1201 PetscBool usecprow = a->compressedrow.use;
1202
1203 PetscFunctionBegin;
1204 PetscCall(VecGetArrayRead(xx, &x));
1205 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1206
1207 v = a->a;
1208 if (usecprow) {
1209 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1210 mbs = a->compressedrow.nrows;
1211 ii = a->compressedrow.i;
1212 ridx = a->compressedrow.rindex;
1213 } else {
1214 ii = a->i;
1215 y = yarray;
1216 z = zarray;
1217 }
1218
1219 for (i = 0; i < mbs; i++) {
1220 n = ii[i + 1] - ii[i];
1221 idx = ij + ii[i];
1222
1223 if (usecprow) {
1224 y = yarray + 12 * ridx[i];
1225 z = zarray + 12 * ridx[i];
1226 }
1227 sum1 = y[0];
1228 sum2 = y[1];
1229 sum3 = y[2];
1230 sum4 = y[3];
1231 sum5 = y[4];
1232 sum6 = y[5];
1233 sum7 = y[6];
1234 sum8 = y[7];
1235 sum9 = y[8];
1236 sum10 = y[9];
1237 sum11 = y[10];
1238 sum12 = y[11];
1239
1240 for (j = 0; j < n; j++) {
1241 xb = x + 12 * (idx[j]);
1242 x1 = xb[0];
1243 x2 = xb[1];
1244 x3 = xb[2];
1245 x4 = xb[3];
1246
1247 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1248 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1249 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1250 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1251 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1252 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1253 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1254 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1255 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1256 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1257 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1258 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1259 v += 48;
1260
1261 x1 = xb[4];
1262 x2 = xb[5];
1263 x3 = xb[6];
1264 x4 = xb[7];
1265
1266 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1267 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1268 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1269 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1270 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1271 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1272 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1273 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1274 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1275 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1276 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1277 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1278 v += 48;
1279
1280 x1 = xb[8];
1281 x2 = xb[9];
1282 x3 = xb[10];
1283 x4 = xb[11];
1284 sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1285 sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1286 sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1287 sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1288 sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1289 sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1290 sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1291 sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1292 sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1293 sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1294 sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1295 sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1296 v += 48;
1297 }
1298 z[0] = sum1;
1299 z[1] = sum2;
1300 z[2] = sum3;
1301 z[3] = sum4;
1302 z[4] = sum5;
1303 z[5] = sum6;
1304 z[6] = sum7;
1305 z[7] = sum8;
1306 z[8] = sum9;
1307 z[9] = sum10;
1308 z[10] = sum11;
1309 z[11] = sum12;
1310 if (!usecprow) {
1311 y += 12;
1312 z += 12;
1313 }
1314 }
1315 PetscCall(VecRestoreArrayRead(xx, &x));
1316 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1317 PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1318 PetscFunctionReturn(PETSC_SUCCESS);
1319 }
1320
1321 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)1322 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1323 {
1324 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1325 PetscScalar *z = NULL, *zarray;
1326 const PetscScalar *x, *work;
1327 const MatScalar *v = a->a;
1328 PetscInt mbs, i, j, n;
1329 const PetscInt *idx = a->j, *ii, *ridx = NULL;
1330 PetscBool usecprow = a->compressedrow.use;
1331 const PetscInt bs = 12, bs2 = 144;
1332
1333 __m256d a0, a1, a2, a3, a4, a5;
1334 __m256d w0, w1, w2, w3;
1335 __m256d z0, z1, z2;
1336
1337 PetscFunctionBegin;
1338 PetscCall(VecGetArrayRead(xx, &x));
1339 PetscCall(VecGetArrayWrite(zz, &zarray));
1340
1341 if (usecprow) {
1342 mbs = a->compressedrow.nrows;
1343 ii = a->compressedrow.i;
1344 ridx = a->compressedrow.rindex;
1345 PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1346 } else {
1347 mbs = a->mbs;
1348 ii = a->i;
1349 z = zarray;
1350 }
1351
1352 for (i = 0; i < mbs; i++) {
1353 z0 = _mm256_setzero_pd();
1354 z1 = _mm256_setzero_pd();
1355 z2 = _mm256_setzero_pd();
1356
1357 n = ii[1] - ii[0];
1358 ii++;
1359 for (j = 0; j < n; j++) {
1360 work = x + bs * (*idx++);
1361
1362 /* first column of a */
1363 w0 = _mm256_set1_pd(work[0]);
1364 a0 = _mm256_loadu_pd(v + 0);
1365 z0 = _mm256_fmadd_pd(a0, w0, z0);
1366 a1 = _mm256_loadu_pd(v + 4);
1367 z1 = _mm256_fmadd_pd(a1, w0, z1);
1368 a2 = _mm256_loadu_pd(v + 8);
1369 z2 = _mm256_fmadd_pd(a2, w0, z2);
1370
1371 /* second column of a */
1372 w1 = _mm256_set1_pd(work[1]);
1373 a3 = _mm256_loadu_pd(v + 12);
1374 z0 = _mm256_fmadd_pd(a3, w1, z0);
1375 a4 = _mm256_loadu_pd(v + 16);
1376 z1 = _mm256_fmadd_pd(a4, w1, z1);
1377 a5 = _mm256_loadu_pd(v + 20);
1378 z2 = _mm256_fmadd_pd(a5, w1, z2);
1379
1380 /* third column of a */
1381 w2 = _mm256_set1_pd(work[2]);
1382 a0 = _mm256_loadu_pd(v + 24);
1383 z0 = _mm256_fmadd_pd(a0, w2, z0);
1384 a1 = _mm256_loadu_pd(v + 28);
1385 z1 = _mm256_fmadd_pd(a1, w2, z1);
1386 a2 = _mm256_loadu_pd(v + 32);
1387 z2 = _mm256_fmadd_pd(a2, w2, z2);
1388
1389 /* fourth column of a */
1390 w3 = _mm256_set1_pd(work[3]);
1391 a3 = _mm256_loadu_pd(v + 36);
1392 z0 = _mm256_fmadd_pd(a3, w3, z0);
1393 a4 = _mm256_loadu_pd(v + 40);
1394 z1 = _mm256_fmadd_pd(a4, w3, z1);
1395 a5 = _mm256_loadu_pd(v + 44);
1396 z2 = _mm256_fmadd_pd(a5, w3, z2);
1397
1398 /* fifth column of a */
1399 w0 = _mm256_set1_pd(work[4]);
1400 a0 = _mm256_loadu_pd(v + 48);
1401 z0 = _mm256_fmadd_pd(a0, w0, z0);
1402 a1 = _mm256_loadu_pd(v + 52);
1403 z1 = _mm256_fmadd_pd(a1, w0, z1);
1404 a2 = _mm256_loadu_pd(v + 56);
1405 z2 = _mm256_fmadd_pd(a2, w0, z2);
1406
1407 /* sixth column of a */
1408 w1 = _mm256_set1_pd(work[5]);
1409 a3 = _mm256_loadu_pd(v + 60);
1410 z0 = _mm256_fmadd_pd(a3, w1, z0);
1411 a4 = _mm256_loadu_pd(v + 64);
1412 z1 = _mm256_fmadd_pd(a4, w1, z1);
1413 a5 = _mm256_loadu_pd(v + 68);
1414 z2 = _mm256_fmadd_pd(a5, w1, z2);
1415
1416 /* seventh column of a */
1417 w2 = _mm256_set1_pd(work[6]);
1418 a0 = _mm256_loadu_pd(v + 72);
1419 z0 = _mm256_fmadd_pd(a0, w2, z0);
1420 a1 = _mm256_loadu_pd(v + 76);
1421 z1 = _mm256_fmadd_pd(a1, w2, z1);
1422 a2 = _mm256_loadu_pd(v + 80);
1423 z2 = _mm256_fmadd_pd(a2, w2, z2);
1424
1425 /* eighth column of a */
1426 w3 = _mm256_set1_pd(work[7]);
1427 a3 = _mm256_loadu_pd(v + 84);
1428 z0 = _mm256_fmadd_pd(a3, w3, z0);
1429 a4 = _mm256_loadu_pd(v + 88);
1430 z1 = _mm256_fmadd_pd(a4, w3, z1);
1431 a5 = _mm256_loadu_pd(v + 92);
1432 z2 = _mm256_fmadd_pd(a5, w3, z2);
1433
1434 /* ninth column of a */
1435 w0 = _mm256_set1_pd(work[8]);
1436 a0 = _mm256_loadu_pd(v + 96);
1437 z0 = _mm256_fmadd_pd(a0, w0, z0);
1438 a1 = _mm256_loadu_pd(v + 100);
1439 z1 = _mm256_fmadd_pd(a1, w0, z1);
1440 a2 = _mm256_loadu_pd(v + 104);
1441 z2 = _mm256_fmadd_pd(a2, w0, z2);
1442
1443 /* tenth column of a */
1444 w1 = _mm256_set1_pd(work[9]);
1445 a3 = _mm256_loadu_pd(v + 108);
1446 z0 = _mm256_fmadd_pd(a3, w1, z0);
1447 a4 = _mm256_loadu_pd(v + 112);
1448 z1 = _mm256_fmadd_pd(a4, w1, z1);
1449 a5 = _mm256_loadu_pd(v + 116);
1450 z2 = _mm256_fmadd_pd(a5, w1, z2);
1451
1452 /* eleventh column of a */
1453 w2 = _mm256_set1_pd(work[10]);
1454 a0 = _mm256_loadu_pd(v + 120);
1455 z0 = _mm256_fmadd_pd(a0, w2, z0);
1456 a1 = _mm256_loadu_pd(v + 124);
1457 z1 = _mm256_fmadd_pd(a1, w2, z1);
1458 a2 = _mm256_loadu_pd(v + 128);
1459 z2 = _mm256_fmadd_pd(a2, w2, z2);
1460
1461 /* twelveth column of a */
1462 w3 = _mm256_set1_pd(work[11]);
1463 a3 = _mm256_loadu_pd(v + 132);
1464 z0 = _mm256_fmadd_pd(a3, w3, z0);
1465 a4 = _mm256_loadu_pd(v + 136);
1466 z1 = _mm256_fmadd_pd(a4, w3, z1);
1467 a5 = _mm256_loadu_pd(v + 140);
1468 z2 = _mm256_fmadd_pd(a5, w3, z2);
1469
1470 v += bs2;
1471 }
1472 if (usecprow) z = zarray + bs * ridx[i];
1473 _mm256_storeu_pd(&z[0], z0);
1474 _mm256_storeu_pd(&z[4], z1);
1475 _mm256_storeu_pd(&z[8], z2);
1476 if (!usecprow) z += bs;
1477 }
1478 PetscCall(VecRestoreArrayRead(xx, &x));
1479 PetscCall(VecRestoreArrayWrite(zz, &zarray));
1480 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1481 PetscFunctionReturn(PETSC_SUCCESS);
1482 }
1483 #endif
1484
1485 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1486 /* Default MatMult for block size 15 */
MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)1487 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1488 {
1489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1490 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1491 const PetscScalar *x, *xb;
1492 PetscScalar *zarray, xv;
1493 const MatScalar *v;
1494 const PetscInt *ii, *ij = a->j, *idx;
1495 PetscInt mbs, i, j, k, n, *ridx = NULL;
1496 PetscBool usecprow = a->compressedrow.use;
1497
1498 PetscFunctionBegin;
1499 PetscCall(VecGetArrayRead(xx, &x));
1500 PetscCall(VecGetArrayWrite(zz, &zarray));
1501
1502 v = a->a;
1503 if (usecprow) {
1504 mbs = a->compressedrow.nrows;
1505 ii = a->compressedrow.i;
1506 ridx = a->compressedrow.rindex;
1507 PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1508 } else {
1509 mbs = a->mbs;
1510 ii = a->i;
1511 z = zarray;
1512 }
1513
1514 for (i = 0; i < mbs; i++) {
1515 n = ii[i + 1] - ii[i];
1516 idx = ij + ii[i];
1517 sum1 = 0.0;
1518 sum2 = 0.0;
1519 sum3 = 0.0;
1520 sum4 = 0.0;
1521 sum5 = 0.0;
1522 sum6 = 0.0;
1523 sum7 = 0.0;
1524 sum8 = 0.0;
1525 sum9 = 0.0;
1526 sum10 = 0.0;
1527 sum11 = 0.0;
1528 sum12 = 0.0;
1529 sum13 = 0.0;
1530 sum14 = 0.0;
1531 sum15 = 0.0;
1532
1533 for (j = 0; j < n; j++) {
1534 xb = x + 15 * (idx[j]);
1535
1536 for (k = 0; k < 15; k++) {
1537 xv = xb[k];
1538 sum1 += v[0] * xv;
1539 sum2 += v[1] * xv;
1540 sum3 += v[2] * xv;
1541 sum4 += v[3] * xv;
1542 sum5 += v[4] * xv;
1543 sum6 += v[5] * xv;
1544 sum7 += v[6] * xv;
1545 sum8 += v[7] * xv;
1546 sum9 += v[8] * xv;
1547 sum10 += v[9] * xv;
1548 sum11 += v[10] * xv;
1549 sum12 += v[11] * xv;
1550 sum13 += v[12] * xv;
1551 sum14 += v[13] * xv;
1552 sum15 += v[14] * xv;
1553 v += 15;
1554 }
1555 }
1556 if (usecprow) z = zarray + 15 * ridx[i];
1557 z[0] = sum1;
1558 z[1] = sum2;
1559 z[2] = sum3;
1560 z[3] = sum4;
1561 z[4] = sum5;
1562 z[5] = sum6;
1563 z[6] = sum7;
1564 z[7] = sum8;
1565 z[8] = sum9;
1566 z[9] = sum10;
1567 z[10] = sum11;
1568 z[11] = sum12;
1569 z[12] = sum13;
1570 z[13] = sum14;
1571 z[14] = sum15;
1572
1573 if (!usecprow) z += 15;
1574 }
1575
1576 PetscCall(VecRestoreArrayRead(xx, &x));
1577 PetscCall(VecRestoreArrayWrite(zz, &zarray));
1578 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1579 PetscFunctionReturn(PETSC_SUCCESS);
1580 }
1581
1582 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)1583 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1584 {
1585 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1586 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1587 const PetscScalar *x, *xb;
1588 PetscScalar x1, x2, x3, x4, *zarray;
1589 const MatScalar *v;
1590 const PetscInt *ii, *ij = a->j, *idx;
1591 PetscInt mbs, i, j, n, *ridx = NULL;
1592 PetscBool usecprow = a->compressedrow.use;
1593
1594 PetscFunctionBegin;
1595 PetscCall(VecGetArrayRead(xx, &x));
1596 PetscCall(VecGetArrayWrite(zz, &zarray));
1597
1598 v = a->a;
1599 if (usecprow) {
1600 mbs = a->compressedrow.nrows;
1601 ii = a->compressedrow.i;
1602 ridx = a->compressedrow.rindex;
1603 PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1604 } else {
1605 mbs = a->mbs;
1606 ii = a->i;
1607 z = zarray;
1608 }
1609
1610 for (i = 0; i < mbs; i++) {
1611 n = ii[i + 1] - ii[i];
1612 idx = ij + ii[i];
1613 sum1 = 0.0;
1614 sum2 = 0.0;
1615 sum3 = 0.0;
1616 sum4 = 0.0;
1617 sum5 = 0.0;
1618 sum6 = 0.0;
1619 sum7 = 0.0;
1620 sum8 = 0.0;
1621 sum9 = 0.0;
1622 sum10 = 0.0;
1623 sum11 = 0.0;
1624 sum12 = 0.0;
1625 sum13 = 0.0;
1626 sum14 = 0.0;
1627 sum15 = 0.0;
1628
1629 for (j = 0; j < n; j++) {
1630 xb = x + 15 * (idx[j]);
1631 x1 = xb[0];
1632 x2 = xb[1];
1633 x3 = xb[2];
1634 x4 = xb[3];
1635
1636 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1637 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1638 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1639 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1640 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1641 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1642 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1643 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1644 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1645 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1646 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1647 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1648 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1649 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1650 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1651
1652 v += 60;
1653
1654 x1 = xb[4];
1655 x2 = xb[5];
1656 x3 = xb[6];
1657 x4 = xb[7];
1658
1659 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1660 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1661 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1662 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1663 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1664 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1665 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1666 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1667 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1668 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1669 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1670 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1671 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1672 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1673 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1674 v += 60;
1675
1676 x1 = xb[8];
1677 x2 = xb[9];
1678 x3 = xb[10];
1679 x4 = xb[11];
1680 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1681 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1682 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1683 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1684 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1685 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1686 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1687 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1688 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1689 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1690 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1691 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1692 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1693 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1694 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1695 v += 60;
1696
1697 x1 = xb[12];
1698 x2 = xb[13];
1699 x3 = xb[14];
1700 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1701 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1702 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1703 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1704 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1705 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1706 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1707 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1708 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1709 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1710 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1711 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1712 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1713 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1714 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1715 v += 45;
1716 }
1717 if (usecprow) z = zarray + 15 * ridx[i];
1718 z[0] = sum1;
1719 z[1] = sum2;
1720 z[2] = sum3;
1721 z[3] = sum4;
1722 z[4] = sum5;
1723 z[5] = sum6;
1724 z[6] = sum7;
1725 z[7] = sum8;
1726 z[8] = sum9;
1727 z[9] = sum10;
1728 z[10] = sum11;
1729 z[11] = sum12;
1730 z[12] = sum13;
1731 z[13] = sum14;
1732 z[14] = sum15;
1733
1734 if (!usecprow) z += 15;
1735 }
1736
1737 PetscCall(VecRestoreArrayRead(xx, &x));
1738 PetscCall(VecRestoreArrayWrite(zz, &zarray));
1739 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1740 PetscFunctionReturn(PETSC_SUCCESS);
1741 }
1742
1743 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)1744 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1745 {
1746 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1747 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1748 const PetscScalar *x, *xb;
1749 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1750 const MatScalar *v;
1751 const PetscInt *ii, *ij = a->j, *idx;
1752 PetscInt mbs, i, j, n, *ridx = NULL;
1753 PetscBool usecprow = a->compressedrow.use;
1754
1755 PetscFunctionBegin;
1756 PetscCall(VecGetArrayRead(xx, &x));
1757 PetscCall(VecGetArrayWrite(zz, &zarray));
1758
1759 v = a->a;
1760 if (usecprow) {
1761 mbs = a->compressedrow.nrows;
1762 ii = a->compressedrow.i;
1763 ridx = a->compressedrow.rindex;
1764 PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1765 } else {
1766 mbs = a->mbs;
1767 ii = a->i;
1768 z = zarray;
1769 }
1770
1771 for (i = 0; i < mbs; i++) {
1772 n = ii[i + 1] - ii[i];
1773 idx = ij + ii[i];
1774 sum1 = 0.0;
1775 sum2 = 0.0;
1776 sum3 = 0.0;
1777 sum4 = 0.0;
1778 sum5 = 0.0;
1779 sum6 = 0.0;
1780 sum7 = 0.0;
1781 sum8 = 0.0;
1782 sum9 = 0.0;
1783 sum10 = 0.0;
1784 sum11 = 0.0;
1785 sum12 = 0.0;
1786 sum13 = 0.0;
1787 sum14 = 0.0;
1788 sum15 = 0.0;
1789
1790 for (j = 0; j < n; j++) {
1791 xb = x + 15 * (idx[j]);
1792 x1 = xb[0];
1793 x2 = xb[1];
1794 x3 = xb[2];
1795 x4 = xb[3];
1796 x5 = xb[4];
1797 x6 = xb[5];
1798 x7 = xb[6];
1799 x8 = xb[7];
1800
1801 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1802 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1803 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1804 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1805 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1806 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1807 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1808 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1809 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1810 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1811 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1812 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1813 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1814 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1815 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1816 v += 120;
1817
1818 x1 = xb[8];
1819 x2 = xb[9];
1820 x3 = xb[10];
1821 x4 = xb[11];
1822 x5 = xb[12];
1823 x6 = xb[13];
1824 x7 = xb[14];
1825
1826 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1827 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1828 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1829 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1830 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1831 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1832 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1833 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1834 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1835 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1836 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1837 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1838 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1839 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1840 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1841 v += 105;
1842 }
1843 if (usecprow) z = zarray + 15 * ridx[i];
1844 z[0] = sum1;
1845 z[1] = sum2;
1846 z[2] = sum3;
1847 z[3] = sum4;
1848 z[4] = sum5;
1849 z[5] = sum6;
1850 z[6] = sum7;
1851 z[7] = sum8;
1852 z[8] = sum9;
1853 z[9] = sum10;
1854 z[10] = sum11;
1855 z[11] = sum12;
1856 z[12] = sum13;
1857 z[13] = sum14;
1858 z[14] = sum15;
1859
1860 if (!usecprow) z += 15;
1861 }
1862
1863 PetscCall(VecRestoreArrayRead(xx, &x));
1864 PetscCall(VecRestoreArrayWrite(zz, &zarray));
1865 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1866 PetscFunctionReturn(PETSC_SUCCESS);
1867 }
1868
1869 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)1870 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1871 {
1872 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1873 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1874 const PetscScalar *x, *xb;
1875 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1876 const MatScalar *v;
1877 const PetscInt *ii, *ij = a->j, *idx;
1878 PetscInt mbs, i, j, n, *ridx = NULL;
1879 PetscBool usecprow = a->compressedrow.use;
1880
1881 PetscFunctionBegin;
1882 PetscCall(VecGetArrayRead(xx, &x));
1883 PetscCall(VecGetArrayWrite(zz, &zarray));
1884
1885 v = a->a;
1886 if (usecprow) {
1887 mbs = a->compressedrow.nrows;
1888 ii = a->compressedrow.i;
1889 ridx = a->compressedrow.rindex;
1890 PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1891 } else {
1892 mbs = a->mbs;
1893 ii = a->i;
1894 z = zarray;
1895 }
1896
1897 for (i = 0; i < mbs; i++) {
1898 n = ii[i + 1] - ii[i];
1899 idx = ij + ii[i];
1900 sum1 = 0.0;
1901 sum2 = 0.0;
1902 sum3 = 0.0;
1903 sum4 = 0.0;
1904 sum5 = 0.0;
1905 sum6 = 0.0;
1906 sum7 = 0.0;
1907 sum8 = 0.0;
1908 sum9 = 0.0;
1909 sum10 = 0.0;
1910 sum11 = 0.0;
1911 sum12 = 0.0;
1912 sum13 = 0.0;
1913 sum14 = 0.0;
1914 sum15 = 0.0;
1915
1916 for (j = 0; j < n; j++) {
1917 xb = x + 15 * (idx[j]);
1918 x1 = xb[0];
1919 x2 = xb[1];
1920 x3 = xb[2];
1921 x4 = xb[3];
1922 x5 = xb[4];
1923 x6 = xb[5];
1924 x7 = xb[6];
1925 x8 = xb[7];
1926 x9 = xb[8];
1927 x10 = xb[9];
1928 x11 = xb[10];
1929 x12 = xb[11];
1930 x13 = xb[12];
1931 x14 = xb[13];
1932 x15 = xb[14];
1933
1934 sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1935 sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1936 sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1937 sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1938 sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1939 sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1940 sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1941 sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1942 sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1943 sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1944 sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1945 sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1946 sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1947 sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1948 sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1949 v += 225;
1950 }
1951 if (usecprow) z = zarray + 15 * ridx[i];
1952 z[0] = sum1;
1953 z[1] = sum2;
1954 z[2] = sum3;
1955 z[3] = sum4;
1956 z[4] = sum5;
1957 z[5] = sum6;
1958 z[6] = sum7;
1959 z[7] = sum8;
1960 z[8] = sum9;
1961 z[9] = sum10;
1962 z[10] = sum11;
1963 z[11] = sum12;
1964 z[12] = sum13;
1965 z[13] = sum14;
1966 z[14] = sum15;
1967
1968 if (!usecprow) z += 15;
1969 }
1970
1971 PetscCall(VecRestoreArrayRead(xx, &x));
1972 PetscCall(VecRestoreArrayWrite(zz, &zarray));
1973 PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1974 PetscFunctionReturn(PETSC_SUCCESS);
1975 }
1976
1977 /*
1978 This will not work with MatScalar == float because it calls the BLAS
1979 */
MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)1980 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
1981 {
1982 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1983 PetscScalar *z = NULL, *work, *workt, *zarray;
1984 const PetscScalar *x, *xb;
1985 const MatScalar *v;
1986 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1987 const PetscInt *idx, *ii, *ridx = NULL;
1988 PetscInt ncols, k;
1989 PetscBool usecprow = a->compressedrow.use;
1990
1991 PetscFunctionBegin;
1992 PetscCall(VecGetArrayRead(xx, &x));
1993 PetscCall(VecGetArrayWrite(zz, &zarray));
1994
1995 idx = a->j;
1996 v = a->a;
1997 if (usecprow) {
1998 mbs = a->compressedrow.nrows;
1999 ii = a->compressedrow.i;
2000 ridx = a->compressedrow.rindex;
2001 PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2002 } else {
2003 mbs = a->mbs;
2004 ii = a->i;
2005 z = zarray;
2006 }
2007
2008 if (!a->mult_work) {
2009 k = PetscMax(A->rmap->n, A->cmap->n);
2010 PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2011 }
2012 work = a->mult_work;
2013 for (i = 0; i < mbs; i++) {
2014 n = ii[1] - ii[0];
2015 ii++;
2016 ncols = n * bs;
2017 workt = work;
2018 for (j = 0; j < n; j++) {
2019 xb = x + bs * (*idx++);
2020 for (k = 0; k < bs; k++) workt[k] = xb[k];
2021 workt += bs;
2022 }
2023 if (usecprow) z = zarray + bs * ridx[i];
2024 PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2025 v += n * bs2;
2026 if (!usecprow) z += bs;
2027 }
2028 PetscCall(VecRestoreArrayRead(xx, &x));
2029 PetscCall(VecRestoreArrayWrite(zz, &zarray));
2030 PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2031 PetscFunctionReturn(PETSC_SUCCESS);
2032 }
2033
MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)2034 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2035 {
2036 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2037 const PetscScalar *x;
2038 PetscScalar *y, *z, sum;
2039 const MatScalar *v;
2040 PetscInt mbs = a->mbs, i, n, *ridx = NULL;
2041 const PetscInt *idx, *ii;
2042 PetscBool usecprow = a->compressedrow.use;
2043
2044 PetscFunctionBegin;
2045 PetscCall(VecGetArrayRead(xx, &x));
2046 PetscCall(VecGetArrayPair(yy, zz, &y, &z));
2047
2048 idx = a->j;
2049 v = a->a;
2050 if (usecprow) {
2051 if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2052 mbs = a->compressedrow.nrows;
2053 ii = a->compressedrow.i;
2054 ridx = a->compressedrow.rindex;
2055 } else {
2056 ii = a->i;
2057 }
2058
2059 for (i = 0; i < mbs; i++) {
2060 n = ii[1] - ii[0];
2061 ii++;
2062 if (!usecprow) {
2063 sum = y[i];
2064 } else {
2065 sum = y[ridx[i]];
2066 }
2067 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2068 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2069 PetscSparseDensePlusDot(sum, x, v, idx, n);
2070 v += n;
2071 idx += n;
2072 if (usecprow) {
2073 z[ridx[i]] = sum;
2074 } else {
2075 z[i] = sum;
2076 }
2077 }
2078 PetscCall(VecRestoreArrayRead(xx, &x));
2079 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2080 PetscCall(PetscLogFlops(2.0 * a->nz));
2081 PetscFunctionReturn(PETSC_SUCCESS);
2082 }
2083
MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)2084 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2085 {
2086 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2087 PetscScalar *y = NULL, *z = NULL, sum1, sum2;
2088 const PetscScalar *x, *xb;
2089 PetscScalar x1, x2, *yarray, *zarray;
2090 const MatScalar *v;
2091 PetscInt mbs = a->mbs, i, n, j;
2092 const PetscInt *idx, *ii, *ridx = NULL;
2093 PetscBool usecprow = a->compressedrow.use;
2094
2095 PetscFunctionBegin;
2096 PetscCall(VecGetArrayRead(xx, &x));
2097 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2098
2099 idx = a->j;
2100 v = a->a;
2101 if (usecprow) {
2102 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2103 mbs = a->compressedrow.nrows;
2104 ii = a->compressedrow.i;
2105 ridx = a->compressedrow.rindex;
2106 } else {
2107 ii = a->i;
2108 y = yarray;
2109 z = zarray;
2110 }
2111
2112 for (i = 0; i < mbs; i++) {
2113 n = ii[1] - ii[0];
2114 ii++;
2115 if (usecprow) {
2116 z = zarray + 2 * ridx[i];
2117 y = yarray + 2 * ridx[i];
2118 }
2119 sum1 = y[0];
2120 sum2 = y[1];
2121 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2122 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2123 for (j = 0; j < n; j++) {
2124 xb = x + 2 * (*idx++);
2125 x1 = xb[0];
2126 x2 = xb[1];
2127
2128 sum1 += v[0] * x1 + v[2] * x2;
2129 sum2 += v[1] * x1 + v[3] * x2;
2130 v += 4;
2131 }
2132 z[0] = sum1;
2133 z[1] = sum2;
2134 if (!usecprow) {
2135 z += 2;
2136 y += 2;
2137 }
2138 }
2139 PetscCall(VecRestoreArrayRead(xx, &x));
2140 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2141 PetscCall(PetscLogFlops(4.0 * a->nz));
2142 PetscFunctionReturn(PETSC_SUCCESS);
2143 }
2144
MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)2145 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2146 {
2147 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2148 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2149 const PetscScalar *x, *xb;
2150 const MatScalar *v;
2151 PetscInt mbs = a->mbs, i, j, n;
2152 const PetscInt *idx, *ii, *ridx = NULL;
2153 PetscBool usecprow = a->compressedrow.use;
2154
2155 PetscFunctionBegin;
2156 PetscCall(VecGetArrayRead(xx, &x));
2157 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2158
2159 idx = a->j;
2160 v = a->a;
2161 if (usecprow) {
2162 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2163 mbs = a->compressedrow.nrows;
2164 ii = a->compressedrow.i;
2165 ridx = a->compressedrow.rindex;
2166 } else {
2167 ii = a->i;
2168 y = yarray;
2169 z = zarray;
2170 }
2171
2172 for (i = 0; i < mbs; i++) {
2173 n = ii[1] - ii[0];
2174 ii++;
2175 if (usecprow) {
2176 z = zarray + 3 * ridx[i];
2177 y = yarray + 3 * ridx[i];
2178 }
2179 sum1 = y[0];
2180 sum2 = y[1];
2181 sum3 = y[2];
2182 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2183 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2184 for (j = 0; j < n; j++) {
2185 xb = x + 3 * (*idx++);
2186 x1 = xb[0];
2187 x2 = xb[1];
2188 x3 = xb[2];
2189 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2190 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2191 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2192 v += 9;
2193 }
2194 z[0] = sum1;
2195 z[1] = sum2;
2196 z[2] = sum3;
2197 if (!usecprow) {
2198 z += 3;
2199 y += 3;
2200 }
2201 }
2202 PetscCall(VecRestoreArrayRead(xx, &x));
2203 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2204 PetscCall(PetscLogFlops(18.0 * a->nz));
2205 PetscFunctionReturn(PETSC_SUCCESS);
2206 }
2207
MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)2208 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2209 {
2210 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2211 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2212 const PetscScalar *x, *xb;
2213 const MatScalar *v;
2214 PetscInt mbs = a->mbs, i, j, n;
2215 const PetscInt *idx, *ii, *ridx = NULL;
2216 PetscBool usecprow = a->compressedrow.use;
2217
2218 PetscFunctionBegin;
2219 PetscCall(VecGetArrayRead(xx, &x));
2220 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2221
2222 idx = a->j;
2223 v = a->a;
2224 if (usecprow) {
2225 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2226 mbs = a->compressedrow.nrows;
2227 ii = a->compressedrow.i;
2228 ridx = a->compressedrow.rindex;
2229 } else {
2230 ii = a->i;
2231 y = yarray;
2232 z = zarray;
2233 }
2234
2235 for (i = 0; i < mbs; i++) {
2236 n = ii[1] - ii[0];
2237 ii++;
2238 if (usecprow) {
2239 z = zarray + 4 * ridx[i];
2240 y = yarray + 4 * ridx[i];
2241 }
2242 sum1 = y[0];
2243 sum2 = y[1];
2244 sum3 = y[2];
2245 sum4 = y[3];
2246 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2247 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2248 for (j = 0; j < n; j++) {
2249 xb = x + 4 * (*idx++);
2250 x1 = xb[0];
2251 x2 = xb[1];
2252 x3 = xb[2];
2253 x4 = xb[3];
2254 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2255 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2256 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2257 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2258 v += 16;
2259 }
2260 z[0] = sum1;
2261 z[1] = sum2;
2262 z[2] = sum3;
2263 z[3] = sum4;
2264 if (!usecprow) {
2265 z += 4;
2266 y += 4;
2267 }
2268 }
2269 PetscCall(VecRestoreArrayRead(xx, &x));
2270 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2271 PetscCall(PetscLogFlops(32.0 * a->nz));
2272 PetscFunctionReturn(PETSC_SUCCESS);
2273 }
2274
MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)2275 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2276 {
2277 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2278 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2279 const PetscScalar *x, *xb;
2280 PetscScalar *yarray, *zarray;
2281 const MatScalar *v;
2282 PetscInt mbs = a->mbs, i, j, n;
2283 const PetscInt *idx, *ii, *ridx = NULL;
2284 PetscBool usecprow = a->compressedrow.use;
2285
2286 PetscFunctionBegin;
2287 PetscCall(VecGetArrayRead(xx, &x));
2288 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2289
2290 idx = a->j;
2291 v = a->a;
2292 if (usecprow) {
2293 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2294 mbs = a->compressedrow.nrows;
2295 ii = a->compressedrow.i;
2296 ridx = a->compressedrow.rindex;
2297 } else {
2298 ii = a->i;
2299 y = yarray;
2300 z = zarray;
2301 }
2302
2303 for (i = 0; i < mbs; i++) {
2304 n = ii[1] - ii[0];
2305 ii++;
2306 if (usecprow) {
2307 z = zarray + 5 * ridx[i];
2308 y = yarray + 5 * ridx[i];
2309 }
2310 sum1 = y[0];
2311 sum2 = y[1];
2312 sum3 = y[2];
2313 sum4 = y[3];
2314 sum5 = y[4];
2315 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2316 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2317 for (j = 0; j < n; j++) {
2318 xb = x + 5 * (*idx++);
2319 x1 = xb[0];
2320 x2 = xb[1];
2321 x3 = xb[2];
2322 x4 = xb[3];
2323 x5 = xb[4];
2324 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2325 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2326 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2327 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2328 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2329 v += 25;
2330 }
2331 z[0] = sum1;
2332 z[1] = sum2;
2333 z[2] = sum3;
2334 z[3] = sum4;
2335 z[4] = sum5;
2336 if (!usecprow) {
2337 z += 5;
2338 y += 5;
2339 }
2340 }
2341 PetscCall(VecRestoreArrayRead(xx, &x));
2342 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2343 PetscCall(PetscLogFlops(50.0 * a->nz));
2344 PetscFunctionReturn(PETSC_SUCCESS);
2345 }
2346
MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)2347 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2348 {
2349 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2350 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2351 const PetscScalar *x, *xb;
2352 PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2353 const MatScalar *v;
2354 PetscInt mbs = a->mbs, i, j, n;
2355 const PetscInt *idx, *ii, *ridx = NULL;
2356 PetscBool usecprow = a->compressedrow.use;
2357
2358 PetscFunctionBegin;
2359 PetscCall(VecGetArrayRead(xx, &x));
2360 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2361
2362 idx = a->j;
2363 v = a->a;
2364 if (usecprow) {
2365 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2366 mbs = a->compressedrow.nrows;
2367 ii = a->compressedrow.i;
2368 ridx = a->compressedrow.rindex;
2369 } else {
2370 ii = a->i;
2371 y = yarray;
2372 z = zarray;
2373 }
2374
2375 for (i = 0; i < mbs; i++) {
2376 n = ii[1] - ii[0];
2377 ii++;
2378 if (usecprow) {
2379 z = zarray + 6 * ridx[i];
2380 y = yarray + 6 * ridx[i];
2381 }
2382 sum1 = y[0];
2383 sum2 = y[1];
2384 sum3 = y[2];
2385 sum4 = y[3];
2386 sum5 = y[4];
2387 sum6 = y[5];
2388 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2389 PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2390 for (j = 0; j < n; j++) {
2391 xb = x + 6 * (*idx++);
2392 x1 = xb[0];
2393 x2 = xb[1];
2394 x3 = xb[2];
2395 x4 = xb[3];
2396 x5 = xb[4];
2397 x6 = xb[5];
2398 sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2399 sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2400 sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2401 sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2402 sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2403 sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2404 v += 36;
2405 }
2406 z[0] = sum1;
2407 z[1] = sum2;
2408 z[2] = sum3;
2409 z[3] = sum4;
2410 z[4] = sum5;
2411 z[5] = sum6;
2412 if (!usecprow) {
2413 z += 6;
2414 y += 6;
2415 }
2416 }
2417 PetscCall(VecRestoreArrayRead(xx, &x));
2418 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2419 PetscCall(PetscLogFlops(72.0 * a->nz));
2420 PetscFunctionReturn(PETSC_SUCCESS);
2421 }
2422
MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)2423 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2424 {
2425 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2426 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2427 const PetscScalar *x, *xb;
2428 PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2429 const MatScalar *v;
2430 PetscInt mbs = a->mbs, i, j, n;
2431 const PetscInt *idx, *ii, *ridx = NULL;
2432 PetscBool usecprow = a->compressedrow.use;
2433
2434 PetscFunctionBegin;
2435 PetscCall(VecGetArrayRead(xx, &x));
2436 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2437
2438 idx = a->j;
2439 v = a->a;
2440 if (usecprow) {
2441 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2442 mbs = a->compressedrow.nrows;
2443 ii = a->compressedrow.i;
2444 ridx = a->compressedrow.rindex;
2445 } else {
2446 ii = a->i;
2447 y = yarray;
2448 z = zarray;
2449 }
2450
2451 for (i = 0; i < mbs; i++) {
2452 n = ii[1] - ii[0];
2453 ii++;
2454 if (usecprow) {
2455 z = zarray + 7 * ridx[i];
2456 y = yarray + 7 * ridx[i];
2457 }
2458 sum1 = y[0];
2459 sum2 = y[1];
2460 sum3 = y[2];
2461 sum4 = y[3];
2462 sum5 = y[4];
2463 sum6 = y[5];
2464 sum7 = y[6];
2465 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2466 PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2467 for (j = 0; j < n; j++) {
2468 xb = x + 7 * (*idx++);
2469 x1 = xb[0];
2470 x2 = xb[1];
2471 x3 = xb[2];
2472 x4 = xb[3];
2473 x5 = xb[4];
2474 x6 = xb[5];
2475 x7 = xb[6];
2476 sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2477 sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2478 sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2479 sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2480 sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2481 sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2482 sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2483 v += 49;
2484 }
2485 z[0] = sum1;
2486 z[1] = sum2;
2487 z[2] = sum3;
2488 z[3] = sum4;
2489 z[4] = sum5;
2490 z[5] = sum6;
2491 z[6] = sum7;
2492 if (!usecprow) {
2493 z += 7;
2494 y += 7;
2495 }
2496 }
2497 PetscCall(VecRestoreArrayRead(xx, &x));
2498 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2499 PetscCall(PetscLogFlops(98.0 * a->nz));
2500 PetscFunctionReturn(PETSC_SUCCESS);
2501 }
2502
2503 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)2504 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2505 {
2506 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2507 PetscScalar *z = NULL, *work, *workt, *zarray;
2508 const PetscScalar *x, *xb;
2509 const MatScalar *v;
2510 PetscInt mbs, i, j, n;
2511 PetscInt k;
2512 PetscBool usecprow = a->compressedrow.use;
2513 const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2514
2515 __m256d a0, a1, a2, a3, a4, a5;
2516 __m256d w0, w1, w2, w3;
2517 __m256d z0, z1, z2;
2518 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2519
2520 PetscFunctionBegin;
2521 PetscCall(VecCopy(yy, zz));
2522 PetscCall(VecGetArrayRead(xx, &x));
2523 PetscCall(VecGetArray(zz, &zarray));
2524
2525 idx = a->j;
2526 v = a->a;
2527 if (usecprow) {
2528 mbs = a->compressedrow.nrows;
2529 ii = a->compressedrow.i;
2530 ridx = a->compressedrow.rindex;
2531 } else {
2532 mbs = a->mbs;
2533 ii = a->i;
2534 z = zarray;
2535 }
2536
2537 if (!a->mult_work) {
2538 k = PetscMax(A->rmap->n, A->cmap->n);
2539 PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2540 }
2541
2542 work = a->mult_work;
2543 for (i = 0; i < mbs; i++) {
2544 n = ii[1] - ii[0];
2545 ii++;
2546 workt = work;
2547 for (j = 0; j < n; j++) {
2548 xb = x + bs * (*idx++);
2549 for (k = 0; k < bs; k++) workt[k] = xb[k];
2550 workt += bs;
2551 }
2552 if (usecprow) z = zarray + bs * ridx[i];
2553
2554 z0 = _mm256_loadu_pd(&z[0]);
2555 z1 = _mm256_loadu_pd(&z[4]);
2556 z2 = _mm256_set1_pd(z[8]);
2557
2558 for (j = 0; j < n; j++) {
2559 /* first column of a */
2560 w0 = _mm256_set1_pd(work[j * 9]);
2561 a0 = _mm256_loadu_pd(&v[j * 81]);
2562 z0 = _mm256_fmadd_pd(a0, w0, z0);
2563 a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2564 z1 = _mm256_fmadd_pd(a1, w0, z1);
2565 a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2566 z2 = _mm256_fmadd_pd(a2, w0, z2);
2567
2568 /* second column of a */
2569 w1 = _mm256_set1_pd(work[j * 9 + 1]);
2570 a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2571 z0 = _mm256_fmadd_pd(a0, w1, z0);
2572 a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2573 z1 = _mm256_fmadd_pd(a1, w1, z1);
2574 a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2575 z2 = _mm256_fmadd_pd(a2, w1, z2);
2576
2577 /* third column of a */
2578 w2 = _mm256_set1_pd(work[j * 9 + 2]);
2579 a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2580 z0 = _mm256_fmadd_pd(a3, w2, z0);
2581 a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2582 z1 = _mm256_fmadd_pd(a4, w2, z1);
2583 a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2584 z2 = _mm256_fmadd_pd(a5, w2, z2);
2585
2586 /* fourth column of a */
2587 w3 = _mm256_set1_pd(work[j * 9 + 3]);
2588 a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2589 z0 = _mm256_fmadd_pd(a0, w3, z0);
2590 a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2591 z1 = _mm256_fmadd_pd(a1, w3, z1);
2592 a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2593 z2 = _mm256_fmadd_pd(a2, w3, z2);
2594
2595 /* fifth column of a */
2596 w0 = _mm256_set1_pd(work[j * 9 + 4]);
2597 a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2598 z0 = _mm256_fmadd_pd(a3, w0, z0);
2599 a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2600 z1 = _mm256_fmadd_pd(a4, w0, z1);
2601 a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2602 z2 = _mm256_fmadd_pd(a5, w0, z2);
2603
2604 /* sixth column of a */
2605 w1 = _mm256_set1_pd(work[j * 9 + 5]);
2606 a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2607 z0 = _mm256_fmadd_pd(a0, w1, z0);
2608 a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2609 z1 = _mm256_fmadd_pd(a1, w1, z1);
2610 a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2611 z2 = _mm256_fmadd_pd(a2, w1, z2);
2612
2613 /* seventh column of a */
2614 w2 = _mm256_set1_pd(work[j * 9 + 6]);
2615 a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2616 z0 = _mm256_fmadd_pd(a0, w2, z0);
2617 a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2618 z1 = _mm256_fmadd_pd(a1, w2, z1);
2619 a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2620 z2 = _mm256_fmadd_pd(a2, w2, z2);
2621
2622 /* eighth column of a */
2623 w3 = _mm256_set1_pd(work[j * 9 + 7]);
2624 a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2625 z0 = _mm256_fmadd_pd(a3, w3, z0);
2626 a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2627 z1 = _mm256_fmadd_pd(a4, w3, z1);
2628 a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2629 z2 = _mm256_fmadd_pd(a5, w3, z2);
2630
2631 /* ninth column of a */
2632 w0 = _mm256_set1_pd(work[j * 9 + 8]);
2633 a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2634 z0 = _mm256_fmadd_pd(a0, w0, z0);
2635 a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2636 z1 = _mm256_fmadd_pd(a1, w0, z1);
2637 a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2638 z2 = _mm256_fmadd_pd(a2, w0, z2);
2639 }
2640
2641 _mm256_storeu_pd(&z[0], z0);
2642 _mm256_storeu_pd(&z[4], z1);
2643 _mm256_maskstore_pd(&z[8], mask1, z2);
2644
2645 v += n * bs2;
2646 if (!usecprow) z += bs;
2647 }
2648 PetscCall(VecRestoreArrayRead(xx, &x));
2649 PetscCall(VecRestoreArray(zz, &zarray));
2650 PetscCall(PetscLogFlops(162.0 * a->nz));
2651 PetscFunctionReturn(PETSC_SUCCESS);
2652 }
2653 #endif
2654
MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)2655 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2656 {
2657 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2658 PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2659 const PetscScalar *x, *xb;
2660 PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2661 const MatScalar *v;
2662 PetscInt mbs = a->mbs, i, j, n;
2663 const PetscInt *idx, *ii, *ridx = NULL;
2664 PetscBool usecprow = a->compressedrow.use;
2665
2666 PetscFunctionBegin;
2667 PetscCall(VecGetArrayRead(xx, &x));
2668 PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2669
2670 idx = a->j;
2671 v = a->a;
2672 if (usecprow) {
2673 if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2674 mbs = a->compressedrow.nrows;
2675 ii = a->compressedrow.i;
2676 ridx = a->compressedrow.rindex;
2677 } else {
2678 ii = a->i;
2679 y = yarray;
2680 z = zarray;
2681 }
2682
2683 for (i = 0; i < mbs; i++) {
2684 n = ii[1] - ii[0];
2685 ii++;
2686 if (usecprow) {
2687 z = zarray + 11 * ridx[i];
2688 y = yarray + 11 * ridx[i];
2689 }
2690 sum1 = y[0];
2691 sum2 = y[1];
2692 sum3 = y[2];
2693 sum4 = y[3];
2694 sum5 = y[4];
2695 sum6 = y[5];
2696 sum7 = y[6];
2697 sum8 = y[7];
2698 sum9 = y[8];
2699 sum10 = y[9];
2700 sum11 = y[10];
2701 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2702 PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2703 for (j = 0; j < n; j++) {
2704 xb = x + 11 * (*idx++);
2705 x1 = xb[0];
2706 x2 = xb[1];
2707 x3 = xb[2];
2708 x4 = xb[3];
2709 x5 = xb[4];
2710 x6 = xb[5];
2711 x7 = xb[6];
2712 x8 = xb[7];
2713 x9 = xb[8];
2714 x10 = xb[9];
2715 x11 = xb[10];
2716 sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2717 sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2718 sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2719 sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2720 sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2721 sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2722 sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2723 sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2724 sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2725 sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2726 sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2727 v += 121;
2728 }
2729 z[0] = sum1;
2730 z[1] = sum2;
2731 z[2] = sum3;
2732 z[3] = sum4;
2733 z[4] = sum5;
2734 z[5] = sum6;
2735 z[6] = sum7;
2736 z[7] = sum8;
2737 z[8] = sum9;
2738 z[9] = sum10;
2739 z[10] = sum11;
2740 if (!usecprow) {
2741 z += 11;
2742 y += 11;
2743 }
2744 }
2745 PetscCall(VecRestoreArrayRead(xx, &x));
2746 PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2747 PetscCall(PetscLogFlops(242.0 * a->nz));
2748 PetscFunctionReturn(PETSC_SUCCESS);
2749 }
2750
MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)2751 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2752 {
2753 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2754 PetscScalar *z = NULL, *work, *workt, *zarray;
2755 const PetscScalar *x, *xb;
2756 const MatScalar *v;
2757 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2758 PetscInt ncols, k;
2759 const PetscInt *ridx = NULL, *idx, *ii;
2760 PetscBool usecprow = a->compressedrow.use;
2761
2762 PetscFunctionBegin;
2763 PetscCall(VecCopy(yy, zz));
2764 PetscCall(VecGetArrayRead(xx, &x));
2765 PetscCall(VecGetArray(zz, &zarray));
2766
2767 idx = a->j;
2768 v = a->a;
2769 if (usecprow) {
2770 mbs = a->compressedrow.nrows;
2771 ii = a->compressedrow.i;
2772 ridx = a->compressedrow.rindex;
2773 } else {
2774 mbs = a->mbs;
2775 ii = a->i;
2776 z = zarray;
2777 }
2778
2779 if (!a->mult_work) {
2780 k = PetscMax(A->rmap->n, A->cmap->n);
2781 PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2782 }
2783 work = a->mult_work;
2784 for (i = 0; i < mbs; i++) {
2785 n = ii[1] - ii[0];
2786 ii++;
2787 ncols = n * bs;
2788 workt = work;
2789 for (j = 0; j < n; j++) {
2790 xb = x + bs * (*idx++);
2791 for (k = 0; k < bs; k++) workt[k] = xb[k];
2792 workt += bs;
2793 }
2794 if (usecprow) z = zarray + bs * ridx[i];
2795 PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2796 v += n * bs2;
2797 if (!usecprow) z += bs;
2798 }
2799 PetscCall(VecRestoreArrayRead(xx, &x));
2800 PetscCall(VecRestoreArray(zz, &zarray));
2801 PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2802 PetscFunctionReturn(PETSC_SUCCESS);
2803 }
2804
MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)2805 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2806 {
2807 PetscScalar zero = 0.0;
2808
2809 PetscFunctionBegin;
2810 PetscCall(VecSet(zz, zero));
2811 PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2812 PetscFunctionReturn(PETSC_SUCCESS);
2813 }
2814
MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)2815 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2816 {
2817 PetscScalar zero = 0.0;
2818
2819 PetscFunctionBegin;
2820 PetscCall(VecSet(zz, zero));
2821 PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2822 PetscFunctionReturn(PETSC_SUCCESS);
2823 }
2824
MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)2825 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2826 {
2827 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2828 PetscScalar *z, x1, x2, x3, x4, x5;
2829 const PetscScalar *x, *xb = NULL;
2830 const MatScalar *v;
2831 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n;
2832 const PetscInt *idx, *ii, *ib, *ridx = NULL;
2833 Mat_CompressedRow cprow = a->compressedrow;
2834 PetscBool usecprow = cprow.use;
2835
2836 PetscFunctionBegin;
2837 if (yy != zz) PetscCall(VecCopy(yy, zz));
2838 PetscCall(VecGetArrayRead(xx, &x));
2839 PetscCall(VecGetArray(zz, &z));
2840
2841 idx = a->j;
2842 v = a->a;
2843 if (usecprow) {
2844 mbs = cprow.nrows;
2845 ii = cprow.i;
2846 ridx = cprow.rindex;
2847 } else {
2848 mbs = a->mbs;
2849 ii = a->i;
2850 xb = x;
2851 }
2852
2853 switch (bs) {
2854 case 1:
2855 for (i = 0; i < mbs; i++) {
2856 if (usecprow) xb = x + ridx[i];
2857 x1 = xb[0];
2858 ib = idx + ii[0];
2859 n = ii[1] - ii[0];
2860 ii++;
2861 for (j = 0; j < n; j++) {
2862 rval = ib[j];
2863 z[rval] += PetscConj(*v) * x1;
2864 v++;
2865 }
2866 if (!usecprow) xb++;
2867 }
2868 break;
2869 case 2:
2870 for (i = 0; i < mbs; i++) {
2871 if (usecprow) xb = x + 2 * ridx[i];
2872 x1 = xb[0];
2873 x2 = xb[1];
2874 ib = idx + ii[0];
2875 n = ii[1] - ii[0];
2876 ii++;
2877 for (j = 0; j < n; j++) {
2878 rval = ib[j] * 2;
2879 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2880 z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2881 v += 4;
2882 }
2883 if (!usecprow) xb += 2;
2884 }
2885 break;
2886 case 3:
2887 for (i = 0; i < mbs; i++) {
2888 if (usecprow) xb = x + 3 * ridx[i];
2889 x1 = xb[0];
2890 x2 = xb[1];
2891 x3 = xb[2];
2892 ib = idx + ii[0];
2893 n = ii[1] - ii[0];
2894 ii++;
2895 for (j = 0; j < n; j++) {
2896 rval = ib[j] * 3;
2897 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2898 z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2899 z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2900 v += 9;
2901 }
2902 if (!usecprow) xb += 3;
2903 }
2904 break;
2905 case 4:
2906 for (i = 0; i < mbs; i++) {
2907 if (usecprow) xb = x + 4 * ridx[i];
2908 x1 = xb[0];
2909 x2 = xb[1];
2910 x3 = xb[2];
2911 x4 = xb[3];
2912 ib = idx + ii[0];
2913 n = ii[1] - ii[0];
2914 ii++;
2915 for (j = 0; j < n; j++) {
2916 rval = ib[j] * 4;
2917 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2918 z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2919 z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2920 z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2921 v += 16;
2922 }
2923 if (!usecprow) xb += 4;
2924 }
2925 break;
2926 case 5:
2927 for (i = 0; i < mbs; i++) {
2928 if (usecprow) xb = x + 5 * ridx[i];
2929 x1 = xb[0];
2930 x2 = xb[1];
2931 x3 = xb[2];
2932 x4 = xb[3];
2933 x5 = xb[4];
2934 ib = idx + ii[0];
2935 n = ii[1] - ii[0];
2936 ii++;
2937 for (j = 0; j < n; j++) {
2938 rval = ib[j] * 5;
2939 z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2940 z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2941 z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2942 z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2943 z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2944 v += 25;
2945 }
2946 if (!usecprow) xb += 5;
2947 }
2948 break;
2949 default: /* block sizes larger than 5 by 5 are handled by BLAS */
2950 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2951 #if 0
2952 {
2953 PetscInt ncols,k,bs2=a->bs2;
2954 PetscScalar *work,*workt,zb;
2955 const PetscScalar *xtmp;
2956 if (!a->mult_work) {
2957 k = PetscMax(A->rmap->n,A->cmap->n);
2958 PetscCall(PetscMalloc1(k+1,&a->mult_work));
2959 }
2960 work = a->mult_work;
2961 xtmp = x;
2962 for (i=0; i<mbs; i++) {
2963 n = ii[1] - ii[0]; ii++;
2964 ncols = n*bs;
2965 PetscCall(PetscArrayzero(work,ncols));
2966 if (usecprow) xtmp = x + bs*ridx[i];
2967 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2968 v += n*bs2;
2969 if (!usecprow) xtmp += bs;
2970 workt = work;
2971 for (j=0; j<n; j++) {
2972 zb = z + bs*(*idx++);
2973 for (k=0; k<bs; k++) zb[k] += workt[k] ;
2974 workt += bs;
2975 }
2976 }
2977 }
2978 #endif
2979 }
2980 PetscCall(VecRestoreArrayRead(xx, &x));
2981 PetscCall(VecRestoreArray(zz, &z));
2982 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2983 PetscFunctionReturn(PETSC_SUCCESS);
2984 }
2985
MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)2986 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2987 {
2988 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2989 PetscScalar *zb, *z, x1, x2, x3, x4, x5;
2990 const PetscScalar *x, *xb = NULL;
2991 const MatScalar *v;
2992 PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2993 const PetscInt *idx, *ii, *ib, *ridx = NULL;
2994 Mat_CompressedRow cprow = a->compressedrow;
2995 PetscBool usecprow = cprow.use;
2996
2997 PetscFunctionBegin;
2998 if (yy != zz) PetscCall(VecCopy(yy, zz));
2999 PetscCall(VecGetArrayRead(xx, &x));
3000 PetscCall(VecGetArray(zz, &z));
3001
3002 idx = a->j;
3003 v = a->a;
3004 if (usecprow) {
3005 mbs = cprow.nrows;
3006 ii = cprow.i;
3007 ridx = cprow.rindex;
3008 } else {
3009 mbs = a->mbs;
3010 ii = a->i;
3011 xb = x;
3012 }
3013
3014 switch (bs) {
3015 case 1:
3016 for (i = 0; i < mbs; i++) {
3017 if (usecprow) xb = x + ridx[i];
3018 x1 = xb[0];
3019 ib = idx + ii[0];
3020 n = ii[1] - ii[0];
3021 ii++;
3022 for (j = 0; j < n; j++) {
3023 rval = ib[j];
3024 z[rval] += *v * x1;
3025 v++;
3026 }
3027 if (!usecprow) xb++;
3028 }
3029 break;
3030 case 2:
3031 for (i = 0; i < mbs; i++) {
3032 if (usecprow) xb = x + 2 * ridx[i];
3033 x1 = xb[0];
3034 x2 = xb[1];
3035 ib = idx + ii[0];
3036 n = ii[1] - ii[0];
3037 ii++;
3038 for (j = 0; j < n; j++) {
3039 rval = ib[j] * 2;
3040 z[rval++] += v[0] * x1 + v[1] * x2;
3041 z[rval++] += v[2] * x1 + v[3] * x2;
3042 v += 4;
3043 }
3044 if (!usecprow) xb += 2;
3045 }
3046 break;
3047 case 3:
3048 for (i = 0; i < mbs; i++) {
3049 if (usecprow) xb = x + 3 * ridx[i];
3050 x1 = xb[0];
3051 x2 = xb[1];
3052 x3 = xb[2];
3053 ib = idx + ii[0];
3054 n = ii[1] - ii[0];
3055 ii++;
3056 for (j = 0; j < n; j++) {
3057 rval = ib[j] * 3;
3058 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3059 z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3060 z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3061 v += 9;
3062 }
3063 if (!usecprow) xb += 3;
3064 }
3065 break;
3066 case 4:
3067 for (i = 0; i < mbs; i++) {
3068 if (usecprow) xb = x + 4 * ridx[i];
3069 x1 = xb[0];
3070 x2 = xb[1];
3071 x3 = xb[2];
3072 x4 = xb[3];
3073 ib = idx + ii[0];
3074 n = ii[1] - ii[0];
3075 ii++;
3076 for (j = 0; j < n; j++) {
3077 rval = ib[j] * 4;
3078 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3079 z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3080 z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3081 z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3082 v += 16;
3083 }
3084 if (!usecprow) xb += 4;
3085 }
3086 break;
3087 case 5:
3088 for (i = 0; i < mbs; i++) {
3089 if (usecprow) xb = x + 5 * ridx[i];
3090 x1 = xb[0];
3091 x2 = xb[1];
3092 x3 = xb[2];
3093 x4 = xb[3];
3094 x5 = xb[4];
3095 ib = idx + ii[0];
3096 n = ii[1] - ii[0];
3097 ii++;
3098 for (j = 0; j < n; j++) {
3099 rval = ib[j] * 5;
3100 z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3101 z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3102 z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3103 z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3104 z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3105 v += 25;
3106 }
3107 if (!usecprow) xb += 5;
3108 }
3109 break;
3110 default: { /* block sizes larger than 5 by 5 are handled by BLAS */
3111 PetscInt ncols, k;
3112 PetscScalar *work, *workt;
3113 const PetscScalar *xtmp;
3114 if (!a->mult_work) {
3115 k = PetscMax(A->rmap->n, A->cmap->n);
3116 PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3117 }
3118 work = a->mult_work;
3119 xtmp = x;
3120 for (i = 0; i < mbs; i++) {
3121 n = ii[1] - ii[0];
3122 ii++;
3123 ncols = n * bs;
3124 PetscCall(PetscArrayzero(work, ncols));
3125 if (usecprow) xtmp = x + bs * ridx[i];
3126 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3127 v += n * bs2;
3128 if (!usecprow) xtmp += bs;
3129 workt = work;
3130 for (j = 0; j < n; j++) {
3131 zb = z + bs * (*idx++);
3132 for (k = 0; k < bs; k++) zb[k] += workt[k];
3133 workt += bs;
3134 }
3135 }
3136 }
3137 }
3138 PetscCall(VecRestoreArrayRead(xx, &x));
3139 PetscCall(VecRestoreArray(zz, &z));
3140 PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3141 PetscFunctionReturn(PETSC_SUCCESS);
3142 }
3143
MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)3144 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3145 {
3146 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3147 PetscInt totalnz = a->bs2 * a->nz;
3148 PetscScalar oalpha = alpha;
3149 PetscBLASInt one = 1, tnz;
3150
3151 PetscFunctionBegin;
3152 PetscCall(PetscBLASIntCast(totalnz, &tnz));
3153 PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3154 PetscCall(PetscLogFlops(totalnz));
3155 PetscFunctionReturn(PETSC_SUCCESS);
3156 }
3157
MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal * norm)3158 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3159 {
3160 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3161 MatScalar *v = a->a;
3162 PetscReal sum = 0.0;
3163 PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3164
3165 PetscFunctionBegin;
3166 if (type == NORM_FROBENIUS) {
3167 #if defined(PETSC_USE_REAL___FP16)
3168 PetscBLASInt one = 1, cnt = bs2 * nz;
3169 PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3170 #else
3171 for (i = 0; i < bs2 * nz; i++) {
3172 sum += PetscRealPart(PetscConj(*v) * (*v));
3173 v++;
3174 }
3175 #endif
3176 *norm = PetscSqrtReal(sum);
3177 PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3178 } else if (type == NORM_1) { /* maximum column sum */
3179 PetscReal *tmp;
3180 PetscInt *bcol = a->j;
3181 PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3182 for (i = 0; i < nz; i++) {
3183 for (j = 0; j < bs; j++) {
3184 k1 = bs * (*bcol) + j; /* column index */
3185 for (k = 0; k < bs; k++) {
3186 tmp[k1] += PetscAbsScalar(*v);
3187 v++;
3188 }
3189 }
3190 bcol++;
3191 }
3192 *norm = 0.0;
3193 for (j = 0; j < A->cmap->n; j++) {
3194 if (tmp[j] > *norm) *norm = tmp[j];
3195 }
3196 PetscCall(PetscFree(tmp));
3197 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3198 } else if (type == NORM_INFINITY) { /* maximum row sum */
3199 *norm = 0.0;
3200 for (k = 0; k < bs; k++) {
3201 for (j = 0; j < a->mbs; j++) {
3202 v = a->a + bs2 * a->i[j] + k;
3203 sum = 0.0;
3204 for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3205 for (k1 = 0; k1 < bs; k1++) {
3206 sum += PetscAbsScalar(*v);
3207 v += bs;
3208 }
3209 }
3210 if (sum > *norm) *norm = sum;
3211 }
3212 }
3213 PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3214 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3215 PetscFunctionReturn(PETSC_SUCCESS);
3216 }
3217
MatGetDiagonal_SeqBAIJ(Mat A,Vec v)3218 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3219 {
3220 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3221 PetscInt n;
3222 const PetscInt bs = A->rmap->bs, ambs = a->mbs, bs2 = a->bs2;
3223 PetscScalar *x;
3224 const MatScalar *aa = a->a, *aa_j;
3225 const PetscInt *ai = a->i, *adiag;
3226 PetscBool diagDense;
3227
3228 PetscFunctionBegin;
3229 PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3230
3231 PetscCall(VecGetLocalSize(v, &n));
3232 PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3233 PetscCall(VecGetArrayWrite(v, &x));
3234 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense));
3235 if (diagDense) {
3236 for (PetscInt i = 0, row = 0; i < ambs; i++) {
3237 aa_j = aa + adiag[i] * bs2;
3238 for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
3239 }
3240 } else {
3241 for (PetscInt i = 0, row = 0; i < ambs; i++) {
3242 const PetscInt j = adiag[i];
3243
3244 if (j != ai[i + 1]) {
3245 aa_j = aa + j * bs2;
3246 for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
3247 } else {
3248 for (PetscInt k = 0; k < bs; k++) x[row++] = 0.0;
3249 }
3250 }
3251 }
3252 PetscCall(VecRestoreArrayWrite(v, &x));
3253 PetscFunctionReturn(PETSC_SUCCESS);
3254 }
3255
MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)3256 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3257 {
3258 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3259 const PetscScalar *l, *r, *li, *ri;
3260 PetscScalar x;
3261 MatScalar *aa, *v;
3262 PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3263 const PetscInt *ai, *aj;
3264
3265 PetscFunctionBegin;
3266 ai = a->i;
3267 aj = a->j;
3268 aa = a->a;
3269 m = A->rmap->n;
3270 n = A->cmap->n;
3271 bs = A->rmap->bs;
3272 mbs = a->mbs;
3273 bs2 = a->bs2;
3274 if (ll) {
3275 PetscCall(VecGetArrayRead(ll, &l));
3276 PetscCall(VecGetLocalSize(ll, &lm));
3277 PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3278 for (i = 0; i < mbs; i++) { /* for each block row */
3279 M = ai[i + 1] - ai[i];
3280 li = l + i * bs;
3281 v = PetscSafePointerPlusOffset(aa, bs2 * ai[i]);
3282 for (j = 0; j < M; j++) { /* for each block */
3283 for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3284 }
3285 }
3286 PetscCall(VecRestoreArrayRead(ll, &l));
3287 PetscCall(PetscLogFlops(a->nz));
3288 }
3289
3290 if (rr) {
3291 PetscCall(VecGetArrayRead(rr, &r));
3292 PetscCall(VecGetLocalSize(rr, &rn));
3293 PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3294 for (i = 0; i < mbs; i++) { /* for each block row */
3295 iai = ai[i];
3296 M = ai[i + 1] - iai;
3297 v = PetscSafePointerPlusOffset(aa, bs2 * iai);
3298 for (j = 0; j < M; j++) { /* for each block */
3299 ri = r + bs * aj[iai + j];
3300 for (k = 0; k < bs; k++) {
3301 x = ri[k];
3302 for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3303 v += bs;
3304 }
3305 }
3306 }
3307 PetscCall(VecRestoreArrayRead(rr, &r));
3308 PetscCall(PetscLogFlops(a->nz));
3309 }
3310 PetscFunctionReturn(PETSC_SUCCESS);
3311 }
3312
MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo * info)3313 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3314 {
3315 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3316
3317 PetscFunctionBegin;
3318 info->block_size = a->bs2;
3319 info->nz_allocated = a->bs2 * a->maxnz;
3320 info->nz_used = a->bs2 * a->nz;
3321 info->nz_unneeded = info->nz_allocated - info->nz_used;
3322 info->assemblies = A->num_ass;
3323 info->mallocs = A->info.mallocs;
3324 info->memory = 0; /* REVIEW ME */
3325 if (A->factortype) {
3326 info->fill_ratio_given = A->info.fill_ratio_given;
3327 info->fill_ratio_needed = A->info.fill_ratio_needed;
3328 info->factor_mallocs = A->info.factor_mallocs;
3329 } else {
3330 info->fill_ratio_given = 0;
3331 info->fill_ratio_needed = 0;
3332 info->factor_mallocs = 0;
3333 }
3334 PetscFunctionReturn(PETSC_SUCCESS);
3335 }
3336
MatZeroEntries_SeqBAIJ(Mat A)3337 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3338 {
3339 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3340
3341 PetscFunctionBegin;
3342 PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3343 PetscFunctionReturn(PETSC_SUCCESS);
3344 }
3345
MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)3346 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3347 {
3348 PetscFunctionBegin;
3349 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3350 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3351 PetscFunctionReturn(PETSC_SUCCESS);
3352 }
3353
MatTransposeMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)3354 PetscErrorCode MatTransposeMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3355 {
3356 PetscFunctionBegin;
3357 PetscCall(MatTransposeMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3358 C->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqBAIJ_SeqDense;
3359 PetscFunctionReturn(PETSC_SUCCESS);
3360 }
3361
MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3362 static PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3363 {
3364 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3365 PetscScalar *z = NULL, sum1;
3366 const PetscScalar *xb;
3367 PetscScalar x1;
3368 const MatScalar *v, *vv;
3369 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3370 PetscBool usecprow = a->compressedrow.use;
3371
3372 PetscFunctionBegin;
3373 idx = a->j;
3374 v = a->a;
3375 if (usecprow) {
3376 mbs = a->compressedrow.nrows;
3377 ii = a->compressedrow.i;
3378 ridx = a->compressedrow.rindex;
3379 } else {
3380 mbs = a->mbs;
3381 ii = a->i;
3382 z = c;
3383 }
3384
3385 for (i = 0; i < mbs; i++) {
3386 n = ii[1] - ii[0];
3387 ii++;
3388 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3389 PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3390 if (usecprow) z = c + ridx[i];
3391 jj = idx;
3392 vv = v;
3393 for (k = 0; k < cn; k++) {
3394 idx = jj;
3395 v = vv;
3396 sum1 = 0.0;
3397 for (j = 0; j < n; j++) {
3398 xb = b + (*idx++);
3399 x1 = xb[0 + k * bm];
3400 sum1 += v[0] * x1;
3401 v += 1;
3402 }
3403 z[0 + k * cm] = sum1;
3404 }
3405 if (!usecprow) z += 1;
3406 }
3407 PetscFunctionReturn(PETSC_SUCCESS);
3408 }
3409
MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3410 static PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3411 {
3412 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3413 PetscScalar *z = NULL, sum1, sum2;
3414 const PetscScalar *xb;
3415 PetscScalar x1, x2;
3416 const MatScalar *v, *vv;
3417 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3418 PetscBool usecprow = a->compressedrow.use;
3419
3420 PetscFunctionBegin;
3421 idx = a->j;
3422 v = a->a;
3423 if (usecprow) {
3424 mbs = a->compressedrow.nrows;
3425 ii = a->compressedrow.i;
3426 ridx = a->compressedrow.rindex;
3427 } else {
3428 mbs = a->mbs;
3429 ii = a->i;
3430 z = c;
3431 }
3432
3433 for (i = 0; i < mbs; i++) {
3434 n = ii[1] - ii[0];
3435 ii++;
3436 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3437 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3438 if (usecprow) z = c + 2 * ridx[i];
3439 jj = idx;
3440 vv = v;
3441 for (k = 0; k < cn; k++) {
3442 idx = jj;
3443 v = vv;
3444 sum1 = 0.0;
3445 sum2 = 0.0;
3446 for (j = 0; j < n; j++) {
3447 xb = b + 2 * (*idx++);
3448 x1 = xb[0 + k * bm];
3449 x2 = xb[1 + k * bm];
3450 sum1 += v[0] * x1 + v[2] * x2;
3451 sum2 += v[1] * x1 + v[3] * x2;
3452 v += 4;
3453 }
3454 z[0 + k * cm] = sum1;
3455 z[1 + k * cm] = sum2;
3456 }
3457 if (!usecprow) z += 2;
3458 }
3459 PetscFunctionReturn(PETSC_SUCCESS);
3460 }
3461
MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3462 static PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3463 {
3464 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3465 PetscScalar *z = NULL, sum1, sum2, sum3;
3466 const PetscScalar *xb;
3467 PetscScalar x1, x2, x3;
3468 const MatScalar *v, *vv;
3469 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3470 PetscBool usecprow = a->compressedrow.use;
3471
3472 PetscFunctionBegin;
3473 idx = a->j;
3474 v = a->a;
3475 if (usecprow) {
3476 mbs = a->compressedrow.nrows;
3477 ii = a->compressedrow.i;
3478 ridx = a->compressedrow.rindex;
3479 } else {
3480 mbs = a->mbs;
3481 ii = a->i;
3482 z = c;
3483 }
3484
3485 for (i = 0; i < mbs; i++) {
3486 n = ii[1] - ii[0];
3487 ii++;
3488 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3489 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3490 if (usecprow) z = c + 3 * ridx[i];
3491 jj = idx;
3492 vv = v;
3493 for (k = 0; k < cn; k++) {
3494 idx = jj;
3495 v = vv;
3496 sum1 = 0.0;
3497 sum2 = 0.0;
3498 sum3 = 0.0;
3499 for (j = 0; j < n; j++) {
3500 xb = b + 3 * (*idx++);
3501 x1 = xb[0 + k * bm];
3502 x2 = xb[1 + k * bm];
3503 x3 = xb[2 + k * bm];
3504 sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3505 sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3506 sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3507 v += 9;
3508 }
3509 z[0 + k * cm] = sum1;
3510 z[1 + k * cm] = sum2;
3511 z[2 + k * cm] = sum3;
3512 }
3513 if (!usecprow) z += 3;
3514 }
3515 PetscFunctionReturn(PETSC_SUCCESS);
3516 }
3517
MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3518 static PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3519 {
3520 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3521 PetscScalar *z = NULL, sum1, sum2, sum3, sum4;
3522 const PetscScalar *xb;
3523 PetscScalar x1, x2, x3, x4;
3524 const MatScalar *v, *vv;
3525 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3526 PetscBool usecprow = a->compressedrow.use;
3527
3528 PetscFunctionBegin;
3529 idx = a->j;
3530 v = a->a;
3531 if (usecprow) {
3532 mbs = a->compressedrow.nrows;
3533 ii = a->compressedrow.i;
3534 ridx = a->compressedrow.rindex;
3535 } else {
3536 mbs = a->mbs;
3537 ii = a->i;
3538 z = c;
3539 }
3540
3541 for (i = 0; i < mbs; i++) {
3542 n = ii[1] - ii[0];
3543 ii++;
3544 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3545 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3546 if (usecprow) z = c + 4 * ridx[i];
3547 jj = idx;
3548 vv = v;
3549 for (k = 0; k < cn; k++) {
3550 idx = jj;
3551 v = vv;
3552 sum1 = 0.0;
3553 sum2 = 0.0;
3554 sum3 = 0.0;
3555 sum4 = 0.0;
3556 for (j = 0; j < n; j++) {
3557 xb = b + 4 * (*idx++);
3558 x1 = xb[0 + k * bm];
3559 x2 = xb[1 + k * bm];
3560 x3 = xb[2 + k * bm];
3561 x4 = xb[3 + k * bm];
3562 sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3563 sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3564 sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3565 sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3566 v += 16;
3567 }
3568 z[0 + k * cm] = sum1;
3569 z[1 + k * cm] = sum2;
3570 z[2 + k * cm] = sum3;
3571 z[3 + k * cm] = sum4;
3572 }
3573 if (!usecprow) z += 4;
3574 }
3575 PetscFunctionReturn(PETSC_SUCCESS);
3576 }
3577
MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3578 static PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3579 {
3580 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3581 PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5;
3582 const PetscScalar *xb;
3583 PetscScalar x1, x2, x3, x4, x5;
3584 const MatScalar *v, *vv;
3585 PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3586 PetscBool usecprow = a->compressedrow.use;
3587
3588 PetscFunctionBegin;
3589 idx = a->j;
3590 v = a->a;
3591 if (usecprow) {
3592 mbs = a->compressedrow.nrows;
3593 ii = a->compressedrow.i;
3594 ridx = a->compressedrow.rindex;
3595 } else {
3596 mbs = a->mbs;
3597 ii = a->i;
3598 z = c;
3599 }
3600
3601 for (i = 0; i < mbs; i++) {
3602 n = ii[1] - ii[0];
3603 ii++;
3604 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3605 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3606 if (usecprow) z = c + 5 * ridx[i];
3607 jj = idx;
3608 vv = v;
3609 for (k = 0; k < cn; k++) {
3610 idx = jj;
3611 v = vv;
3612 sum1 = 0.0;
3613 sum2 = 0.0;
3614 sum3 = 0.0;
3615 sum4 = 0.0;
3616 sum5 = 0.0;
3617 for (j = 0; j < n; j++) {
3618 xb = b + 5 * (*idx++);
3619 x1 = xb[0 + k * bm];
3620 x2 = xb[1 + k * bm];
3621 x3 = xb[2 + k * bm];
3622 x4 = xb[3 + k * bm];
3623 x5 = xb[4 + k * bm];
3624 sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3625 sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3626 sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3627 sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3628 sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3629 v += 25;
3630 }
3631 z[0 + k * cm] = sum1;
3632 z[1 + k * cm] = sum2;
3633 z[2 + k * cm] = sum3;
3634 z[3 + k * cm] = sum4;
3635 z[4 + k * cm] = sum5;
3636 }
3637 if (!usecprow) z += 5;
3638 }
3639 PetscFunctionReturn(PETSC_SUCCESS);
3640 }
3641
MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)3642 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3643 {
3644 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3645 Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
3646 Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
3647 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3648 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3649 PetscBLASInt bbs, bcn, bbm, bcm;
3650 PetscScalar *z = NULL;
3651 PetscScalar *c, *b;
3652 const MatScalar *v;
3653 const PetscInt *idx, *ii, *ridx = NULL;
3654 PetscScalar _DZero = 0.0, _DOne = 1.0;
3655 PetscBool usecprow = a->compressedrow.use;
3656
3657 PetscFunctionBegin;
3658 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3659 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3660 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3661 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3662 b = bd->v;
3663 if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3664 PetscCall(MatDenseGetArrayWrite(C, &c));
3665 switch (bs) {
3666 case 1:
3667 PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3668 break;
3669 case 2:
3670 PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3671 break;
3672 case 3:
3673 PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3674 break;
3675 case 4:
3676 PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3677 break;
3678 case 5:
3679 PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3680 break;
3681 default: /* block sizes larger than 5 by 5 are handled by BLAS */
3682 PetscCall(PetscBLASIntCast(bs, &bbs));
3683 PetscCall(PetscBLASIntCast(cn, &bcn));
3684 PetscCall(PetscBLASIntCast(bm, &bbm));
3685 PetscCall(PetscBLASIntCast(cm, &bcm));
3686 idx = a->j;
3687 v = a->a;
3688 if (usecprow) {
3689 mbs = a->compressedrow.nrows;
3690 ii = a->compressedrow.i;
3691 ridx = a->compressedrow.rindex;
3692 } else {
3693 mbs = a->mbs;
3694 ii = a->i;
3695 z = c;
3696 }
3697 for (i = 0; i < mbs; i++) {
3698 n = ii[1] - ii[0];
3699 ii++;
3700 if (usecprow) z = c + bs * ridx[i];
3701 if (n) {
3702 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3703 v += bs2;
3704 }
3705 for (j = 1; j < n; j++) {
3706 PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3707 v += bs2;
3708 }
3709 if (!usecprow) z += bs;
3710 }
3711 }
3712 PetscCall(MatDenseRestoreArrayWrite(C, &c));
3713 PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3714 PetscFunctionReturn(PETSC_SUCCESS);
3715 }
3716
MatTransposeMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3717 static PetscErrorCode MatTransposeMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3718 {
3719 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3720 const MatScalar *v;
3721 PetscInt mbs, i, *idx, *ii, j, n, k, *ridx = NULL;
3722 PetscBool usecprow = a->compressedrow.use;
3723 const PetscScalar *bi;
3724
3725 PetscFunctionBegin;
3726 idx = a->j;
3727 v = a->a;
3728 if (usecprow) {
3729 mbs = a->compressedrow.nrows;
3730 ii = a->compressedrow.i;
3731 ridx = a->compressedrow.rindex;
3732 } else {
3733 mbs = a->mbs;
3734 ii = a->i;
3735 }
3736
3737 for (i = 0; i < mbs; i++) {
3738 PetscInt brow = usecprow ? ridx[i] : i;
3739
3740 n = ii[1] - ii[0];
3741 ii++;
3742 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3743 PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3744 for (j = 0, bi = b + 1 * brow; j < n; j++) {
3745 PetscScalar *zcol = c + 1 * (*idx++);
3746
3747 for (k = 0; k < cn; k++) zcol[0 + k * cm] += v[0] * bi[k * bm];
3748 ++v;
3749 }
3750 }
3751 PetscFunctionReturn(PETSC_SUCCESS);
3752 }
3753
MatTransposeMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3754 static PetscErrorCode MatTransposeMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3755 {
3756 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3757 const MatScalar *v;
3758 PetscInt mbs, i, *idx, *ii, j, n, k, *ridx = NULL;
3759 PetscBool usecprow = a->compressedrow.use;
3760 const PetscScalar *bi;
3761 PetscScalar x1, x2;
3762
3763 PetscFunctionBegin;
3764 idx = a->j;
3765 v = a->a;
3766 if (usecprow) {
3767 mbs = a->compressedrow.nrows;
3768 ii = a->compressedrow.i;
3769 ridx = a->compressedrow.rindex;
3770 } else {
3771 mbs = a->mbs;
3772 ii = a->i;
3773 }
3774
3775 for (i = 0; i < mbs; i++) {
3776 PetscInt brow = usecprow ? ridx[i] : i;
3777
3778 n = ii[1] - ii[0];
3779 ii++;
3780 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3781 PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3782 for (j = 0, bi = b + 2 * brow; j < n; j++) {
3783 PetscScalar *zcol = c + 2 * (*idx++);
3784
3785 for (k = 0; k < cn; k++) {
3786 x1 = bi[0 + k * bm];
3787 x2 = bi[1 + k * bm];
3788 zcol[0 + k * cm] += v[0] * x1 + v[1] * x2;
3789 zcol[1 + k * cm] += v[2] * x1 + v[3] * x2;
3790 }
3791 v += 4;
3792 }
3793 }
3794 PetscFunctionReturn(PETSC_SUCCESS);
3795 }
3796
MatTransposeMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3797 static PetscErrorCode MatTransposeMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3798 {
3799 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3800 const MatScalar *v;
3801 PetscInt mbs, i, *idx, *ii, j, n, k, *ridx = NULL;
3802 PetscBool usecprow = a->compressedrow.use;
3803 const PetscScalar *bi;
3804 PetscScalar x1, x2, x3;
3805
3806 PetscFunctionBegin;
3807 idx = a->j;
3808 v = a->a;
3809 if (usecprow) {
3810 mbs = a->compressedrow.nrows;
3811 ii = a->compressedrow.i;
3812 ridx = a->compressedrow.rindex;
3813 } else {
3814 mbs = a->mbs;
3815 ii = a->i;
3816 }
3817
3818 for (i = 0; i < mbs; i++) {
3819 PetscInt brow = usecprow ? ridx[i] : i;
3820
3821 n = ii[1] - ii[0];
3822 ii++;
3823 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3824 PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3825 for (j = 0, bi = b + 3 * brow; j < n; j++) {
3826 PetscScalar *zcol = c + 3 * (*idx++);
3827
3828 for (k = 0; k < cn; k++) {
3829 x1 = bi[0 + k * bm];
3830 x2 = bi[1 + k * bm];
3831 x3 = bi[2 + k * bm];
3832 zcol[0 + k * cm] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3833 zcol[1 + k * cm] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3834 zcol[2 + k * cm] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3835 }
3836 v += 9;
3837 }
3838 }
3839 PetscFunctionReturn(PETSC_SUCCESS);
3840 }
3841
MatTransposeMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3842 static PetscErrorCode MatTransposeMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3843 {
3844 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3845 const MatScalar *v;
3846 PetscInt mbs, i, *idx, *ii, j, n, k, *ridx = NULL;
3847 PetscBool usecprow = a->compressedrow.use;
3848 const PetscScalar *bi;
3849 PetscScalar x1, x2, x3, x4;
3850
3851 PetscFunctionBegin;
3852 idx = a->j;
3853 v = a->a;
3854 if (usecprow) {
3855 mbs = a->compressedrow.nrows;
3856 ii = a->compressedrow.i;
3857 ridx = a->compressedrow.rindex;
3858 } else {
3859 mbs = a->mbs;
3860 ii = a->i;
3861 }
3862
3863 for (i = 0; i < mbs; i++) {
3864 PetscInt brow = usecprow ? ridx[i] : i;
3865
3866 n = ii[1] - ii[0];
3867 ii++;
3868 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3869 PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3870 for (j = 0, bi = b + 4 * brow; j < n; j++) {
3871 PetscScalar *zcol = c + 4 * (*idx++);
3872
3873 for (k = 0; k < cn; k++) {
3874 x1 = bi[0 + k * bm];
3875 x2 = bi[1 + k * bm];
3876 x3 = bi[2 + k * bm];
3877 x4 = bi[3 + k * bm];
3878 zcol[0 + k * cm] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3879 zcol[1 + k * cm] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3880 zcol[2 + k * cm] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3881 zcol[3 + k * cm] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3882 }
3883 v += 16;
3884 }
3885 }
3886 PetscFunctionReturn(PETSC_SUCCESS);
3887 }
3888
MatTransposeMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3889 static PetscErrorCode MatTransposeMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3890 {
3891 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3892 const MatScalar *v;
3893 PetscInt mbs, i, *idx, *ii, j, n, k, *ridx = NULL;
3894 PetscBool usecprow = a->compressedrow.use;
3895 const PetscScalar *bi;
3896 PetscScalar x1, x2, x3, x4, x5;
3897
3898 PetscFunctionBegin;
3899 idx = a->j;
3900 v = a->a;
3901 if (usecprow) {
3902 mbs = a->compressedrow.nrows;
3903 ii = a->compressedrow.i;
3904 ridx = a->compressedrow.rindex;
3905 } else {
3906 mbs = a->mbs;
3907 ii = a->i;
3908 }
3909
3910 for (i = 0; i < mbs; i++) {
3911 PetscInt brow = usecprow ? ridx[i] : i;
3912
3913 n = ii[1] - ii[0];
3914 ii++;
3915 PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3916 PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3917 for (j = 0, bi = b + 5 * brow; j < n; j++) {
3918 PetscScalar *zcol = c + 5 * (*idx++);
3919
3920 for (k = 0; k < cn; k++) {
3921 x1 = bi[0 + k * bm];
3922 x2 = bi[1 + k * bm];
3923 x3 = bi[2 + k * bm];
3924 x4 = bi[3 + k * bm];
3925 x5 = bi[4 + k * bm];
3926 zcol[0 + k * cm] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3927 zcol[1 + k * cm] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3928 zcol[2 + k * cm] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3929 zcol[3 + k * cm] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3930 zcol[4 + k * cm] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3931 }
3932 v += 25;
3933 }
3934 }
3935 PetscFunctionReturn(PETSC_SUCCESS);
3936 }
3937
MatTransposeMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)3938 PetscErrorCode MatTransposeMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3939 {
3940 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3941 Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
3942 Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
3943 PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3944 PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3945 PetscBLASInt bbs, bcn, bbm, bcm;
3946 PetscScalar *c, *b;
3947 const MatScalar *v;
3948 const PetscInt *idx, *ii, *ridx = NULL;
3949 PetscScalar _DOne = 1.0;
3950 PetscBool usecprow = a->compressedrow.use;
3951
3952 PetscFunctionBegin;
3953 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3954 PetscCheck(B->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->rmap->n, B->rmap->n);
3955 PetscCheck(A->cmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal columns in A %" PetscInt_FMT, C->rmap->n, A->cmap->n);
3956 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3957 b = bd->v;
3958 PetscCall(MatZeroEntries(C));
3959 PetscCall(MatDenseGetArrayWrite(C, &c));
3960 switch (bs) {
3961 case 1:
3962 PetscCall(MatTransposeMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3963 break;
3964 case 2:
3965 PetscCall(MatTransposeMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3966 break;
3967 case 3:
3968 PetscCall(MatTransposeMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3969 break;
3970 case 4:
3971 PetscCall(MatTransposeMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3972 break;
3973 case 5:
3974 PetscCall(MatTransposeMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3975 break;
3976 default: /* block sizes larger than 5 by 5 are handled by BLAS */
3977 PetscCall(PetscBLASIntCast(bs, &bbs));
3978 PetscCall(PetscBLASIntCast(cn, &bcn));
3979 PetscCall(PetscBLASIntCast(bm, &bbm));
3980 PetscCall(PetscBLASIntCast(cm, &bcm));
3981 idx = a->j;
3982 v = a->a;
3983 if (usecprow) {
3984 mbs = a->compressedrow.nrows;
3985 ii = a->compressedrow.i;
3986 ridx = a->compressedrow.rindex;
3987 } else {
3988 mbs = a->mbs;
3989 ii = a->i;
3990 }
3991 for (i = 0; i < mbs; i++) {
3992 const PetscScalar *bi = b + bs * (usecprow ? ridx[i] : i);
3993
3994 n = ii[1] - ii[0];
3995 ii++;
3996 for (j = 0; j < n; j++) {
3997 PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, bi, &bbm, &_DOne, c + bs * (*idx++), &bcm));
3998 v += bs2;
3999 }
4000 }
4001 }
4002 PetscCall(MatDenseRestoreArrayWrite(C, &c));
4003 PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
4004 PetscFunctionReturn(PETSC_SUCCESS);
4005 }
4006