#include <../src/mat/impls/baij/seq/baij.h> #include <../src/mat/impls/dense/seq/dense.h> #include #include #include #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) #include #elif defined(PETSC_HAVE_XMMINTRIN_H) #include #endif PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival; const PetscInt *idx; PetscInt start, end, *ai, *aj, bs; PetscBT table; PetscFunctionBegin; m = a->mbs; ai = a->i; aj = a->j; bs = A->rmap->bs; PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); PetscCall(PetscBTCreate(m, &table)); PetscCall(PetscMalloc1(m + 1, &nidx)); for (i = 0; i < is_max; i++) { /* Initialise the two local arrays */ isz = 0; PetscCall(PetscBTMemzero(m, table)); /* Extract the indices, assume there can be duplicate entries */ PetscCall(ISGetIndices(is[i], &idx)); PetscCall(ISGetLocalSize(is[i], &n)); /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ for (j = 0; j < n; ++j) { ival = idx[j] / bs; /* convert the indices into block indices */ PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim"); if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival; } PetscCall(ISRestoreIndices(is[i], &idx)); PetscCall(ISDestroy(&is[i])); k = 0; for (j = 0; j < ov; j++) { /* for each overlap*/ n = isz; for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */ row = nidx[k]; start = ai[row]; end = ai[row + 1]; for (l = start; l < end; l++) { val = aj[l]; if (!PetscBTLookupSet(table, val)) nidx[isz++] = val; } } } PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i)); } PetscCall(PetscBTDestroy(&table)); PetscCall(PetscFree(nidx)); PetscFunctionReturn(PETSC_SUCCESS); } static PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c; PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens; PetscInt row, mat_i, *mat_j, tcol, *mat_ilen; const PetscInt *irow, *icol; PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2; PetscInt *aj = a->j, *ai = a->i; MatScalar *mat_a; Mat C; PetscBool flag; PetscFunctionBegin; PetscCall(ISGetIndices(isrow, &irow)); PetscCall(ISGetIndices(iscol, &icol)); PetscCall(ISGetLocalSize(isrow, &nrows)); PetscCall(ISGetLocalSize(iscol, &ncols)); PetscCall(PetscCalloc1(1 + oldcols, &smap)); ssmap = smap; PetscCall(PetscMalloc1(1 + nrows, &lens)); for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1; /* determine lens of each row */ for (i = 0; i < nrows; i++) { kstart = ai[irow[i]]; kend = kstart + a->ilen[irow[i]]; lens[i] = 0; for (k = kstart; k < kend; k++) { if (ssmap[aj[k]]) lens[i]++; } } /* Create and fill new matrix */ if (scall == MAT_REUSE_MATRIX) { c = (Mat_SeqBAIJ *)((*B)->data); PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size"); PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag)); PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros"); PetscCall(PetscArrayzero(c->ilen, c->mbs)); C = *B; } else { PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C)); PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE)); PetscCall(MatSetType(C, ((PetscObject)A)->type_name)); PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens)); } c = (Mat_SeqBAIJ *)C->data; for (i = 0; i < nrows; i++) { row = irow[i]; kstart = ai[row]; kend = kstart + a->ilen[row]; mat_i = c->i[i]; mat_j = PetscSafePointerPlusOffset(c->j, mat_i); mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2); mat_ilen = c->ilen + i; for (k = kstart; k < kend; k++) { if ((tcol = ssmap[a->j[k]])) { *mat_j++ = tcol - 1; PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2)); mat_a += bs2; (*mat_ilen)++; } } } /* sort */ if (c->j && c->a) { MatScalar *work; PetscCall(PetscMalloc1(bs2, &work)); for (i = 0; i < nrows; i++) { PetscInt ilen; mat_i = c->i[i]; mat_j = c->j + mat_i; mat_a = c->a + mat_i * bs2; ilen = c->ilen[i]; PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work)); } PetscCall(PetscFree(work)); } /* Free work space */ PetscCall(ISRestoreIndices(iscol, &icol)); PetscCall(PetscFree(smap)); PetscCall(PetscFree(lens)); PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); PetscCall(ISRestoreIndices(isrow, &irow)); *B = C; PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B) { IS is1, is2; PetscFunctionBegin; PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1)); if (isrow == iscol) { is2 = is1; PetscCall(PetscObjectReference((PetscObject)is2)); } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2)); PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B)); PetscCall(ISDestroy(&is1)); PetscCall(ISDestroy(&is2)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) { Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data; Mat_SubSppt *submatj = c->submatis1; PetscFunctionBegin; PetscCall((*submatj->destroy)(C)); PetscCall(MatDestroySubMatrix_Private(submatj)); PetscFunctionReturn(PETSC_SUCCESS); } /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */ PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[]) { PetscInt i; Mat C; Mat_SeqBAIJ *c; Mat_SubSppt *submatj; PetscFunctionBegin; for (i = 0; i < n; i++) { C = (*mat)[i]; c = (Mat_SeqBAIJ *)C->data; submatj = c->submatis1; if (submatj) { if (--((PetscObject)C)->refct <= 0) { PetscCall(PetscFree(C->factorprefix)); PetscCall((*submatj->destroy)(C)); PetscCall(MatDestroySubMatrix_Private(submatj)); PetscCall(PetscFree(C->defaultvectype)); PetscCall(PetscFree(C->defaultrandtype)); PetscCall(PetscLayoutDestroy(&C->rmap)); PetscCall(PetscLayoutDestroy(&C->cmap)); PetscCall(PetscHeaderDestroy(&C)); } } else { PetscCall(MatDestroy(&C)); } } /* Destroy Dummy submatrices created for reuse */ PetscCall(MatDestroySubMatrices_Dummy(n, mat)); PetscCall(PetscFree(*mat)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[]) { PetscInt i; PetscFunctionBegin; if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B)); for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i])); PetscFunctionReturn(PETSC_SUCCESS); } /* Should check that shapes of vectors and matrices match */ PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z, sum; const PetscScalar *x; const MatScalar *v; PetscInt mbs, i, n; const PetscInt *idx, *ii, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &z)); if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(z, a->mbs)); } else { mbs = a->mbs; ii = a->i; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; v = a->a + ii[0]; idx = a->j + ii[0]; ii++; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ sum = 0.0; PetscSparseDensePlusDot(sum, x, v, idx, n); if (usecprow) { z[ridx[i]] = sum; } else { z[i] = sum; } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &z)); PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, *zarray; const PetscScalar *x, *xb; PetscScalar x1, x2; const MatScalar *v; PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 2 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; sum1 = 0.0; sum2 = 0.0; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 2 * (*idx++); x1 = xb[0]; x2 = xb[1]; sum1 += v[0] * x1 + v[2] * x2; sum2 += v[1] * x1 + v[3] * x2; v += 4; } if (usecprow) z = zarray + 2 * ridx[i]; z[0] = sum1; z[1] = sum2; if (!usecprow) z += 2; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray; const PetscScalar *x, *xb; const MatScalar *v; PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; #if defined(PETSC_HAVE_PRAGMA_DISJOINT) #pragma disjoint(*v, *z, *xb) #endif PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 3 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 3 * (*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; v += 9; } if (usecprow) z = zarray + 3 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; if (!usecprow) z += 3; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray; const PetscScalar *x, *xb; const MatScalar *v; PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 4 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 4 * (*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; v += 16; } if (usecprow) z = zarray + 4 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; if (!usecprow) z += 4; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray; const PetscScalar *xb, *x; const MatScalar *v; const PetscInt *idx, *ii, *ridx = NULL; PetscInt mbs, i, j, n; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 5 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 5 * (*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; v += 25; } if (usecprow) z = zarray + 5 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; if (!usecprow) z += 5; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; const PetscScalar *x, *xb; PetscScalar x1, x2, x3, x4, x5, x6, *zarray; const MatScalar *v; PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 6 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 6 * (*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; v += 36; } if (usecprow) z = zarray + 6 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; if (!usecprow) z += 6; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; const PetscScalar *x, *xb; PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray; const MatScalar *v; PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 7 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 7 * (*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; v += 49; } if (usecprow) z = zarray + 7 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; if (!usecprow) z += 7; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, *work, *workt, *zarray; const PetscScalar *x, *xb; const MatScalar *v; PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; const PetscInt *idx, *ii, *ridx = NULL; PetscInt k; PetscBool usecprow = a->compressedrow.use; __m256d a0, a1, a2, a3, a4, a5; __m256d w0, w1, w2, w3; __m256d z0, z1, z2; __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, bs * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } if (!a->mult_work) { k = PetscMax(A->rmap->n, A->cmap->n); PetscCall(PetscMalloc1(k + 1, &a->mult_work)); } work = a->mult_work; for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; workt = work; for (j = 0; j < n; j++) { xb = x + bs * (*idx++); for (k = 0; k < bs; k++) workt[k] = xb[k]; workt += bs; } if (usecprow) z = zarray + bs * ridx[i]; z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); for (j = 0; j < n; j++) { /* first column of a */ w0 = _mm256_set1_pd(work[j * 9]); a0 = _mm256_loadu_pd(&v[j * 81]); z0 = _mm256_fmadd_pd(a0, w0, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 4]); z1 = _mm256_fmadd_pd(a1, w0, z1); a2 = _mm256_loadu_pd(&v[j * 81 + 8]); z2 = _mm256_fmadd_pd(a2, w0, z2); /* second column of a */ w1 = _mm256_set1_pd(work[j * 9 + 1]); a0 = _mm256_loadu_pd(&v[j * 81 + 9]); z0 = _mm256_fmadd_pd(a0, w1, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 13]); z1 = _mm256_fmadd_pd(a1, w1, z1); a2 = _mm256_loadu_pd(&v[j * 81 + 17]); z2 = _mm256_fmadd_pd(a2, w1, z2); /* third column of a */ w2 = _mm256_set1_pd(work[j * 9 + 2]); a3 = _mm256_loadu_pd(&v[j * 81 + 18]); z0 = _mm256_fmadd_pd(a3, w2, z0); a4 = _mm256_loadu_pd(&v[j * 81 + 22]); z1 = _mm256_fmadd_pd(a4, w2, z1); a5 = _mm256_loadu_pd(&v[j * 81 + 26]); z2 = _mm256_fmadd_pd(a5, w2, z2); /* fourth column of a */ w3 = _mm256_set1_pd(work[j * 9 + 3]); a0 = _mm256_loadu_pd(&v[j * 81 + 27]); z0 = _mm256_fmadd_pd(a0, w3, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 31]); z1 = _mm256_fmadd_pd(a1, w3, z1); a2 = _mm256_loadu_pd(&v[j * 81 + 35]); z2 = _mm256_fmadd_pd(a2, w3, z2); /* fifth column of a */ w0 = _mm256_set1_pd(work[j * 9 + 4]); a3 = _mm256_loadu_pd(&v[j * 81 + 36]); z0 = _mm256_fmadd_pd(a3, w0, z0); a4 = _mm256_loadu_pd(&v[j * 81 + 40]); z1 = _mm256_fmadd_pd(a4, w0, z1); a5 = _mm256_loadu_pd(&v[j * 81 + 44]); z2 = _mm256_fmadd_pd(a5, w0, z2); /* sixth column of a */ w1 = _mm256_set1_pd(work[j * 9 + 5]); a0 = _mm256_loadu_pd(&v[j * 81 + 45]); z0 = _mm256_fmadd_pd(a0, w1, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 49]); z1 = _mm256_fmadd_pd(a1, w1, z1); a2 = _mm256_loadu_pd(&v[j * 81 + 53]); z2 = _mm256_fmadd_pd(a2, w1, z2); /* seventh column of a */ w2 = _mm256_set1_pd(work[j * 9 + 6]); a0 = _mm256_loadu_pd(&v[j * 81 + 54]); z0 = _mm256_fmadd_pd(a0, w2, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 58]); z1 = _mm256_fmadd_pd(a1, w2, z1); a2 = _mm256_loadu_pd(&v[j * 81 + 62]); z2 = _mm256_fmadd_pd(a2, w2, z2); /* eighth column of a */ w3 = _mm256_set1_pd(work[j * 9 + 7]); a3 = _mm256_loadu_pd(&v[j * 81 + 63]); z0 = _mm256_fmadd_pd(a3, w3, z0); a4 = _mm256_loadu_pd(&v[j * 81 + 67]); z1 = _mm256_fmadd_pd(a4, w3, z1); a5 = _mm256_loadu_pd(&v[j * 81 + 71]); z2 = _mm256_fmadd_pd(a5, w3, z2); /* ninth column of a */ w0 = _mm256_set1_pd(work[j * 9 + 8]); a0 = _mm256_loadu_pd(&v[j * 81 + 72]); z0 = _mm256_fmadd_pd(a0, w0, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 76]); z1 = _mm256_fmadd_pd(a1, w0, z1); a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); z2 = _mm256_fmadd_pd(a2, w0, z2); } _mm256_storeu_pd(&z[0], z0); _mm256_storeu_pd(&z[4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); v += n * bs2; if (!usecprow) z += bs; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } #endif PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; const PetscScalar *x, *xb; PetscScalar *zarray, xv; const MatScalar *v; const PetscInt *ii, *ij = a->j, *idx; PetscInt mbs, i, j, k, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 11 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[i + 1] - ii[i]; idx = ij + ii[i]; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; for (j = 0; j < n; j++) { xb = x + 11 * (idx[j]); for (k = 0; k < 11; k++) { xv = xb[k]; sum1 += v[0] * xv; sum2 += v[1] * xv; sum3 += v[2] * xv; sum4 += v[3] * xv; sum5 += v[4] * xv; sum6 += v[5] * xv; sum7 += v[6] * xv; sum8 += v[7] * xv; sum9 += v[8] * xv; sum10 += v[9] * xv; sum11 += v[10] * xv; v += 11; } } if (usecprow) z = zarray + 11 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; if (!usecprow) z += 11; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */ PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; const PetscScalar *x, *xb; PetscScalar *zarray, xv; const MatScalar *v; const PetscInt *ii, *ij = a->j, *idx; PetscInt mbs, i, j, k, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[i + 1] - ii[i]; idx = ij + ii[i]; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; for (j = 0; j < n; j++) { xb = x + 12 * (idx[j]); for (k = 0; k < 12; k++) { xv = xb[k]; sum1 += v[0] * xv; sum2 += v[1] * xv; sum3 += v[2] * xv; sum4 += v[3] * xv; sum5 += v[4] * xv; sum6 += v[5] * xv; sum7 += v[6] * xv; sum8 += v[7] * xv; sum9 += v[8] * xv; sum10 += v[9] * xv; sum11 += v[10] * xv; sum12 += v[11] * xv; v += 12; } } if (usecprow) z = zarray + 12 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; if (!usecprow) z += 12; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; const PetscScalar *x, *xb; PetscScalar *zarray, *yarray, xv; const MatScalar *v; const PetscInt *ii, *ij = a->j, *idx; PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); v = a->a; if (usecprow) { if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[i + 1] - ii[i]; idx = ij + ii[i]; if (usecprow) { y = yarray + 12 * ridx[i]; z = zarray + 12 * ridx[i]; } sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; for (j = 0; j < n; j++) { xb = x + 12 * (idx[j]); for (k = 0; k < 12; k++) { xv = xb[k]; sum1 += v[0] * xv; sum2 += v[1] * xv; sum3 += v[2] * xv; sum4 += v[3] * xv; sum5 += v[4] * xv; sum6 += v[5] * xv; sum7 += v[6] * xv; sum8 += v[7] * xv; sum9 += v[8] * xv; sum10 += v[9] * xv; sum11 += v[10] * xv; sum12 += v[11] * xv; v += 12; } } z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; if (!usecprow) { y += 12; z += 12; } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; const PetscScalar *x, *xb; PetscScalar x1, x2, x3, x4, *zarray; const MatScalar *v; const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; PetscInt mbs, i, j, n; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 12 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[i + 1] - ii[i]; idx = ij + ii[i]; sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0; for (j = 0; j < n; j++) { xb = x + 12 * (idx[j]); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; v += 48; x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; v += 48; x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; v += 48; } if (usecprow) z = zarray + 12 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; if (!usecprow) z += 12; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */ PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12; const PetscScalar *x, *xb; PetscScalar x1, x2, x3, x4, *zarray, *yarray; const MatScalar *v; const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL; PetscInt mbs = a->mbs, i, j, n; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); v = a->a; if (usecprow) { if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs)); mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[i + 1] - ii[i]; idx = ij + ii[i]; if (usecprow) { y = yarray + 12 * ridx[i]; z = zarray + 12 * ridx[i]; } sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11]; for (j = 0; j < n; j++) { xb = x + 12 * (idx[j]); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; v += 48; x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; v += 48; x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4; sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4; sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4; sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4; sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4; sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4; sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4; sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4; sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4; sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4; sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4; sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4; v += 48; } z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; if (!usecprow) { y += 12; z += 12; } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, *zarray; const PetscScalar *x, *work; const MatScalar *v = a->a; PetscInt mbs, i, j, n; const PetscInt *idx = a->j, *ii, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; const PetscInt bs = 12, bs2 = 144; __m256d a0, a1, a2, a3, a4, a5; __m256d w0, w1, w2, w3; __m256d z0, z1, z2; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, bs * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd(); n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { work = x + bs * (*idx++); /* first column of a */ w0 = _mm256_set1_pd(work[0]); a0 = _mm256_loadu_pd(v + 0); z0 = _mm256_fmadd_pd(a0, w0, z0); a1 = _mm256_loadu_pd(v + 4); z1 = _mm256_fmadd_pd(a1, w0, z1); a2 = _mm256_loadu_pd(v + 8); z2 = _mm256_fmadd_pd(a2, w0, z2); /* second column of a */ w1 = _mm256_set1_pd(work[1]); a3 = _mm256_loadu_pd(v + 12); z0 = _mm256_fmadd_pd(a3, w1, z0); a4 = _mm256_loadu_pd(v + 16); z1 = _mm256_fmadd_pd(a4, w1, z1); a5 = _mm256_loadu_pd(v + 20); z2 = _mm256_fmadd_pd(a5, w1, z2); /* third column of a */ w2 = _mm256_set1_pd(work[2]); a0 = _mm256_loadu_pd(v + 24); z0 = _mm256_fmadd_pd(a0, w2, z0); a1 = _mm256_loadu_pd(v + 28); z1 = _mm256_fmadd_pd(a1, w2, z1); a2 = _mm256_loadu_pd(v + 32); z2 = _mm256_fmadd_pd(a2, w2, z2); /* fourth column of a */ w3 = _mm256_set1_pd(work[3]); a3 = _mm256_loadu_pd(v + 36); z0 = _mm256_fmadd_pd(a3, w3, z0); a4 = _mm256_loadu_pd(v + 40); z1 = _mm256_fmadd_pd(a4, w3, z1); a5 = _mm256_loadu_pd(v + 44); z2 = _mm256_fmadd_pd(a5, w3, z2); /* fifth column of a */ w0 = _mm256_set1_pd(work[4]); a0 = _mm256_loadu_pd(v + 48); z0 = _mm256_fmadd_pd(a0, w0, z0); a1 = _mm256_loadu_pd(v + 52); z1 = _mm256_fmadd_pd(a1, w0, z1); a2 = _mm256_loadu_pd(v + 56); z2 = _mm256_fmadd_pd(a2, w0, z2); /* sixth column of a */ w1 = _mm256_set1_pd(work[5]); a3 = _mm256_loadu_pd(v + 60); z0 = _mm256_fmadd_pd(a3, w1, z0); a4 = _mm256_loadu_pd(v + 64); z1 = _mm256_fmadd_pd(a4, w1, z1); a5 = _mm256_loadu_pd(v + 68); z2 = _mm256_fmadd_pd(a5, w1, z2); /* seventh column of a */ w2 = _mm256_set1_pd(work[6]); a0 = _mm256_loadu_pd(v + 72); z0 = _mm256_fmadd_pd(a0, w2, z0); a1 = _mm256_loadu_pd(v + 76); z1 = _mm256_fmadd_pd(a1, w2, z1); a2 = _mm256_loadu_pd(v + 80); z2 = _mm256_fmadd_pd(a2, w2, z2); /* eighth column of a */ w3 = _mm256_set1_pd(work[7]); a3 = _mm256_loadu_pd(v + 84); z0 = _mm256_fmadd_pd(a3, w3, z0); a4 = _mm256_loadu_pd(v + 88); z1 = _mm256_fmadd_pd(a4, w3, z1); a5 = _mm256_loadu_pd(v + 92); z2 = _mm256_fmadd_pd(a5, w3, z2); /* ninth column of a */ w0 = _mm256_set1_pd(work[8]); a0 = _mm256_loadu_pd(v + 96); z0 = _mm256_fmadd_pd(a0, w0, z0); a1 = _mm256_loadu_pd(v + 100); z1 = _mm256_fmadd_pd(a1, w0, z1); a2 = _mm256_loadu_pd(v + 104); z2 = _mm256_fmadd_pd(a2, w0, z2); /* tenth column of a */ w1 = _mm256_set1_pd(work[9]); a3 = _mm256_loadu_pd(v + 108); z0 = _mm256_fmadd_pd(a3, w1, z0); a4 = _mm256_loadu_pd(v + 112); z1 = _mm256_fmadd_pd(a4, w1, z1); a5 = _mm256_loadu_pd(v + 116); z2 = _mm256_fmadd_pd(a5, w1, z2); /* eleventh column of a */ w2 = _mm256_set1_pd(work[10]); a0 = _mm256_loadu_pd(v + 120); z0 = _mm256_fmadd_pd(a0, w2, z0); a1 = _mm256_loadu_pd(v + 124); z1 = _mm256_fmadd_pd(a1, w2, z1); a2 = _mm256_loadu_pd(v + 128); z2 = _mm256_fmadd_pd(a2, w2, z2); /* twelveth column of a */ w3 = _mm256_set1_pd(work[11]); a3 = _mm256_loadu_pd(v + 132); z0 = _mm256_fmadd_pd(a3, w3, z0); a4 = _mm256_loadu_pd(v + 136); z1 = _mm256_fmadd_pd(a4, w3, z1); a5 = _mm256_loadu_pd(v + 140); z2 = _mm256_fmadd_pd(a5, w3, z2); v += bs2; } if (usecprow) z = zarray + bs * ridx[i]; _mm256_storeu_pd(&z[0], z0); _mm256_storeu_pd(&z[4], z1); _mm256_storeu_pd(&z[8], z2); if (!usecprow) z += bs; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } #endif /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ /* Default MatMult for block size 15 */ PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; const PetscScalar *x, *xb; PetscScalar *zarray, xv; const MatScalar *v; const PetscInt *ii, *ij = a->j, *idx; PetscInt mbs, i, j, k, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[i + 1] - ii[i]; idx = ij + ii[i]; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0; sum15 = 0.0; for (j = 0; j < n; j++) { xb = x + 15 * (idx[j]); for (k = 0; k < 15; k++) { xv = xb[k]; sum1 += v[0] * xv; sum2 += v[1] * xv; sum3 += v[2] * xv; sum4 += v[3] * xv; sum5 += v[4] * xv; sum6 += v[5] * xv; sum7 += v[6] * xv; sum8 += v[7] * xv; sum9 += v[8] * xv; sum10 += v[9] * xv; sum11 += v[10] * xv; sum12 += v[11] * xv; sum13 += v[12] * xv; sum14 += v[13] * xv; sum15 += v[14] * xv; v += 15; } } if (usecprow) z = zarray + 15 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14; z[14] = sum15; if (!usecprow) z += 15; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; const PetscScalar *x, *xb; PetscScalar x1, x2, x3, x4, *zarray; const MatScalar *v; const PetscInt *ii, *ij = a->j, *idx; PetscInt mbs, i, j, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[i + 1] - ii[i]; idx = ij + ii[i]; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0; sum15 = 0.0; for (j = 0; j < n; j++) { xb = x + 15 * (idx[j]); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; v += 60; x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; v += 60; x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4; sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4; sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4; sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4; sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4; sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4; sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4; sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4; sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4; sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4; sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4; sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4; sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4; sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4; sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4; v += 60; x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3; sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3; sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3; sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3; sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3; sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3; sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3; sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3; sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3; sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3; sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3; sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3; sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3; sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3; sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3; v += 45; } if (usecprow) z = zarray + 15 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14; z[14] = sum15; if (!usecprow) z += 15; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; const PetscScalar *x, *xb; PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray; const MatScalar *v; const PetscInt *ii, *ij = a->j, *idx; PetscInt mbs, i, j, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[i + 1] - ii[i]; idx = ij + ii[i]; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0; sum15 = 0.0; for (j = 0; j < n; j++) { xb = x + 15 * (idx[j]); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; x8 = xb[7]; sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8; sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8; sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8; sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8; sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8; sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8; sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8; sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8; sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8; sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8; sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8; sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8; sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8; sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8; sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8; v += 120; x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7; sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7; sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7; sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7; sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7; sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7; sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7; sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7; sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7; sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7; sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7; sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7; sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7; sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7; sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7; v += 105; } if (usecprow) z = zarray + 15 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14; z[14] = sum15; if (!usecprow) z += 15; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15; const PetscScalar *x, *xb; PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray; const MatScalar *v; const PetscInt *ii, *ij = a->j, *idx; PetscInt mbs, i, j, n, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, 15 * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[i + 1] - ii[i]; idx = ij + ii[i]; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0; sum15 = 0.0; for (j = 0; j < n; j++) { xb = x + 15 * (idx[j]); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13]; x15 = xb[14]; sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15; sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15; sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15; sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15; sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15; sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15; sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15; sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15; sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15; sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15; sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15; sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15; sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15; sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15; sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15; v += 225; } if (usecprow) z = zarray + 15 * ridx[i]; z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14; z[14] = sum15; if (!usecprow) z += 15; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } /* This will not work with MatScalar == float because it calls the BLAS */ PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, *work, *workt, *zarray; const PetscScalar *x, *xb; const MatScalar *v; PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; const PetscInt *idx, *ii, *ridx = NULL; PetscInt ncols, k; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayWrite(zz, &zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; PetscCall(PetscArrayzero(zarray, bs * a->mbs)); } else { mbs = a->mbs; ii = a->i; z = zarray; } if (!a->mult_work) { k = PetscMax(A->rmap->n, A->cmap->n); PetscCall(PetscMalloc1(k + 1, &a->mult_work)); } work = a->mult_work; for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; ncols = n * bs; workt = work; for (j = 0; j < n; j++) { xb = x + bs * (*idx++); for (k = 0; k < bs; k++) workt[k] = xb[k]; workt += bs; } if (usecprow) z = zarray + bs * ridx[i]; PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z); v += n * bs2; if (!usecprow) z += bs; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayWrite(zz, &zarray)); PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; const PetscScalar *x; PetscScalar *y, *z, sum; const MatScalar *v; PetscInt mbs = a->mbs, i, n, *ridx = NULL; const PetscInt *idx, *ii; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayPair(yy, zz, &y, &z)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs)); mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; if (!usecprow) { sum = y[i]; } else { sum = y[ridx[i]]; } PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ PetscSparseDensePlusDot(sum, x, v, idx, n); v += n; idx += n; if (usecprow) { z[ridx[i]] = sum; } else { z[i] = sum; } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); PetscCall(PetscLogFlops(2.0 * a->nz)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *y = NULL, *z = NULL, sum1, sum2; const PetscScalar *x, *xb; PetscScalar x1, x2, *yarray, *zarray; const MatScalar *v; PetscInt mbs = a->mbs, i, n, j; const PetscInt *idx, *ii, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs)); mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; if (usecprow) { z = zarray + 2 * ridx[i]; y = yarray + 2 * ridx[i]; } sum1 = y[0]; sum2 = y[1]; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 2 * (*idx++); x1 = xb[0]; x2 = xb[1]; sum1 += v[0] * x1 + v[2] * x2; sum2 += v[1] * x1 + v[3] * x2; v += 4; } z[0] = sum1; z[1] = sum2; if (!usecprow) { z += 2; y += 2; } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); PetscCall(PetscLogFlops(4.0 * a->nz)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray; const PetscScalar *x, *xb; const MatScalar *v; PetscInt mbs = a->mbs, i, j, n; const PetscInt *idx, *ii, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs)); mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; if (usecprow) { z = zarray + 3 * ridx[i]; y = yarray + 3 * ridx[i]; } sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 3 * (*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; v += 9; } z[0] = sum1; z[1] = sum2; z[2] = sum3; if (!usecprow) { z += 3; y += 3; } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); PetscCall(PetscLogFlops(18.0 * a->nz)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray; const PetscScalar *x, *xb; const MatScalar *v; PetscInt mbs = a->mbs, i, j, n; const PetscInt *idx, *ii, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs)); mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; if (usecprow) { z = zarray + 4 * ridx[i]; y = yarray + 4 * ridx[i]; } sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 4 * (*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; v += 16; } z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; if (!usecprow) { z += 4; y += 4; } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); PetscCall(PetscLogFlops(32.0 * a->nz)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5; const PetscScalar *x, *xb; PetscScalar *yarray, *zarray; const MatScalar *v; PetscInt mbs = a->mbs, i, j, n; const PetscInt *idx, *ii, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs)); mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; if (usecprow) { z = zarray + 5 * ridx[i]; y = yarray + 5 * ridx[i]; } sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 5 * (*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; v += 25; } z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; if (!usecprow) { z += 5; y += 5; } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); PetscCall(PetscLogFlops(50.0 * a->nz)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6; const PetscScalar *x, *xb; PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray; const MatScalar *v; PetscInt mbs = a->mbs, i, j, n; const PetscInt *idx, *ii, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs)); mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; if (usecprow) { z = zarray + 6 * ridx[i]; y = yarray + 6 * ridx[i]; } sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 6 * (*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; v += 36; } z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; if (!usecprow) { z += 6; y += 6; } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); PetscCall(PetscLogFlops(72.0 * a->nz)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7; const PetscScalar *x, *xb; PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray; const MatScalar *v; PetscInt mbs = a->mbs, i, j, n; const PetscInt *idx, *ii, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; if (usecprow) { z = zarray + 7 * ridx[i]; y = yarray + 7 * ridx[i]; } sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 7 * (*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; v += 49; } z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; if (!usecprow) { z += 7; y += 7; } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); PetscCall(PetscLogFlops(98.0 * a->nz)); PetscFunctionReturn(PETSC_SUCCESS); } #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, *work, *workt, *zarray; const PetscScalar *x, *xb; const MatScalar *v; PetscInt mbs, i, j, n; PetscInt k; PetscBool usecprow = a->compressedrow.use; const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81; __m256d a0, a1, a2, a3, a4, a5; __m256d w0, w1, w2, w3; __m256d z0, z1, z2; __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63); PetscFunctionBegin; PetscCall(VecCopy(yy, zz)); PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArray(zz, &zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = zarray; } if (!a->mult_work) { k = PetscMax(A->rmap->n, A->cmap->n); PetscCall(PetscMalloc1(k + 1, &a->mult_work)); } work = a->mult_work; for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; workt = work; for (j = 0; j < n; j++) { xb = x + bs * (*idx++); for (k = 0; k < bs; k++) workt[k] = xb[k]; workt += bs; } if (usecprow) z = zarray + bs * ridx[i]; z0 = _mm256_loadu_pd(&z[0]); z1 = _mm256_loadu_pd(&z[4]); z2 = _mm256_set1_pd(z[8]); for (j = 0; j < n; j++) { /* first column of a */ w0 = _mm256_set1_pd(work[j * 9]); a0 = _mm256_loadu_pd(&v[j * 81]); z0 = _mm256_fmadd_pd(a0, w0, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 4]); z1 = _mm256_fmadd_pd(a1, w0, z1); a2 = _mm256_loadu_pd(&v[j * 81 + 8]); z2 = _mm256_fmadd_pd(a2, w0, z2); /* second column of a */ w1 = _mm256_set1_pd(work[j * 9 + 1]); a0 = _mm256_loadu_pd(&v[j * 81 + 9]); z0 = _mm256_fmadd_pd(a0, w1, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 13]); z1 = _mm256_fmadd_pd(a1, w1, z1); a2 = _mm256_loadu_pd(&v[j * 81 + 17]); z2 = _mm256_fmadd_pd(a2, w1, z2); /* third column of a */ w2 = _mm256_set1_pd(work[j * 9 + 2]); a3 = _mm256_loadu_pd(&v[j * 81 + 18]); z0 = _mm256_fmadd_pd(a3, w2, z0); a4 = _mm256_loadu_pd(&v[j * 81 + 22]); z1 = _mm256_fmadd_pd(a4, w2, z1); a5 = _mm256_loadu_pd(&v[j * 81 + 26]); z2 = _mm256_fmadd_pd(a5, w2, z2); /* fourth column of a */ w3 = _mm256_set1_pd(work[j * 9 + 3]); a0 = _mm256_loadu_pd(&v[j * 81 + 27]); z0 = _mm256_fmadd_pd(a0, w3, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 31]); z1 = _mm256_fmadd_pd(a1, w3, z1); a2 = _mm256_loadu_pd(&v[j * 81 + 35]); z2 = _mm256_fmadd_pd(a2, w3, z2); /* fifth column of a */ w0 = _mm256_set1_pd(work[j * 9 + 4]); a3 = _mm256_loadu_pd(&v[j * 81 + 36]); z0 = _mm256_fmadd_pd(a3, w0, z0); a4 = _mm256_loadu_pd(&v[j * 81 + 40]); z1 = _mm256_fmadd_pd(a4, w0, z1); a5 = _mm256_loadu_pd(&v[j * 81 + 44]); z2 = _mm256_fmadd_pd(a5, w0, z2); /* sixth column of a */ w1 = _mm256_set1_pd(work[j * 9 + 5]); a0 = _mm256_loadu_pd(&v[j * 81 + 45]); z0 = _mm256_fmadd_pd(a0, w1, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 49]); z1 = _mm256_fmadd_pd(a1, w1, z1); a2 = _mm256_loadu_pd(&v[j * 81 + 53]); z2 = _mm256_fmadd_pd(a2, w1, z2); /* seventh column of a */ w2 = _mm256_set1_pd(work[j * 9 + 6]); a0 = _mm256_loadu_pd(&v[j * 81 + 54]); z0 = _mm256_fmadd_pd(a0, w2, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 58]); z1 = _mm256_fmadd_pd(a1, w2, z1); a2 = _mm256_loadu_pd(&v[j * 81 + 62]); z2 = _mm256_fmadd_pd(a2, w2, z2); /* eighth column of a */ w3 = _mm256_set1_pd(work[j * 9 + 7]); a3 = _mm256_loadu_pd(&v[j * 81 + 63]); z0 = _mm256_fmadd_pd(a3, w3, z0); a4 = _mm256_loadu_pd(&v[j * 81 + 67]); z1 = _mm256_fmadd_pd(a4, w3, z1); a5 = _mm256_loadu_pd(&v[j * 81 + 71]); z2 = _mm256_fmadd_pd(a5, w3, z2); /* ninth column of a */ w0 = _mm256_set1_pd(work[j * 9 + 8]); a0 = _mm256_loadu_pd(&v[j * 81 + 72]); z0 = _mm256_fmadd_pd(a0, w0, z0); a1 = _mm256_loadu_pd(&v[j * 81 + 76]); z1 = _mm256_fmadd_pd(a1, w0, z1); a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1); z2 = _mm256_fmadd_pd(a2, w0, z2); } _mm256_storeu_pd(&z[0], z0); _mm256_storeu_pd(&z[4], z1); _mm256_maskstore_pd(&z[8], mask1, z2); v += n * bs2; if (!usecprow) z += bs; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArray(zz, &zarray)); PetscCall(PetscLogFlops(162.0 * a->nz)); PetscFunctionReturn(PETSC_SUCCESS); } #endif PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11; const PetscScalar *x, *xb; PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray; const MatScalar *v; PetscInt mbs = a->mbs, i, j, n; const PetscInt *idx, *ii, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray)); idx = a->j; v = a->a; if (usecprow) { if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs)); mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { ii = a->i; y = yarray; z = zarray; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; if (usecprow) { z = zarray + 11 * ridx[i]; y = yarray + 11 * ridx[i]; } sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0; j < n; j++) { xb = x + 11 * (*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11; sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11; sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11; sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11; sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11; sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11; sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11; sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11; sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11; sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11; sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11; v += 121; } z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; if (!usecprow) { z += 11; y += 11; } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray)); PetscCall(PetscLogFlops(242.0 * a->nz)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, *work, *workt, *zarray; const PetscScalar *x, *xb; const MatScalar *v; PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; PetscInt ncols, k; const PetscInt *ridx = NULL, *idx, *ii; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; PetscCall(VecCopy(yy, zz)); PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArray(zz, &zarray)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = zarray; } if (!a->mult_work) { k = PetscMax(A->rmap->n, A->cmap->n); PetscCall(PetscMalloc1(k + 1, &a->mult_work)); } work = a->mult_work; for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; ncols = n * bs; workt = work; for (j = 0; j < n; j++) { xb = x + bs * (*idx++); for (k = 0; k < bs; k++) workt[k] = xb[k]; workt += bs; } if (usecprow) z = zarray + bs * ridx[i]; PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z); v += n * bs2; if (!usecprow) z += bs; } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArray(zz, &zarray)); PetscCall(PetscLogFlops(2.0 * a->nz * bs2)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) { PetscScalar zero = 0.0; PetscFunctionBegin; PetscCall(VecSet(zz, zero)); PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz) { PetscScalar zero = 0.0; PetscFunctionBegin; PetscCall(VecSet(zz, zero)); PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z, x1, x2, x3, x4, x5; const PetscScalar *x, *xb = NULL; const MatScalar *v; PetscInt mbs, i, rval, bs = A->rmap->bs, j, n; const PetscInt *idx, *ii, *ib, *ridx = NULL; Mat_CompressedRow cprow = a->compressedrow; PetscBool usecprow = cprow.use; PetscFunctionBegin; if (yy != zz) PetscCall(VecCopy(yy, zz)); PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArray(zz, &z)); idx = a->j; v = a->a; if (usecprow) { mbs = cprow.nrows; ii = cprow.i; ridx = cprow.rindex; } else { mbs = a->mbs; ii = a->i; xb = x; } switch (bs) { case 1: for (i = 0; i < mbs; i++) { if (usecprow) xb = x + ridx[i]; x1 = xb[0]; ib = idx + ii[0]; n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { rval = ib[j]; z[rval] += PetscConj(*v) * x1; v++; } if (!usecprow) xb++; } break; case 2: for (i = 0; i < mbs; i++) { if (usecprow) xb = x + 2 * ridx[i]; x1 = xb[0]; x2 = xb[1]; ib = idx + ii[0]; n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { rval = ib[j] * 2; z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2; z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2; v += 4; } if (!usecprow) xb += 2; } break; case 3: for (i = 0; i < mbs; i++) { if (usecprow) xb = x + 3 * ridx[i]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; ib = idx + ii[0]; n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { rval = ib[j] * 3; z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3; z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3; z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3; v += 9; } if (!usecprow) xb += 3; } break; case 4: for (i = 0; i < mbs; i++) { if (usecprow) xb = x + 4 * ridx[i]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; ib = idx + ii[0]; n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { rval = ib[j] * 4; z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4; z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4; z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4; z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4; v += 16; } if (!usecprow) xb += 4; } break; case 5: for (i = 0; i < mbs; i++) { if (usecprow) xb = x + 5 * ridx[i]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; ib = idx + ii[0]; n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { rval = ib[j] * 5; z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5; z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5; z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5; z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5; z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5; v += 25; } if (!usecprow) xb += 5; } break; default: /* block sizes larger than 5 by 5 are handled by BLAS */ SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet"); #if 0 { PetscInt ncols,k,bs2=a->bs2; PetscScalar *work,*workt,zb; const PetscScalar *xtmp; if (!a->mult_work) { k = PetscMax(A->rmap->n,A->cmap->n); PetscCall(PetscMalloc1(k+1,&a->mult_work)); } work = a->mult_work; xtmp = x; for (i=0; inz * a->bs2)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *zb, *z, x1, x2, x3, x4, x5; const PetscScalar *x, *xb = NULL; const MatScalar *v; PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2; const PetscInt *idx, *ii, *ib, *ridx = NULL; Mat_CompressedRow cprow = a->compressedrow; PetscBool usecprow = cprow.use; PetscFunctionBegin; if (yy != zz) PetscCall(VecCopy(yy, zz)); PetscCall(VecGetArrayRead(xx, &x)); PetscCall(VecGetArray(zz, &z)); idx = a->j; v = a->a; if (usecprow) { mbs = cprow.nrows; ii = cprow.i; ridx = cprow.rindex; } else { mbs = a->mbs; ii = a->i; xb = x; } switch (bs) { case 1: for (i = 0; i < mbs; i++) { if (usecprow) xb = x + ridx[i]; x1 = xb[0]; ib = idx + ii[0]; n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { rval = ib[j]; z[rval] += *v * x1; v++; } if (!usecprow) xb++; } break; case 2: for (i = 0; i < mbs; i++) { if (usecprow) xb = x + 2 * ridx[i]; x1 = xb[0]; x2 = xb[1]; ib = idx + ii[0]; n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { rval = ib[j] * 2; z[rval++] += v[0] * x1 + v[1] * x2; z[rval++] += v[2] * x1 + v[3] * x2; v += 4; } if (!usecprow) xb += 2; } break; case 3: for (i = 0; i < mbs; i++) { if (usecprow) xb = x + 3 * ridx[i]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; ib = idx + ii[0]; n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { rval = ib[j] * 3; z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3; z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3; z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3; v += 9; } if (!usecprow) xb += 3; } break; case 4: for (i = 0; i < mbs; i++) { if (usecprow) xb = x + 4 * ridx[i]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; ib = idx + ii[0]; n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { rval = ib[j] * 4; z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; v += 16; } if (!usecprow) xb += 4; } break; case 5: for (i = 0; i < mbs; i++) { if (usecprow) xb = x + 5 * ridx[i]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; ib = idx + ii[0]; n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { rval = ib[j] * 5; z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; v += 25; } if (!usecprow) xb += 5; } break; default: { /* block sizes larger than 5 by 5 are handled by BLAS */ PetscInt ncols, k; PetscScalar *work, *workt; const PetscScalar *xtmp; if (!a->mult_work) { k = PetscMax(A->rmap->n, A->cmap->n); PetscCall(PetscMalloc1(k + 1, &a->mult_work)); } work = a->mult_work; xtmp = x; for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; ncols = n * bs; PetscCall(PetscArrayzero(work, ncols)); if (usecprow) xtmp = x + bs * ridx[i]; PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work); v += n * bs2; if (!usecprow) xtmp += bs; workt = work; for (j = 0; j < n; j++) { zb = z + bs * (*idx++); for (k = 0; k < bs; k++) zb[k] += workt[k]; workt += bs; } } } } PetscCall(VecRestoreArrayRead(xx, &x)); PetscCall(VecRestoreArray(zz, &z)); PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; PetscInt totalnz = a->bs2 * a->nz; PetscScalar oalpha = alpha; PetscBLASInt one = 1, tnz; PetscFunctionBegin; PetscCall(PetscBLASIntCast(totalnz, &tnz)); PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one)); PetscCall(PetscLogFlops(totalnz)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; MatScalar *v = a->a; PetscReal sum = 0.0; PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1; PetscFunctionBegin; if (type == NORM_FROBENIUS) { #if defined(PETSC_USE_REAL___FP16) PetscBLASInt one = 1, cnt = bs2 * nz; PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one)); #else for (i = 0; i < bs2 * nz; i++) { sum += PetscRealPart(PetscConj(*v) * (*v)); v++; } #endif *norm = PetscSqrtReal(sum); PetscCall(PetscLogFlops(2.0 * bs2 * nz)); } else if (type == NORM_1) { /* maximum column sum */ PetscReal *tmp; PetscInt *bcol = a->j; PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp)); for (i = 0; i < nz; i++) { for (j = 0; j < bs; j++) { k1 = bs * (*bcol) + j; /* column index */ for (k = 0; k < bs; k++) { tmp[k1] += PetscAbsScalar(*v); v++; } } bcol++; } *norm = 0.0; for (j = 0; j < A->cmap->n; j++) { if (tmp[j] > *norm) *norm = tmp[j]; } PetscCall(PetscFree(tmp)); PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); } else if (type == NORM_INFINITY) { /* maximum row sum */ *norm = 0.0; for (k = 0; k < bs; k++) { for (j = 0; j < a->mbs; j++) { v = a->a + bs2 * a->i[j] + k; sum = 0.0; for (i = 0; i < a->i[j + 1] - a->i[j]; i++) { for (k1 = 0; k1 < bs; k1++) { sum += PetscAbsScalar(*v); v += bs; } } if (sum > *norm) *norm = sum; } } PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0))); } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet"); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscInt n; const PetscInt bs = A->rmap->bs, ambs = a->mbs, bs2 = a->bs2; PetscScalar *x; const MatScalar *aa = a->a, *aa_j; const PetscInt *ai = a->i, *adiag; PetscBool diagDense; PetscFunctionBegin; PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); PetscCall(VecGetLocalSize(v, &n)); PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); PetscCall(VecGetArrayWrite(v, &x)); PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense)); if (diagDense) { for (PetscInt i = 0, row = 0; i < ambs; i++) { aa_j = aa + adiag[i] * bs2; for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k]; } } else { for (PetscInt i = 0, row = 0; i < ambs; i++) { const PetscInt j = adiag[i]; if (j != ai[i + 1]) { aa_j = aa + j * bs2; for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k]; } else { for (PetscInt k = 0; k < bs; k++) x[row++] = 0.0; } } } PetscCall(VecRestoreArrayWrite(v, &x)); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; const PetscScalar *l, *r, *li, *ri; PetscScalar x; MatScalar *aa, *v; PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai; const PetscInt *ai, *aj; PetscFunctionBegin; ai = a->i; aj = a->j; aa = a->a; m = A->rmap->n; n = A->cmap->n; bs = A->rmap->bs; mbs = a->mbs; bs2 = a->bs2; if (ll) { PetscCall(VecGetArrayRead(ll, &l)); PetscCall(VecGetLocalSize(ll, &lm)); PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); for (i = 0; i < mbs; i++) { /* for each block row */ M = ai[i + 1] - ai[i]; li = l + i * bs; v = PetscSafePointerPlusOffset(aa, bs2 * ai[i]); for (j = 0; j < M; j++) { /* for each block */ for (k = 0; k < bs2; k++) (*v++) *= li[k % bs]; } } PetscCall(VecRestoreArrayRead(ll, &l)); PetscCall(PetscLogFlops(a->nz)); } if (rr) { PetscCall(VecGetArrayRead(rr, &r)); PetscCall(VecGetLocalSize(rr, &rn)); PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); for (i = 0; i < mbs; i++) { /* for each block row */ iai = ai[i]; M = ai[i + 1] - iai; v = PetscSafePointerPlusOffset(aa, bs2 * iai); for (j = 0; j < M; j++) { /* for each block */ ri = r + bs * aj[iai + j]; for (k = 0; k < bs; k++) { x = ri[k]; for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x; v += bs; } } } PetscCall(VecRestoreArrayRead(rr, &r)); PetscCall(PetscLogFlops(a->nz)); } PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscFunctionBegin; info->block_size = a->bs2; info->nz_allocated = a->bs2 * a->maxnz; info->nz_used = a->bs2 * a->nz; info->nz_unneeded = info->nz_allocated - info->nz_used; info->assemblies = A->num_ass; info->mallocs = A->info.mallocs; info->memory = 0; /* REVIEW ME */ if (A->factortype) { info->fill_ratio_given = A->info.fill_ratio_given; info->fill_ratio_needed = A->info.fill_ratio_needed; info->factor_mallocs = A->info.factor_mallocs; } else { info->fill_ratio_given = 0; info->fill_ratio_needed = 0; info->factor_mallocs = 0; } PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscFunctionBegin; PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs])); PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) { PetscFunctionBegin; PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense; PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatTransposeMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) { PetscFunctionBegin; PetscCall(MatTransposeMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); C->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqBAIJ_SeqDense; PetscFunctionReturn(PETSC_SUCCESS); } static PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1; const PetscScalar *xb; PetscScalar x1; const MatScalar *v, *vv; PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ if (usecprow) z = c + ridx[i]; jj = idx; vv = v; for (k = 0; k < cn; k++) { idx = jj; v = vv; sum1 = 0.0; for (j = 0; j < n; j++) { xb = b + (*idx++); x1 = xb[0 + k * bm]; sum1 += v[0] * x1; v += 1; } z[0 + k * cm] = sum1; } if (!usecprow) z += 1; } PetscFunctionReturn(PETSC_SUCCESS); } static PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2; const PetscScalar *xb; PetscScalar x1, x2; const MatScalar *v, *vv; PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ if (usecprow) z = c + 2 * ridx[i]; jj = idx; vv = v; for (k = 0; k < cn; k++) { idx = jj; v = vv; sum1 = 0.0; sum2 = 0.0; for (j = 0; j < n; j++) { xb = b + 2 * (*idx++); x1 = xb[0 + k * bm]; x2 = xb[1 + k * bm]; sum1 += v[0] * x1 + v[2] * x2; sum2 += v[1] * x1 + v[3] * x2; v += 4; } z[0 + k * cm] = sum1; z[1 + k * cm] = sum2; } if (!usecprow) z += 2; } PetscFunctionReturn(PETSC_SUCCESS); } static PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3; const PetscScalar *xb; PetscScalar x1, x2, x3; const MatScalar *v, *vv; PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ if (usecprow) z = c + 3 * ridx[i]; jj = idx; vv = v; for (k = 0; k < cn; k++) { idx = jj; v = vv; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; for (j = 0; j < n; j++) { xb = b + 3 * (*idx++); x1 = xb[0 + k * bm]; x2 = xb[1 + k * bm]; x3 = xb[2 + k * bm]; sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3; sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3; sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3; v += 9; } z[0 + k * cm] = sum1; z[1 + k * cm] = sum2; z[2 + k * cm] = sum3; } if (!usecprow) z += 3; } PetscFunctionReturn(PETSC_SUCCESS); } static PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4; const PetscScalar *xb; PetscScalar x1, x2, x3, x4; const MatScalar *v, *vv; PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ if (usecprow) z = c + 4 * ridx[i]; jj = idx; vv = v; for (k = 0; k < cn; k++) { idx = jj; v = vv; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; for (j = 0; j < n; j++) { xb = b + 4 * (*idx++); x1 = xb[0 + k * bm]; x2 = xb[1 + k * bm]; x3 = xb[2 + k * bm]; x4 = xb[3 + k * bm]; sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; v += 16; } z[0 + k * cm] = sum1; z[1 + k * cm] = sum2; z[2 + k * cm] = sum3; z[3 + k * cm] = sum4; } if (!usecprow) z += 4; } PetscFunctionReturn(PETSC_SUCCESS); } static PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5; const PetscScalar *xb; PetscScalar x1, x2, x3, x4, x5; const MatScalar *v, *vv; PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ if (usecprow) z = c + 5 * ridx[i]; jj = idx; vv = v; for (k = 0; k < cn; k++) { idx = jj; v = vv; sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; for (j = 0; j < n; j++) { xb = b + 5 * (*idx++); x1 = xb[0 + k * bm]; x2 = xb[1 + k * bm]; x3 = xb[2 + k * bm]; x4 = xb[3 + k * bm]; x5 = xb[4 + k * bm]; sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; v += 25; } z[0 + k * cm] = sum1; z[1 + k * cm] = sum2; z[2 + k * cm] = sum3; z[3 + k * cm] = sum4; z[4 + k * cm] = sum5; } if (!usecprow) z += 5; } PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; Mat_SeqDense *bd = (Mat_SeqDense *)B->data; Mat_SeqDense *cd = (Mat_SeqDense *)C->data; PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; PetscBLASInt bbs, bcn, bbm, bcm; PetscScalar *z = NULL; PetscScalar *c, *b; const MatScalar *v; const PetscInt *idx, *ii, *ridx = NULL; PetscScalar _DZero = 0.0, _DOne = 1.0; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); b = bd->v; if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C)); PetscCall(MatDenseGetArrayWrite(C, &c)); switch (bs) { case 1: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); break; case 2: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); break; case 3: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); break; case 4: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); break; case 5: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); break; default: /* block sizes larger than 5 by 5 are handled by BLAS */ PetscCall(PetscBLASIntCast(bs, &bbs)); PetscCall(PetscBLASIntCast(cn, &bcn)); PetscCall(PetscBLASIntCast(bm, &bbm)); PetscCall(PetscBLASIntCast(cm, &bcm)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; z = c; } for (i = 0; i < mbs; i++) { n = ii[1] - ii[0]; ii++; if (usecprow) z = c + bs * ridx[i]; if (n) { PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm)); v += bs2; } for (j = 1; j < n; j++) { PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm)); v += bs2; } if (!usecprow) z += bs; } } PetscCall(MatDenseRestoreArrayWrite(C, &c)); PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn)); PetscFunctionReturn(PETSC_SUCCESS); } static PetscErrorCode MatTransposeMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; const MatScalar *v; PetscInt mbs, i, *idx, *ii, j, n, k, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; const PetscScalar *bi; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; } for (i = 0; i < mbs; i++) { PetscInt brow = usecprow ? ridx[i] : i; n = ii[1] - ii[0]; ii++; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0, bi = b + 1 * brow; j < n; j++) { PetscScalar *zcol = c + 1 * (*idx++); for (k = 0; k < cn; k++) zcol[0 + k * cm] += v[0] * bi[k * bm]; ++v; } } PetscFunctionReturn(PETSC_SUCCESS); } static PetscErrorCode MatTransposeMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; const MatScalar *v; PetscInt mbs, i, *idx, *ii, j, n, k, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; const PetscScalar *bi; PetscScalar x1, x2; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; } for (i = 0; i < mbs; i++) { PetscInt brow = usecprow ? ridx[i] : i; n = ii[1] - ii[0]; ii++; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0, bi = b + 2 * brow; j < n; j++) { PetscScalar *zcol = c + 2 * (*idx++); for (k = 0; k < cn; k++) { x1 = bi[0 + k * bm]; x2 = bi[1 + k * bm]; zcol[0 + k * cm] += v[0] * x1 + v[1] * x2; zcol[1 + k * cm] += v[2] * x1 + v[3] * x2; } v += 4; } } PetscFunctionReturn(PETSC_SUCCESS); } static PetscErrorCode MatTransposeMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; const MatScalar *v; PetscInt mbs, i, *idx, *ii, j, n, k, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; const PetscScalar *bi; PetscScalar x1, x2, x3; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; } for (i = 0; i < mbs; i++) { PetscInt brow = usecprow ? ridx[i] : i; n = ii[1] - ii[0]; ii++; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0, bi = b + 3 * brow; j < n; j++) { PetscScalar *zcol = c + 3 * (*idx++); for (k = 0; k < cn; k++) { x1 = bi[0 + k * bm]; x2 = bi[1 + k * bm]; x3 = bi[2 + k * bm]; zcol[0 + k * cm] += v[0] * x1 + v[1] * x2 + v[2] * x3; zcol[1 + k * cm] += v[3] * x1 + v[4] * x2 + v[5] * x3; zcol[2 + k * cm] += v[6] * x1 + v[7] * x2 + v[8] * x3; } v += 9; } } PetscFunctionReturn(PETSC_SUCCESS); } static PetscErrorCode MatTransposeMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; const MatScalar *v; PetscInt mbs, i, *idx, *ii, j, n, k, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; const PetscScalar *bi; PetscScalar x1, x2, x3, x4; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; } for (i = 0; i < mbs; i++) { PetscInt brow = usecprow ? ridx[i] : i; n = ii[1] - ii[0]; ii++; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0, bi = b + 4 * brow; j < n; j++) { PetscScalar *zcol = c + 4 * (*idx++); for (k = 0; k < cn; k++) { x1 = bi[0 + k * bm]; x2 = bi[1 + k * bm]; x3 = bi[2 + k * bm]; x4 = bi[3 + k * bm]; zcol[0 + k * cm] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; zcol[1 + k * cm] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; zcol[2 + k * cm] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; zcol[3 + k * cm] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; } v += 16; } } PetscFunctionReturn(PETSC_SUCCESS); } static PetscErrorCode MatTransposeMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; const MatScalar *v; PetscInt mbs, i, *idx, *ii, j, n, k, *ridx = NULL; PetscBool usecprow = a->compressedrow.use; const PetscScalar *bi; PetscScalar x1, x2, x3, x4, x5; PetscFunctionBegin; idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; } for (i = 0; i < mbs; i++) { PetscInt brow = usecprow ? ridx[i] : i; n = ii[1] - ii[0]; ii++; PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ for (j = 0, bi = b + 5 * brow; j < n; j++) { PetscScalar *zcol = c + 5 * (*idx++); for (k = 0; k < cn; k++) { x1 = bi[0 + k * bm]; x2 = bi[1 + k * bm]; x3 = bi[2 + k * bm]; x4 = bi[3 + k * bm]; x5 = bi[4 + k * bm]; zcol[0 + k * cm] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5; zcol[1 + k * cm] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5; zcol[2 + k * cm] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5; zcol[3 + k * cm] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5; zcol[4 + k * cm] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5; } v += 25; } } PetscFunctionReturn(PETSC_SUCCESS); } PetscErrorCode MatTransposeMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C) { Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; Mat_SeqDense *bd = (Mat_SeqDense *)B->data; Mat_SeqDense *cd = (Mat_SeqDense *)C->data; PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda; PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2; PetscBLASInt bbs, bcn, bbm, bcm; PetscScalar *c, *b; const MatScalar *v; const PetscInt *idx, *ii, *ridx = NULL; PetscScalar _DOne = 1.0; PetscBool usecprow = a->compressedrow.use; PetscFunctionBegin; if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); PetscCheck(B->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->rmap->n, B->rmap->n); PetscCheck(A->cmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal columns in A %" PetscInt_FMT, C->rmap->n, A->cmap->n); PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); b = bd->v; PetscCall(MatZeroEntries(C)); PetscCall(MatDenseGetArrayWrite(C, &c)); switch (bs) { case 1: PetscCall(MatTransposeMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn)); break; case 2: PetscCall(MatTransposeMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn)); break; case 3: PetscCall(MatTransposeMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn)); break; case 4: PetscCall(MatTransposeMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn)); break; case 5: PetscCall(MatTransposeMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn)); break; default: /* block sizes larger than 5 by 5 are handled by BLAS */ PetscCall(PetscBLASIntCast(bs, &bbs)); PetscCall(PetscBLASIntCast(cn, &bcn)); PetscCall(PetscBLASIntCast(bm, &bbm)); PetscCall(PetscBLASIntCast(cm, &bcm)); idx = a->j; v = a->a; if (usecprow) { mbs = a->compressedrow.nrows; ii = a->compressedrow.i; ridx = a->compressedrow.rindex; } else { mbs = a->mbs; ii = a->i; } for (i = 0; i < mbs; i++) { const PetscScalar *bi = b + bs * (usecprow ? ridx[i] : i); n = ii[1] - ii[0]; ii++; for (j = 0; j < n; j++) { PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, bi, &bbm, &_DOne, c + bs * (*idx++), &bcm)); v += bs2; } } } PetscCall(MatDenseRestoreArrayWrite(C, &c)); PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn)); PetscFunctionReturn(PETSC_SUCCESS); }