112facf1bSJunchao Zhang #include <petscdevice_cuda.h>
212facf1bSJunchao Zhang #include <petsc/private/petsclegacycupmblas.h>
312facf1bSJunchao Zhang #include <../src/ksp/pc/impls/pbjacobi/pbjacobi.h>
412facf1bSJunchao Zhang
512facf1bSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 7, 0)
MatMultBatched(PetscInt bs,PetscInt mbs,const PetscScalar * A,const PetscScalar * x,PetscScalar * y,PetscBool transpose)612facf1bSJunchao Zhang __global__ static void MatMultBatched(PetscInt bs, PetscInt mbs, const PetscScalar *A, const PetscScalar *x, PetscScalar *y, PetscBool transpose)
712facf1bSJunchao Zhang {
812facf1bSJunchao Zhang const PetscInt gridSize = gridDim.x * blockDim.x;
912facf1bSJunchao Zhang PetscInt row = blockIdx.x * blockDim.x + threadIdx.x;
1012facf1bSJunchao Zhang const PetscInt bs2 = bs * bs;
1112facf1bSJunchao Zhang
1212facf1bSJunchao Zhang /* One row per thread. The blocks are stored in column-major order */
1312facf1bSJunchao Zhang for (; row < bs * mbs; row += gridSize) {
1412facf1bSJunchao Zhang const PetscScalar *Ap, *xp;
1512facf1bSJunchao Zhang PetscScalar *yp;
1612facf1bSJunchao Zhang PetscInt i, j, k;
1712facf1bSJunchao Zhang
1812facf1bSJunchao Zhang k = row / bs; /* k-th block */
1912facf1bSJunchao Zhang i = row % bs; /* this thread deals with i-th row of the block */
2012facf1bSJunchao Zhang Ap = &A[bs2 * k + i * (transpose ? bs : 1)]; /* Ap points to the first entry of i-th row */
2112facf1bSJunchao Zhang xp = &x[bs * k];
2212facf1bSJunchao Zhang yp = &y[bs * k];
2312facf1bSJunchao Zhang /* multiply i-th row (column) with x */
2412facf1bSJunchao Zhang yp[i] = 0.0;
2512facf1bSJunchao Zhang for (j = 0; j < bs; j++) {
2612facf1bSJunchao Zhang yp[i] += Ap[0] * xp[j];
2712facf1bSJunchao Zhang Ap += (transpose ? 1 : bs); /* block is in column major order */
2812facf1bSJunchao Zhang }
2912facf1bSJunchao Zhang }
3012facf1bSJunchao Zhang }
3112facf1bSJunchao Zhang #endif
3212facf1bSJunchao Zhang
PCApplyOrTranspose_PBJacobi_CUDA(PC pc,Vec x,Vec y,cublasOperation_t op)3312facf1bSJunchao Zhang static PetscErrorCode PCApplyOrTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y, cublasOperation_t op)
3412facf1bSJunchao Zhang {
3512facf1bSJunchao Zhang const PetscScalar *xx;
3612facf1bSJunchao Zhang PetscScalar *yy;
3712facf1bSJunchao Zhang cublasHandle_t handle;
3812facf1bSJunchao Zhang PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
3912facf1bSJunchao Zhang const PetscScalar *A = (const PetscScalar *)jac->spptr;
4012facf1bSJunchao Zhang const PetscInt bs = jac->bs, mbs = jac->mbs;
4112facf1bSJunchao Zhang
4212facf1bSJunchao Zhang PetscFunctionBegin;
4312facf1bSJunchao Zhang PetscCall(VecCUDAGetArrayRead(x, &xx));
4412facf1bSJunchao Zhang PetscCall(VecCUDAGetArrayWrite(y, &yy));
4512facf1bSJunchao Zhang PetscCall(PetscCUBLASGetHandle(&handle));
4612facf1bSJunchao Zhang PetscCallCUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); /* alpha, beta are on host */
4712facf1bSJunchao Zhang
4812facf1bSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 7, 0)
4912facf1bSJunchao Zhang /* y = alpha op(A) x + beta y */
5012facf1bSJunchao Zhang const PetscScalar alpha = 1.0, beta = 0.0;
5112facf1bSJunchao Zhang PetscCallCUBLAS(cublasXgemvStridedBatched(handle, op, bs, bs, &alpha, A, bs, bs * bs, xx, 1, bs, &beta, yy, 1, bs, mbs));
5212facf1bSJunchao Zhang #else
5312facf1bSJunchao Zhang PetscInt gridSize = PetscMin((bs * mbs + 255) / 256, 2147483647); /* <= 2^31-1 */
54*57508eceSPierre Jolivet MatMultBatched<<<gridSize, 256>>>(bs, mbs, A, xx, yy, op == CUBLAS_OP_T ? PETSC_TRUE : PETSC_FALSE);
5512facf1bSJunchao Zhang PetscCallCUDA(cudaGetLastError());
5612facf1bSJunchao Zhang #endif
5712facf1bSJunchao Zhang PetscCall(VecCUDARestoreArrayRead(x, &xx));
5812facf1bSJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(y, &yy));
5912facf1bSJunchao Zhang PetscCall(PetscLogGpuFlops(bs * bs * mbs * 2));
603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
6112facf1bSJunchao Zhang }
6212facf1bSJunchao Zhang
PCApply_PBJacobi_CUDA(PC pc,Vec x,Vec y)6312facf1bSJunchao Zhang static PetscErrorCode PCApply_PBJacobi_CUDA(PC pc, Vec x, Vec y)
6412facf1bSJunchao Zhang {
6512facf1bSJunchao Zhang PetscFunctionBegin;
6612facf1bSJunchao Zhang PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_N)); // No transpose
673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
6812facf1bSJunchao Zhang }
6912facf1bSJunchao Zhang
PCApplyTranspose_PBJacobi_CUDA(PC pc,Vec x,Vec y)7012facf1bSJunchao Zhang static PetscErrorCode PCApplyTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y)
7112facf1bSJunchao Zhang {
7212facf1bSJunchao Zhang PetscFunctionBegin;
7312facf1bSJunchao Zhang PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_T)); // Transpose
743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
7512facf1bSJunchao Zhang }
7612facf1bSJunchao Zhang
PCDestroy_PBJacobi_CUDA(PC pc)7712facf1bSJunchao Zhang static PetscErrorCode PCDestroy_PBJacobi_CUDA(PC pc)
7812facf1bSJunchao Zhang {
7912facf1bSJunchao Zhang PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
8012facf1bSJunchao Zhang
8112facf1bSJunchao Zhang PetscFunctionBegin;
8212facf1bSJunchao Zhang PetscCallCUDA(cudaFree(jac->spptr));
8312facf1bSJunchao Zhang PetscCall(PCDestroy_PBJacobi(pc));
843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
8512facf1bSJunchao Zhang }
8612facf1bSJunchao Zhang
PCSetUp_PBJacobi_CUDA(PC pc,Mat diagVB)8742ce410bSJunchao Zhang PETSC_INTERN PetscErrorCode PCSetUp_PBJacobi_CUDA(PC pc, Mat diagVB)
8812facf1bSJunchao Zhang {
8912facf1bSJunchao Zhang PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
9012facf1bSJunchao Zhang size_t size;
9112facf1bSJunchao Zhang
9212facf1bSJunchao Zhang PetscFunctionBegin;
9342ce410bSJunchao Zhang PetscCall(PCSetUp_PBJacobi_Host(pc, diagVB)); /* Compute the inverse on host now. Might worth doing it on device directly */
9412facf1bSJunchao Zhang size = sizeof(PetscScalar) * jac->bs * jac->bs * jac->mbs;
9512facf1bSJunchao Zhang
9612facf1bSJunchao Zhang /* PBJacobi_CUDA is simple so that we use jac->spptr as if it is diag_d */
9712facf1bSJunchao Zhang if (!jac->spptr) PetscCallCUDAVoid(cudaMalloc(&jac->spptr, size));
9812facf1bSJunchao Zhang PetscCallCUDAVoid(cudaMemcpy(jac->spptr, jac->diag, size, cudaMemcpyHostToDevice));
9912facf1bSJunchao Zhang PetscCall(PetscLogCpuToGpu(size));
10012facf1bSJunchao Zhang
10112facf1bSJunchao Zhang pc->ops->apply = PCApply_PBJacobi_CUDA;
10212facf1bSJunchao Zhang pc->ops->applytranspose = PCApplyTranspose_PBJacobi_CUDA;
10312facf1bSJunchao Zhang pc->ops->destroy = PCDestroy_PBJacobi_CUDA;
1043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
10512facf1bSJunchao Zhang }
106