xref: /petsc/src/ksp/pc/impls/pbjacobi/cuda/pbjacobi_cuda.cu (revision d0e6bf2ad94dcc89b258ce16c7987200a4714786)
112facf1bSJunchao Zhang #include <petscdevice_cuda.h>
212facf1bSJunchao Zhang #include <petsc/private/petsclegacycupmblas.h>
312facf1bSJunchao Zhang #include <../src/ksp/pc/impls/pbjacobi/pbjacobi.h>
412facf1bSJunchao Zhang 
512facf1bSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 7, 0)
MatMultBatched(PetscInt bs,PetscInt mbs,const PetscScalar * A,const PetscScalar * x,PetscScalar * y,PetscBool transpose)612facf1bSJunchao Zhang __global__ static void MatMultBatched(PetscInt bs, PetscInt mbs, const PetscScalar *A, const PetscScalar *x, PetscScalar *y, PetscBool transpose)
712facf1bSJunchao Zhang {
812facf1bSJunchao Zhang   const PetscInt gridSize = gridDim.x * blockDim.x;
912facf1bSJunchao Zhang   PetscInt       row      = blockIdx.x * blockDim.x + threadIdx.x;
1012facf1bSJunchao Zhang   const PetscInt bs2      = bs * bs;
1112facf1bSJunchao Zhang 
1212facf1bSJunchao Zhang   /* One row per thread. The blocks are stored in column-major order */
1312facf1bSJunchao Zhang   for (; row < bs * mbs; row += gridSize) {
1412facf1bSJunchao Zhang     const PetscScalar *Ap, *xp;
1512facf1bSJunchao Zhang     PetscScalar       *yp;
1612facf1bSJunchao Zhang     PetscInt           i, j, k;
1712facf1bSJunchao Zhang 
1812facf1bSJunchao Zhang     k  = row / bs;                               /* k-th block */
1912facf1bSJunchao Zhang     i  = row % bs;                               /* this thread deals with i-th row of the block */
2012facf1bSJunchao Zhang     Ap = &A[bs2 * k + i * (transpose ? bs : 1)]; /* Ap points to the first entry of i-th row */
2112facf1bSJunchao Zhang     xp = &x[bs * k];
2212facf1bSJunchao Zhang     yp = &y[bs * k];
2312facf1bSJunchao Zhang     /* multiply i-th row (column) with x */
2412facf1bSJunchao Zhang     yp[i] = 0.0;
2512facf1bSJunchao Zhang     for (j = 0; j < bs; j++) {
2612facf1bSJunchao Zhang       yp[i] += Ap[0] * xp[j];
2712facf1bSJunchao Zhang       Ap += (transpose ? 1 : bs); /* block is in column major order */
2812facf1bSJunchao Zhang     }
2912facf1bSJunchao Zhang   }
3012facf1bSJunchao Zhang }
3112facf1bSJunchao Zhang #endif
3212facf1bSJunchao Zhang 
PCApplyOrTranspose_PBJacobi_CUDA(PC pc,Vec x,Vec y,cublasOperation_t op)3312facf1bSJunchao Zhang static PetscErrorCode PCApplyOrTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y, cublasOperation_t op)
3412facf1bSJunchao Zhang {
3512facf1bSJunchao Zhang   const PetscScalar *xx;
3612facf1bSJunchao Zhang   PetscScalar       *yy;
3712facf1bSJunchao Zhang   cublasHandle_t     handle;
3812facf1bSJunchao Zhang   PC_PBJacobi       *jac = (PC_PBJacobi *)pc->data;
3912facf1bSJunchao Zhang   const PetscScalar *A   = (const PetscScalar *)jac->spptr;
4012facf1bSJunchao Zhang   const PetscInt     bs = jac->bs, mbs = jac->mbs;
4112facf1bSJunchao Zhang 
4212facf1bSJunchao Zhang   PetscFunctionBegin;
4312facf1bSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(x, &xx));
4412facf1bSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(y, &yy));
4512facf1bSJunchao Zhang   PetscCall(PetscCUBLASGetHandle(&handle));
4612facf1bSJunchao Zhang   PetscCallCUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); /* alpha, beta are on host */
4712facf1bSJunchao Zhang 
4812facf1bSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 7, 0)
4912facf1bSJunchao Zhang   /* y = alpha op(A) x + beta y */
5012facf1bSJunchao Zhang   const PetscScalar alpha = 1.0, beta = 0.0;
5112facf1bSJunchao Zhang   PetscCallCUBLAS(cublasXgemvStridedBatched(handle, op, bs, bs, &alpha, A, bs, bs * bs, xx, 1, bs, &beta, yy, 1, bs, mbs));
5212facf1bSJunchao Zhang #else
5312facf1bSJunchao Zhang   PetscInt gridSize = PetscMin((bs * mbs + 255) / 256, 2147483647); /* <= 2^31-1 */
54*57508eceSPierre Jolivet   MatMultBatched<<<gridSize, 256>>>(bs, mbs, A, xx, yy, op == CUBLAS_OP_T ? PETSC_TRUE : PETSC_FALSE);
5512facf1bSJunchao Zhang   PetscCallCUDA(cudaGetLastError());
5612facf1bSJunchao Zhang #endif
5712facf1bSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(x, &xx));
5812facf1bSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(y, &yy));
5912facf1bSJunchao Zhang   PetscCall(PetscLogGpuFlops(bs * bs * mbs * 2));
603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6112facf1bSJunchao Zhang }
6212facf1bSJunchao Zhang 
PCApply_PBJacobi_CUDA(PC pc,Vec x,Vec y)6312facf1bSJunchao Zhang static PetscErrorCode PCApply_PBJacobi_CUDA(PC pc, Vec x, Vec y)
6412facf1bSJunchao Zhang {
6512facf1bSJunchao Zhang   PetscFunctionBegin;
6612facf1bSJunchao Zhang   PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_N)); // No transpose
673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6812facf1bSJunchao Zhang }
6912facf1bSJunchao Zhang 
PCApplyTranspose_PBJacobi_CUDA(PC pc,Vec x,Vec y)7012facf1bSJunchao Zhang static PetscErrorCode PCApplyTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y)
7112facf1bSJunchao Zhang {
7212facf1bSJunchao Zhang   PetscFunctionBegin;
7312facf1bSJunchao Zhang   PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_T)); // Transpose
743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
7512facf1bSJunchao Zhang }
7612facf1bSJunchao Zhang 
PCDestroy_PBJacobi_CUDA(PC pc)7712facf1bSJunchao Zhang static PetscErrorCode PCDestroy_PBJacobi_CUDA(PC pc)
7812facf1bSJunchao Zhang {
7912facf1bSJunchao Zhang   PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
8012facf1bSJunchao Zhang 
8112facf1bSJunchao Zhang   PetscFunctionBegin;
8212facf1bSJunchao Zhang   PetscCallCUDA(cudaFree(jac->spptr));
8312facf1bSJunchao Zhang   PetscCall(PCDestroy_PBJacobi(pc));
843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
8512facf1bSJunchao Zhang }
8612facf1bSJunchao Zhang 
PCSetUp_PBJacobi_CUDA(PC pc,Mat diagVB)8742ce410bSJunchao Zhang PETSC_INTERN PetscErrorCode PCSetUp_PBJacobi_CUDA(PC pc, Mat diagVB)
8812facf1bSJunchao Zhang {
8912facf1bSJunchao Zhang   PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
9012facf1bSJunchao Zhang   size_t       size;
9112facf1bSJunchao Zhang 
9212facf1bSJunchao Zhang   PetscFunctionBegin;
9342ce410bSJunchao Zhang   PetscCall(PCSetUp_PBJacobi_Host(pc, diagVB)); /* Compute the inverse on host now. Might worth doing it on device directly */
9412facf1bSJunchao Zhang   size = sizeof(PetscScalar) * jac->bs * jac->bs * jac->mbs;
9512facf1bSJunchao Zhang 
9612facf1bSJunchao Zhang   /* PBJacobi_CUDA is simple so that we use jac->spptr as if it is diag_d */
9712facf1bSJunchao Zhang   if (!jac->spptr) PetscCallCUDAVoid(cudaMalloc(&jac->spptr, size));
9812facf1bSJunchao Zhang   PetscCallCUDAVoid(cudaMemcpy(jac->spptr, jac->diag, size, cudaMemcpyHostToDevice));
9912facf1bSJunchao Zhang   PetscCall(PetscLogCpuToGpu(size));
10012facf1bSJunchao Zhang 
10112facf1bSJunchao Zhang   pc->ops->apply          = PCApply_PBJacobi_CUDA;
10212facf1bSJunchao Zhang   pc->ops->applytranspose = PCApplyTranspose_PBJacobi_CUDA;
10312facf1bSJunchao Zhang   pc->ops->destroy        = PCDestroy_PBJacobi_CUDA;
1043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
10512facf1bSJunchao Zhang }
106