xref: /petsc/src/ksp/pc/impls/pbjacobi/cuda/pbjacobi_cuda.cu (revision d0e6bf2ad94dcc89b258ce16c7987200a4714786)
1 #include <petscdevice_cuda.h>
2 #include <petsc/private/petsclegacycupmblas.h>
3 #include <../src/ksp/pc/impls/pbjacobi/pbjacobi.h>
4 
5 #if PETSC_PKG_CUDA_VERSION_LT(11, 7, 0)
MatMultBatched(PetscInt bs,PetscInt mbs,const PetscScalar * A,const PetscScalar * x,PetscScalar * y,PetscBool transpose)6 __global__ static void MatMultBatched(PetscInt bs, PetscInt mbs, const PetscScalar *A, const PetscScalar *x, PetscScalar *y, PetscBool transpose)
7 {
8   const PetscInt gridSize = gridDim.x * blockDim.x;
9   PetscInt       row      = blockIdx.x * blockDim.x + threadIdx.x;
10   const PetscInt bs2      = bs * bs;
11 
12   /* One row per thread. The blocks are stored in column-major order */
13   for (; row < bs * mbs; row += gridSize) {
14     const PetscScalar *Ap, *xp;
15     PetscScalar       *yp;
16     PetscInt           i, j, k;
17 
18     k  = row / bs;                               /* k-th block */
19     i  = row % bs;                               /* this thread deals with i-th row of the block */
20     Ap = &A[bs2 * k + i * (transpose ? bs : 1)]; /* Ap points to the first entry of i-th row */
21     xp = &x[bs * k];
22     yp = &y[bs * k];
23     /* multiply i-th row (column) with x */
24     yp[i] = 0.0;
25     for (j = 0; j < bs; j++) {
26       yp[i] += Ap[0] * xp[j];
27       Ap += (transpose ? 1 : bs); /* block is in column major order */
28     }
29   }
30 }
31 #endif
32 
PCApplyOrTranspose_PBJacobi_CUDA(PC pc,Vec x,Vec y,cublasOperation_t op)33 static PetscErrorCode PCApplyOrTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y, cublasOperation_t op)
34 {
35   const PetscScalar *xx;
36   PetscScalar       *yy;
37   cublasHandle_t     handle;
38   PC_PBJacobi       *jac = (PC_PBJacobi *)pc->data;
39   const PetscScalar *A   = (const PetscScalar *)jac->spptr;
40   const PetscInt     bs = jac->bs, mbs = jac->mbs;
41 
42   PetscFunctionBegin;
43   PetscCall(VecCUDAGetArrayRead(x, &xx));
44   PetscCall(VecCUDAGetArrayWrite(y, &yy));
45   PetscCall(PetscCUBLASGetHandle(&handle));
46   PetscCallCUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); /* alpha, beta are on host */
47 
48 #if PETSC_PKG_CUDA_VERSION_GE(11, 7, 0)
49   /* y = alpha op(A) x + beta y */
50   const PetscScalar alpha = 1.0, beta = 0.0;
51   PetscCallCUBLAS(cublasXgemvStridedBatched(handle, op, bs, bs, &alpha, A, bs, bs * bs, xx, 1, bs, &beta, yy, 1, bs, mbs));
52 #else
53   PetscInt gridSize = PetscMin((bs * mbs + 255) / 256, 2147483647); /* <= 2^31-1 */
54   MatMultBatched<<<gridSize, 256>>>(bs, mbs, A, xx, yy, op == CUBLAS_OP_T ? PETSC_TRUE : PETSC_FALSE);
55   PetscCallCUDA(cudaGetLastError());
56 #endif
57   PetscCall(VecCUDARestoreArrayRead(x, &xx));
58   PetscCall(VecCUDARestoreArrayWrite(y, &yy));
59   PetscCall(PetscLogGpuFlops(bs * bs * mbs * 2));
60   PetscFunctionReturn(PETSC_SUCCESS);
61 }
62 
PCApply_PBJacobi_CUDA(PC pc,Vec x,Vec y)63 static PetscErrorCode PCApply_PBJacobi_CUDA(PC pc, Vec x, Vec y)
64 {
65   PetscFunctionBegin;
66   PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_N)); // No transpose
67   PetscFunctionReturn(PETSC_SUCCESS);
68 }
69 
PCApplyTranspose_PBJacobi_CUDA(PC pc,Vec x,Vec y)70 static PetscErrorCode PCApplyTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y)
71 {
72   PetscFunctionBegin;
73   PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_T)); // Transpose
74   PetscFunctionReturn(PETSC_SUCCESS);
75 }
76 
PCDestroy_PBJacobi_CUDA(PC pc)77 static PetscErrorCode PCDestroy_PBJacobi_CUDA(PC pc)
78 {
79   PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
80 
81   PetscFunctionBegin;
82   PetscCallCUDA(cudaFree(jac->spptr));
83   PetscCall(PCDestroy_PBJacobi(pc));
84   PetscFunctionReturn(PETSC_SUCCESS);
85 }
86 
PCSetUp_PBJacobi_CUDA(PC pc,Mat diagVB)87 PETSC_INTERN PetscErrorCode PCSetUp_PBJacobi_CUDA(PC pc, Mat diagVB)
88 {
89   PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
90   size_t       size;
91 
92   PetscFunctionBegin;
93   PetscCall(PCSetUp_PBJacobi_Host(pc, diagVB)); /* Compute the inverse on host now. Might worth doing it on device directly */
94   size = sizeof(PetscScalar) * jac->bs * jac->bs * jac->mbs;
95 
96   /* PBJacobi_CUDA is simple so that we use jac->spptr as if it is diag_d */
97   if (!jac->spptr) PetscCallCUDAVoid(cudaMalloc(&jac->spptr, size));
98   PetscCallCUDAVoid(cudaMemcpy(jac->spptr, jac->diag, size, cudaMemcpyHostToDevice));
99   PetscCall(PetscLogCpuToGpu(size));
100 
101   pc->ops->apply          = PCApply_PBJacobi_CUDA;
102   pc->ops->applytranspose = PCApplyTranspose_PBJacobi_CUDA;
103   pc->ops->destroy        = PCDestroy_PBJacobi_CUDA;
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106