1 #include <petscdevice_cuda.h>
2 #include <petsc/private/petsclegacycupmblas.h>
3 #include <../src/ksp/pc/impls/pbjacobi/pbjacobi.h>
4
5 #if PETSC_PKG_CUDA_VERSION_LT(11, 7, 0)
MatMultBatched(PetscInt bs,PetscInt mbs,const PetscScalar * A,const PetscScalar * x,PetscScalar * y,PetscBool transpose)6 __global__ static void MatMultBatched(PetscInt bs, PetscInt mbs, const PetscScalar *A, const PetscScalar *x, PetscScalar *y, PetscBool transpose)
7 {
8 const PetscInt gridSize = gridDim.x * blockDim.x;
9 PetscInt row = blockIdx.x * blockDim.x + threadIdx.x;
10 const PetscInt bs2 = bs * bs;
11
12 /* One row per thread. The blocks are stored in column-major order */
13 for (; row < bs * mbs; row += gridSize) {
14 const PetscScalar *Ap, *xp;
15 PetscScalar *yp;
16 PetscInt i, j, k;
17
18 k = row / bs; /* k-th block */
19 i = row % bs; /* this thread deals with i-th row of the block */
20 Ap = &A[bs2 * k + i * (transpose ? bs : 1)]; /* Ap points to the first entry of i-th row */
21 xp = &x[bs * k];
22 yp = &y[bs * k];
23 /* multiply i-th row (column) with x */
24 yp[i] = 0.0;
25 for (j = 0; j < bs; j++) {
26 yp[i] += Ap[0] * xp[j];
27 Ap += (transpose ? 1 : bs); /* block is in column major order */
28 }
29 }
30 }
31 #endif
32
PCApplyOrTranspose_PBJacobi_CUDA(PC pc,Vec x,Vec y,cublasOperation_t op)33 static PetscErrorCode PCApplyOrTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y, cublasOperation_t op)
34 {
35 const PetscScalar *xx;
36 PetscScalar *yy;
37 cublasHandle_t handle;
38 PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
39 const PetscScalar *A = (const PetscScalar *)jac->spptr;
40 const PetscInt bs = jac->bs, mbs = jac->mbs;
41
42 PetscFunctionBegin;
43 PetscCall(VecCUDAGetArrayRead(x, &xx));
44 PetscCall(VecCUDAGetArrayWrite(y, &yy));
45 PetscCall(PetscCUBLASGetHandle(&handle));
46 PetscCallCUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); /* alpha, beta are on host */
47
48 #if PETSC_PKG_CUDA_VERSION_GE(11, 7, 0)
49 /* y = alpha op(A) x + beta y */
50 const PetscScalar alpha = 1.0, beta = 0.0;
51 PetscCallCUBLAS(cublasXgemvStridedBatched(handle, op, bs, bs, &alpha, A, bs, bs * bs, xx, 1, bs, &beta, yy, 1, bs, mbs));
52 #else
53 PetscInt gridSize = PetscMin((bs * mbs + 255) / 256, 2147483647); /* <= 2^31-1 */
54 MatMultBatched<<<gridSize, 256>>>(bs, mbs, A, xx, yy, op == CUBLAS_OP_T ? PETSC_TRUE : PETSC_FALSE);
55 PetscCallCUDA(cudaGetLastError());
56 #endif
57 PetscCall(VecCUDARestoreArrayRead(x, &xx));
58 PetscCall(VecCUDARestoreArrayWrite(y, &yy));
59 PetscCall(PetscLogGpuFlops(bs * bs * mbs * 2));
60 PetscFunctionReturn(PETSC_SUCCESS);
61 }
62
PCApply_PBJacobi_CUDA(PC pc,Vec x,Vec y)63 static PetscErrorCode PCApply_PBJacobi_CUDA(PC pc, Vec x, Vec y)
64 {
65 PetscFunctionBegin;
66 PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_N)); // No transpose
67 PetscFunctionReturn(PETSC_SUCCESS);
68 }
69
PCApplyTranspose_PBJacobi_CUDA(PC pc,Vec x,Vec y)70 static PetscErrorCode PCApplyTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y)
71 {
72 PetscFunctionBegin;
73 PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_T)); // Transpose
74 PetscFunctionReturn(PETSC_SUCCESS);
75 }
76
PCDestroy_PBJacobi_CUDA(PC pc)77 static PetscErrorCode PCDestroy_PBJacobi_CUDA(PC pc)
78 {
79 PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
80
81 PetscFunctionBegin;
82 PetscCallCUDA(cudaFree(jac->spptr));
83 PetscCall(PCDestroy_PBJacobi(pc));
84 PetscFunctionReturn(PETSC_SUCCESS);
85 }
86
PCSetUp_PBJacobi_CUDA(PC pc,Mat diagVB)87 PETSC_INTERN PetscErrorCode PCSetUp_PBJacobi_CUDA(PC pc, Mat diagVB)
88 {
89 PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
90 size_t size;
91
92 PetscFunctionBegin;
93 PetscCall(PCSetUp_PBJacobi_Host(pc, diagVB)); /* Compute the inverse on host now. Might worth doing it on device directly */
94 size = sizeof(PetscScalar) * jac->bs * jac->bs * jac->mbs;
95
96 /* PBJacobi_CUDA is simple so that we use jac->spptr as if it is diag_d */
97 if (!jac->spptr) PetscCallCUDAVoid(cudaMalloc(&jac->spptr, size));
98 PetscCallCUDAVoid(cudaMemcpy(jac->spptr, jac->diag, size, cudaMemcpyHostToDevice));
99 PetscCall(PetscLogCpuToGpu(size));
100
101 pc->ops->apply = PCApply_PBJacobi_CUDA;
102 pc->ops->applytranspose = PCApplyTranspose_PBJacobi_CUDA;
103 pc->ops->destroy = PCDestroy_PBJacobi_CUDA;
104 PetscFunctionReturn(PETSC_SUCCESS);
105 }
106