#include <petscdevice_cuda.h>
#include <petsc/private/petsclegacycupmblas.h>
#include <../src/ksp/pc/impls/pbjacobi/pbjacobi.h>

#if PETSC_PKG_CUDA_VERSION_LT(11, 7, 0)
__global__ static void MatMultBatched(PetscInt bs, PetscInt mbs, const PetscScalar *A, const PetscScalar *x, PetscScalar *y, PetscBool transpose)
{
  const PetscInt gridSize = gridDim.x * blockDim.x;
  PetscInt       row      = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscInt bs2      = bs * bs;

  /* One row per thread. The blocks are stored in column-major order */
  for (; row < bs * mbs; row += gridSize) {
    const PetscScalar *Ap, *xp;
    PetscScalar       *yp;
    PetscInt           i, j, k;

    k  = row / bs;                               /* k-th block */
    i  = row % bs;                               /* this thread deals with i-th row of the block */
    Ap = &A[bs2 * k + i * (transpose ? bs : 1)]; /* Ap points to the first entry of the i-th row (or column) */
    xp = &x[bs * k];
    yp = &y[bs * k];
    /* multiply the i-th row (column) with x */
    yp[i] = 0.0;
    for (j = 0; j < bs; j++) {
      yp[i] += Ap[0] * xp[j];
      Ap += (transpose ? 1 : bs); /* block is in column-major order */
    }
  }
}
#endif

static PetscErrorCode PCApplyOrTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y, cublasOperation_t op)
{
  const PetscScalar *xx;
  PetscScalar       *yy;
  cublasHandle_t     handle;
  PC_PBJacobi       *jac = (PC_PBJacobi *)pc->data;
  const PetscScalar *A   = (const PetscScalar *)jac->spptr;
  const PetscInt     bs = jac->bs, mbs = jac->mbs;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayRead(x, &xx));
  PetscCall(VecCUDAGetArrayWrite(y, &yy));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCallCUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); /* alpha, beta are on host */

#if PETSC_PKG_CUDA_VERSION_GE(11, 7, 0)
  /* y = alpha op(A) x + beta y */
  const PetscScalar alpha = 1.0, beta = 0.0;
  PetscCallCUBLAS(cublasXgemvStridedBatched(handle, op, bs, bs, &alpha, A, bs, bs * bs, xx, 1, bs, &beta, yy, 1, bs, mbs));
#else
  PetscInt gridSize = PetscMin((bs * mbs + 255) / 256, 2147483647); /* 256 threads per block; grid size <= 2^31-1 */
  MatMultBatched<<<gridSize, 256>>>(bs, mbs, A, xx, yy, op == CUBLAS_OP_T ? PETSC_TRUE : PETSC_FALSE);
  PetscCallCUDA(cudaGetLastError());
#endif
  PetscCall(VecCUDARestoreArrayRead(x, &xx));
  PetscCall(VecCUDARestoreArrayWrite(y, &yy));
  PetscCall(PetscLogGpuFlops(bs * bs * mbs * 2)); /* one multiply and one add per entry of each bs x bs block */
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCApply_PBJacobi_CUDA(PC pc, Vec x, Vec y)
{
  PetscFunctionBegin;
  PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_N)); // No transpose
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCApplyTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y)
{
  PetscFunctionBegin;
  PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_T)); // Transpose
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCDestroy_PBJacobi_CUDA(PC pc)
{
  PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(jac->spptr));
  PetscCall(PCDestroy_PBJacobi(pc));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PCSetUp_PBJacobi_CUDA(PC pc, Mat diagVB)
{
  PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
  size_t       size;

  PetscFunctionBegin;
  PetscCall(PCSetUp_PBJacobi_Host(pc, diagVB)); /* Compute the inverse on host for now. It might be worth doing it on the device directly */
  size = sizeof(PetscScalar) * jac->bs * jac->bs * jac->mbs;

  /* PBJacobi_CUDA is simple, so we use jac->spptr as if it were diag_d */
  if (!jac->spptr) PetscCallCUDA(cudaMalloc(&jac->spptr, size));
  PetscCallCUDA(cudaMemcpy(jac->spptr, jac->diag, size, cudaMemcpyHostToDevice));
  PetscCall(PetscLogCpuToGpu(size));

  pc->ops->apply          = PCApply_PBJacobi_CUDA;
  pc->ops->applytranspose = PCApplyTranspose_PBJacobi_CUDA;
  pc->ops->destroy        = PCDestroy_PBJacobi_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
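
/*
  Usage sketch (informational comment, not part of the build): assuming the dispatch in
  pbjacobi.c selects PCSetUp_PBJacobi_CUDA() when the matrix uses a CUDA vector type,
  a user would reach these kernels purely through runtime options, e.g.

    ./my_app -mat_type aijcusparse -pc_type pbjacobi

  Here "my_app" is a hypothetical PETSc program; -mat_type aijcusparse and
  -pc_type pbjacobi are standard PETSc options.
*/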