1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 14 #include "../cuda/ceed-cuda-common.h" 15 #include "../cuda/ceed-cuda-compile.h" 16 #include "ceed-cuda-ref.h" 17 18 //------------------------------------------------------------------------------ 19 // Basis apply - tensor 20 //------------------------------------------------------------------------------ 21 int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 22 Ceed ceed; 23 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 24 Ceed_Cuda *ceed_Cuda; 25 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 26 CeedBasis_Cuda *data; 27 CeedCallBackend(CeedBasisGetData(basis, &data)); 28 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 29 const int max_block_size = 32; 30 31 // Read vectors 32 const CeedScalar *d_u; 33 CeedScalar *d_v; 34 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 35 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 36 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 37 38 // Clear v for transpose operation 39 if (t_mode == CEED_TRANSPOSE) { 40 CeedSize length; 41 CeedCallBackend(CeedVectorGetLength(v, &length)); 42 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 43 } 44 CeedInt Q_1d, dim; 45 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 46 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 47 48 // Basis action 49 switch (eval_mode) { 50 case CEED_EVAL_INTERP: { 51 void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &d_u, &d_v}; 52 CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 53 54 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args)); 55 } break; 56 case CEED_EVAL_GRAD: { 57 void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 58 CeedInt block_size = max_block_size; 59 60 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); 61 } break; 62 case CEED_EVAL_WEIGHT: { 63 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 64 const int grid_size = num_elem; 65 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, dim >= 2 ? Q_1d : 1, 1, weight_args)); 66 } break; 67 // LCOV_EXCL_START 68 // Evaluate the divergence to/from the quadrature points 69 case CEED_EVAL_DIV: 70 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 71 // Evaluate the curl to/from the quadrature points 72 case CEED_EVAL_CURL: 73 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 74 // Take no action, BasisApply should not have been called 75 case CEED_EVAL_NONE: 76 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 77 // LCOV_EXCL_STOP 78 } 79 80 // Restore vectors 81 if (eval_mode != CEED_EVAL_WEIGHT) { 82 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 83 } 84 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 85 return CEED_ERROR_SUCCESS; 86 } 87 88 //------------------------------------------------------------------------------ 89 // Basis apply - non-tensor 90 //------------------------------------------------------------------------------ 91 int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 92 CeedVector v) { 93 Ceed ceed; 94 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 95 Ceed_Cuda *ceed_Cuda; 96 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 97 CeedBasisNonTensor_Cuda *data; 98 CeedCallBackend(CeedBasisGetData(basis, &data)); 99 CeedInt num_nodes, num_qpts; 100 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 101 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 102 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 103 int elems_per_block = 1; 104 int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 105 106 // Read vectors 107 const CeedScalar *d_u; 108 CeedScalar *d_v; 109 if (eval_mode != CEED_EVAL_WEIGHT) { 110 CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 111 } 112 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 113 114 // Clear v for transpose operation 115 if (t_mode == CEED_TRANSPOSE) { 116 CeedSize length; 117 CeedCallBackend(CeedVectorGetLength(v, &length)); 118 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 119 } 120 121 // Apply basis operation 122 switch (eval_mode) { 123 case CEED_EVAL_INTERP: { 124 void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp, &d_u, &d_v}; 125 if (transpose) { 126 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, num_nodes, 1, elems_per_block, interp_args)); 127 } else { 128 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, num_qpts, 1, elems_per_block, interp_args)); 129 } 130 } break; 131 case CEED_EVAL_GRAD: { 132 void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_grad, &d_u, &d_v}; 133 if (transpose) { 134 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Grad, grid, num_nodes, 1, elems_per_block, grad_args)); 135 } else { 136 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Grad, grid, num_qpts, 1, elems_per_block, grad_args)); 137 } 138 } break; 139 case CEED_EVAL_WEIGHT: { 140 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 141 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 142 } break; 143 // LCOV_EXCL_START 144 // Evaluate the divergence to/from the quadrature points 145 case CEED_EVAL_DIV: 146 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 147 // Evaluate the curl to/from the quadrature points 148 case CEED_EVAL_CURL: 149 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 150 // Take no action, BasisApply should not have been called 151 case CEED_EVAL_NONE: 152 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 153 // LCOV_EXCL_STOP 154 } 155 156 // Restore vectors 157 if (eval_mode != CEED_EVAL_WEIGHT) { 158 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 159 } 160 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 161 return CEED_ERROR_SUCCESS; 162 } 163 164 //------------------------------------------------------------------------------ 165 // Destroy tensor basis 166 //------------------------------------------------------------------------------ 167 static int CeedBasisDestroy_Cuda(CeedBasis basis) { 168 Ceed ceed; 169 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 170 171 CeedBasis_Cuda *data; 172 CeedCallBackend(CeedBasisGetData(basis, &data)); 173 174 CeedCallCuda(ceed, cuModuleUnload(data->module)); 175 176 CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 177 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 178 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 179 CeedCallBackend(CeedFree(&data)); 180 181 return CEED_ERROR_SUCCESS; 182 } 183 184 //------------------------------------------------------------------------------ 185 // Destroy non-tensor basis 186 //------------------------------------------------------------------------------ 187 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 188 Ceed ceed; 189 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 190 191 CeedBasisNonTensor_Cuda *data; 192 CeedCallBackend(CeedBasisGetData(basis, &data)); 193 194 CeedCallCuda(ceed, cuModuleUnload(data->module)); 195 196 CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 197 CeedCallCuda(ceed, cudaFree(data->d_interp)); 198 CeedCallCuda(ceed, cudaFree(data->d_grad)); 199 CeedCallBackend(CeedFree(&data)); 200 201 return CEED_ERROR_SUCCESS; 202 } 203 204 //------------------------------------------------------------------------------ 205 // Create tensor 206 //------------------------------------------------------------------------------ 207 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 208 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 209 Ceed ceed; 210 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 211 CeedBasis_Cuda *data; 212 CeedCallBackend(CeedCalloc(1, &data)); 213 214 // Copy data to GPU 215 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 216 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 217 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 218 219 const CeedInt interp_bytes = q_bytes * P_1d; 220 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 221 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 222 223 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 224 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 225 226 // Compile basis kernels 227 CeedInt num_comp; 228 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 229 char *basis_kernel_path, *basis_kernel_source; 230 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path)); 231 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 232 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 233 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 234 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 235 num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 236 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 237 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 238 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 239 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 240 CeedCallBackend(CeedFree(&basis_kernel_path)); 241 CeedCallBackend(CeedFree(&basis_kernel_source)); 242 243 CeedCallBackend(CeedBasisSetData(basis, data)); 244 245 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 246 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 247 return CEED_ERROR_SUCCESS; 248 } 249 250 //------------------------------------------------------------------------------ 251 // Create non-tensor 252 //------------------------------------------------------------------------------ 253 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 254 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 255 Ceed ceed; 256 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 257 CeedBasisNonTensor_Cuda *data; 258 CeedCallBackend(CeedCalloc(1, &data)); 259 260 // Copy basis data to GPU 261 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 262 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 263 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 264 265 const CeedInt interp_bytes = q_bytes * num_nodes; 266 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 267 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 268 269 const CeedInt grad_bytes = q_bytes * num_nodes * dim; 270 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 271 CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 272 273 // Compile basis kernels 274 CeedInt num_comp; 275 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 276 char *basis_kernel_path, *basis_kernel_source; 277 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 278 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 279 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 280 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 281 CeedCallCuda(ceed, CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim, 282 "BASIS_NUM_COMP", num_comp)); 283 CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 284 CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 285 CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 286 CeedCallBackend(CeedFree(&basis_kernel_path)); 287 CeedCallBackend(CeedFree(&basis_kernel_source)); 288 289 CeedCallBackend(CeedBasisSetData(basis, data)); 290 291 // Register backend functions 292 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 293 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 294 return CEED_ERROR_SUCCESS; 295 } 296 297 //------------------------------------------------------------------------------ 298