1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 14 #include "../cuda/ceed-cuda-common.h" 15 #include "../cuda/ceed-cuda-compile.h" 16 #include "ceed-cuda-ref.h" 17 18 //------------------------------------------------------------------------------ 19 // Basis apply - tensor 20 //------------------------------------------------------------------------------ 21 int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 22 Ceed ceed; 23 Ceed_Cuda *ceed_Cuda; 24 CeedInt Q_1d, dim; 25 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 26 const int max_block_size = 32; 27 const CeedScalar *d_u; 28 CeedScalar *d_v; 29 CeedBasis_Cuda *data; 30 31 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 32 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 33 CeedCallBackend(CeedBasisGetData(basis, &data)); 34 35 // Read vectors 36 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 37 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 38 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 39 40 // Clear v for transpose operation 41 if (t_mode == CEED_TRANSPOSE) { 42 CeedSize length; 43 44 CeedCallBackend(CeedVectorGetLength(v, &length)); 45 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 46 } 47 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 48 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 49 50 // Basis action 51 switch (eval_mode) { 52 case CEED_EVAL_INTERP: { 53 void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &d_u, &d_v}; 54 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 55 56 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args)); 57 } break; 58 case CEED_EVAL_GRAD: { 59 void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 60 const CeedInt block_size = max_block_size; 61 62 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); 63 } break; 64 case CEED_EVAL_WEIGHT: { 65 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 66 const int block_size_x = Q_1d; 67 const int block_size_y = dim >= 2 ? Q_1d : 1; 68 69 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 70 } break; 71 // LCOV_EXCL_START 72 // Evaluate the divergence to/from the quadrature points 73 case CEED_EVAL_DIV: 74 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 75 // Evaluate the curl to/from the quadrature points 76 case CEED_EVAL_CURL: 77 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 78 // Take no action, BasisApply should not have been called 79 case CEED_EVAL_NONE: 80 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 81 // LCOV_EXCL_STOP 82 } 83 84 // Restore vectors 85 if (eval_mode != CEED_EVAL_WEIGHT) { 86 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 87 } 88 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 89 return CEED_ERROR_SUCCESS; 90 } 91 92 //------------------------------------------------------------------------------ 93 // Basis apply - non-tensor 94 //------------------------------------------------------------------------------ 95 int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 96 CeedVector v) { 97 Ceed ceed; 98 Ceed_Cuda *ceed_Cuda; 99 CeedInt num_nodes, num_qpts; 100 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 101 int elems_per_block = 1; 102 int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 103 const CeedScalar *d_u; 104 CeedScalar *d_v; 105 CeedBasisNonTensor_Cuda *data; 106 107 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 108 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 109 CeedCallBackend(CeedBasisGetData(basis, &data)); 110 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 111 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 112 113 // Read vectors 114 if (eval_mode != CEED_EVAL_WEIGHT) { 115 CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 116 } 117 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 118 119 // Clear v for transpose operation 120 if (t_mode == CEED_TRANSPOSE) { 121 CeedSize length; 122 123 CeedCallBackend(CeedVectorGetLength(v, &length)); 124 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 125 } 126 127 // Apply basis operation 128 switch (eval_mode) { 129 case CEED_EVAL_INTERP: { 130 void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp, &d_u, &d_v}; 131 const int block_size_x = transpose ? num_nodes : num_qpts; 132 133 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 134 } break; 135 case CEED_EVAL_GRAD: { 136 void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_grad, &d_u, &d_v}; 137 const int block_size_x = transpose ? num_nodes : num_qpts; 138 139 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Grad, grid, block_size_x, 1, elems_per_block, grad_args)); 140 } break; 141 case CEED_EVAL_WEIGHT: { 142 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 143 144 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 145 } break; 146 // LCOV_EXCL_START 147 // Evaluate the divergence to/from the quadrature points 148 case CEED_EVAL_DIV: 149 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 150 // Evaluate the curl to/from the quadrature points 151 case CEED_EVAL_CURL: 152 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 153 // Take no action, BasisApply should not have been called 154 case CEED_EVAL_NONE: 155 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 156 // LCOV_EXCL_STOP 157 } 158 159 // Restore vectors 160 if (eval_mode != CEED_EVAL_WEIGHT) { 161 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 162 } 163 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 164 return CEED_ERROR_SUCCESS; 165 } 166 167 //------------------------------------------------------------------------------ 168 // Destroy tensor basis 169 //------------------------------------------------------------------------------ 170 static int CeedBasisDestroy_Cuda(CeedBasis basis) { 171 Ceed ceed; 172 CeedBasis_Cuda *data; 173 174 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 175 CeedCallBackend(CeedBasisGetData(basis, &data)); 176 CeedCallCuda(ceed, cuModuleUnload(data->module)); 177 CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 178 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 179 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 180 CeedCallBackend(CeedFree(&data)); 181 return CEED_ERROR_SUCCESS; 182 } 183 184 //------------------------------------------------------------------------------ 185 // Destroy non-tensor basis 186 //------------------------------------------------------------------------------ 187 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 188 Ceed ceed; 189 CeedBasisNonTensor_Cuda *data; 190 191 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 192 CeedCallBackend(CeedBasisGetData(basis, &data)); 193 CeedCallCuda(ceed, cuModuleUnload(data->module)); 194 CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 195 CeedCallCuda(ceed, cudaFree(data->d_interp)); 196 CeedCallCuda(ceed, cudaFree(data->d_grad)); 197 CeedCallBackend(CeedFree(&data)); 198 return CEED_ERROR_SUCCESS; 199 } 200 201 //------------------------------------------------------------------------------ 202 // Create tensor 203 //------------------------------------------------------------------------------ 204 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 205 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 206 Ceed ceed; 207 char *basis_kernel_path, *basis_kernel_source; 208 CeedInt num_comp; 209 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 210 const CeedInt interp_bytes = q_bytes * P_1d; 211 CeedBasis_Cuda *data; 212 213 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 214 CeedCallBackend(CeedCalloc(1, &data)); 215 216 // Copy data to GPU 217 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 218 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 219 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 220 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 221 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 222 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 223 224 // Compile basis kernels 225 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 226 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path)); 227 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 228 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 229 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 230 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 231 num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 232 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 233 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 234 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 235 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 236 CeedCallBackend(CeedFree(&basis_kernel_path)); 237 CeedCallBackend(CeedFree(&basis_kernel_source)); 238 239 CeedCallBackend(CeedBasisSetData(basis, data)); 240 241 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 242 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 243 return CEED_ERROR_SUCCESS; 244 } 245 246 //------------------------------------------------------------------------------ 247 // Create non-tensor 248 //------------------------------------------------------------------------------ 249 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 250 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 251 Ceed ceed; 252 char *basis_kernel_path, *basis_kernel_source; 253 CeedInt num_comp; 254 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 255 const CeedInt interp_bytes = q_bytes * num_nodes; 256 const CeedInt grad_bytes = q_bytes * num_nodes * dim; 257 CeedBasisNonTensor_Cuda *data; 258 259 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 260 CeedCallBackend(CeedCalloc(1, &data)); 261 262 // Copy basis data to GPU 263 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 264 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 265 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 266 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 267 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 268 CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 269 270 // Compile basis kernels 271 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 272 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 273 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 274 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 275 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 276 CeedCallCuda(ceed, CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim, 277 "BASIS_NUM_COMP", num_comp)); 278 CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 279 CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 280 CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 281 CeedCallBackend(CeedFree(&basis_kernel_path)); 282 CeedCallBackend(CeedFree(&basis_kernel_source)); 283 284 CeedCallBackend(CeedBasisSetData(basis, data)); 285 286 // Register backend functions 287 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 288 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 289 return CEED_ERROR_SUCCESS; 290 } 291 292 //------------------------------------------------------------------------------ 293