1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 14 #include "../cuda/ceed-cuda-common.h" 15 #include "../cuda/ceed-cuda-compile.h" 16 #include "ceed-cuda-ref.h" 17 18 //------------------------------------------------------------------------------ 19 // Basis apply - tensor 20 //------------------------------------------------------------------------------ 21 int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 22 Ceed ceed; 23 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 24 Ceed_Cuda *ceed_Cuda; 25 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 26 CeedBasis_Cuda *data; 27 CeedCallBackend(CeedBasisGetData(basis, &data)); 28 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 29 const int max_block_size = 32; 30 31 // Read vectors 32 const CeedScalar *d_u; 33 CeedScalar *d_v; 34 if (eval_mode != CEED_EVAL_WEIGHT) { 35 CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 36 } 37 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 38 39 // Clear v for transpose operation 40 if (t_mode == CEED_TRANSPOSE) { 41 CeedSize length; 42 CeedCallBackend(CeedVectorGetLength(v, &length)); 43 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 44 } 45 CeedInt Q_1d, dim; 46 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 47 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 48 49 // Basis action 50 switch (eval_mode) { 51 case CEED_EVAL_INTERP: { 52 void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &d_u, &d_v}; 53 CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 54 55 CeedCallBackend(CeedRunKernelCuda(ceed, data->Interp, num_elem, block_size, interp_args)); 56 } break; 57 case CEED_EVAL_GRAD: { 58 void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 59 CeedInt block_size = max_block_size; 60 61 CeedCallBackend(CeedRunKernelCuda(ceed, data->Grad, num_elem, block_size, grad_args)); 62 } break; 63 case CEED_EVAL_WEIGHT: { 64 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 65 const int grid_size = num_elem; 66 CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Weight, grid_size, Q_1d, dim >= 2 ? Q_1d : 1, 1, weight_args)); 67 } break; 68 // LCOV_EXCL_START 69 // Evaluate the divergence to/from the quadrature points 70 case CEED_EVAL_DIV: 71 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 72 // Evaluate the curl to/from the quadrature points 73 case CEED_EVAL_CURL: 74 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 75 // Take no action, BasisApply should not have been called 76 case CEED_EVAL_NONE: 77 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 78 // LCOV_EXCL_STOP 79 } 80 81 // Restore vectors 82 if (eval_mode != CEED_EVAL_WEIGHT) { 83 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 84 } 85 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 86 return CEED_ERROR_SUCCESS; 87 } 88 89 //------------------------------------------------------------------------------ 90 // Basis apply - non-tensor 91 //------------------------------------------------------------------------------ 92 int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 93 CeedVector v) { 94 Ceed ceed; 95 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 96 Ceed_Cuda *ceed_Cuda; 97 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 98 CeedBasisNonTensor_Cuda *data; 99 CeedCallBackend(CeedBasisGetData(basis, &data)); 100 CeedInt num_nodes, num_qpts; 101 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 102 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 103 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 104 int elems_per_block = 1; 105 int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 106 107 // Read vectors 108 const CeedScalar *d_u; 109 CeedScalar *d_v; 110 if (eval_mode != CEED_EVAL_WEIGHT) { 111 CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 112 } 113 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 114 115 // Clear v for transpose operation 116 if (t_mode == CEED_TRANSPOSE) { 117 CeedSize length; 118 CeedCallBackend(CeedVectorGetLength(v, &length)); 119 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 120 } 121 122 // Apply basis operation 123 switch (eval_mode) { 124 case CEED_EVAL_INTERP: { 125 void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp, &d_u, &d_v}; 126 if (transpose) { 127 CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Interp, grid, num_nodes, 1, elems_per_block, interp_args)); 128 } else { 129 CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Interp, grid, num_qpts, 1, elems_per_block, interp_args)); 130 } 131 } break; 132 case CEED_EVAL_GRAD: { 133 void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_grad, &d_u, &d_v}; 134 if (transpose) { 135 CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Grad, grid, num_nodes, 1, elems_per_block, grad_args)); 136 } else { 137 CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Grad, grid, num_qpts, 1, elems_per_block, grad_args)); 138 } 139 } break; 140 case CEED_EVAL_WEIGHT: { 141 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 142 CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 143 } break; 144 // LCOV_EXCL_START 145 // Evaluate the divergence to/from the quadrature points 146 case CEED_EVAL_DIV: 147 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 148 // Evaluate the curl to/from the quadrature points 149 case CEED_EVAL_CURL: 150 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 151 // Take no action, BasisApply should not have been called 152 case CEED_EVAL_NONE: 153 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 154 // LCOV_EXCL_STOP 155 } 156 157 // Restore vectors 158 if (eval_mode != CEED_EVAL_WEIGHT) { 159 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 160 } 161 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 162 return CEED_ERROR_SUCCESS; 163 } 164 165 //------------------------------------------------------------------------------ 166 // Destroy tensor basis 167 //------------------------------------------------------------------------------ 168 static int CeedBasisDestroy_Cuda(CeedBasis basis) { 169 Ceed ceed; 170 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 171 172 CeedBasis_Cuda *data; 173 CeedCallBackend(CeedBasisGetData(basis, &data)); 174 175 CeedCallCuda(ceed, cuModuleUnload(data->module)); 176 177 CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 178 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 179 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 180 CeedCallBackend(CeedFree(&data)); 181 182 return CEED_ERROR_SUCCESS; 183 } 184 185 //------------------------------------------------------------------------------ 186 // Destroy non-tensor basis 187 //------------------------------------------------------------------------------ 188 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 189 Ceed ceed; 190 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 191 192 CeedBasisNonTensor_Cuda *data; 193 CeedCallBackend(CeedBasisGetData(basis, &data)); 194 195 CeedCallCuda(ceed, cuModuleUnload(data->module)); 196 197 CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 198 CeedCallCuda(ceed, cudaFree(data->d_interp)); 199 CeedCallCuda(ceed, cudaFree(data->d_grad)); 200 CeedCallBackend(CeedFree(&data)); 201 202 return CEED_ERROR_SUCCESS; 203 } 204 205 //------------------------------------------------------------------------------ 206 // Create tensor 207 //------------------------------------------------------------------------------ 208 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 209 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 210 Ceed ceed; 211 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 212 CeedBasis_Cuda *data; 213 CeedCallBackend(CeedCalloc(1, &data)); 214 215 // Copy data to GPU 216 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 217 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 218 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 219 220 const CeedInt interp_bytes = q_bytes * P_1d; 221 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 222 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 223 224 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 225 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 226 227 // Compile basis kernels 228 CeedInt num_comp; 229 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 230 char *basis_kernel_path, *basis_kernel_source; 231 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path)); 232 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 233 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 234 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 235 CeedCallBackend(CeedCompileCuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 236 num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 237 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 238 CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp)); 239 CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad)); 240 CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight)); 241 CeedCallBackend(CeedFree(&basis_kernel_path)); 242 CeedCallBackend(CeedFree(&basis_kernel_source)); 243 244 CeedCallBackend(CeedBasisSetData(basis, data)); 245 246 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 247 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 248 return CEED_ERROR_SUCCESS; 249 } 250 251 //------------------------------------------------------------------------------ 252 // Create non-tensor 253 //------------------------------------------------------------------------------ 254 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 255 const CeedScalar *qref, const CeedScalar *q_weight, CeedBasis basis) { 256 Ceed ceed; 257 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 258 CeedBasisNonTensor_Cuda *data; 259 CeedCallBackend(CeedCalloc(1, &data)); 260 261 // Copy basis data to GPU 262 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 263 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 264 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 265 266 const CeedInt interp_bytes = q_bytes * num_nodes; 267 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 268 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 269 270 const CeedInt grad_bytes = q_bytes * num_nodes * dim; 271 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 272 CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 273 274 // Compile basis kernels 275 CeedInt num_comp; 276 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 277 char *basis_kernel_path, *basis_kernel_source; 278 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 279 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 280 CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 281 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 282 CeedCallCuda(ceed, CeedCompileCuda(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim, 283 "BASIS_NUM_COMP", num_comp)); 284 CeedCallCuda(ceed, CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp)); 285 CeedCallCuda(ceed, CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad)); 286 CeedCallCuda(ceed, CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight)); 287 CeedCallBackend(CeedFree(&basis_kernel_path)); 288 CeedCallBackend(CeedFree(&basis_kernel_source)); 289 290 CeedCallBackend(CeedBasisSetData(basis, data)); 291 292 // Register backend functions 293 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 294 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 295 return CEED_ERROR_SUCCESS; 296 } 297 //------------------------------------------------------------------------------ 298