1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed/ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include "ceed-cuda-ref.h" 14 #include "../cuda/ceed-cuda-compile.h" 15 16 //------------------------------------------------------------------------------ 17 // Basis apply - tensor 18 //------------------------------------------------------------------------------ 19 int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, 20 CeedTransposeMode t_mode, CeedEvalMode eval_mode, 21 CeedVector u, CeedVector v) { 22 int ierr; 23 Ceed ceed; 24 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 25 Ceed_Cuda *ceed_Cuda; 26 ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 27 CeedBasis_Cuda *data; 28 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 29 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 30 const int max_block_size = 32; 31 32 // Read vectors 33 const CeedScalar *d_u; 34 CeedScalar *d_v; 35 if (eval_mode != CEED_EVAL_WEIGHT) { 36 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 37 } 38 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 39 40 // Clear v for transpose operation 41 if (t_mode == CEED_TRANSPOSE) { 42 CeedSize length; 43 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 44 ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); 45 CeedChk_Cu(ceed, ierr); 46 } 47 CeedInt Q_1d, dim; 48 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 49 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 50 51 // Basis action 52 switch (eval_mode) { 53 case CEED_EVAL_INTERP: { 54 void *interp_args[] = {(void *) &num_elem, (void *) &transpose, 55 &data->d_interp_1d, &d_u, &d_v 56 }; 57 CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 58 59 ierr = CeedRunKernelCuda(ceed, data->Interp, num_elem, block_size, interp_args); 60 CeedChkBackend(ierr); 61 } break; 62 case CEED_EVAL_GRAD: { 63 void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 64 &data->d_grad_1d, &d_u, &d_v 65 }; 66 CeedInt block_size = max_block_size; 67 68 ierr = CeedRunKernelCuda(ceed, data->Grad, num_elem, block_size, grad_args); 69 CeedChkBackend(ierr); 70 } break; 71 case CEED_EVAL_WEIGHT: { 72 void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 73 const int grid_size = num_elem; 74 ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid_size, 75 Q_1d, dim >= 2 ? Q_1d : 1, 1, 76 weight_args); CeedChkBackend(ierr); 77 } break; 78 // LCOV_EXCL_START 79 // Evaluate the divergence to/from the quadrature points 80 case CEED_EVAL_DIV: 81 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 82 // Evaluate the curl to/from the quadrature points 83 case CEED_EVAL_CURL: 84 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 85 // Take no action, BasisApply should not have been called 86 case CEED_EVAL_NONE: 87 return CeedError(ceed, CEED_ERROR_BACKEND, 88 "CEED_EVAL_NONE does not make sense in this context"); 89 // LCOV_EXCL_STOP 90 } 91 92 // Restore vectors 93 if (eval_mode != CEED_EVAL_WEIGHT) { 94 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 95 } 96 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 97 return CEED_ERROR_SUCCESS; 98 } 99 100 //------------------------------------------------------------------------------ 101 // Basis apply - non-tensor 102 //------------------------------------------------------------------------------ 103 int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, 104 CeedTransposeMode t_mode, CeedEvalMode eval_mode, 105 CeedVector u, CeedVector v) { 106 int ierr; 107 Ceed ceed; 108 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 109 Ceed_Cuda *ceed_Cuda; 110 ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 111 CeedBasisNonTensor_Cuda *data; 112 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 113 CeedInt num_nodes, num_qpts; 114 ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChkBackend(ierr); 115 ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChkBackend(ierr); 116 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 117 int elems_per_block = 1; 118 int grid = num_elem / elems_per_block + 119 ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 120 121 // Read vectors 122 const CeedScalar *d_u; 123 CeedScalar *d_v; 124 if (eval_mode != CEED_EVAL_WEIGHT) { 125 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 126 } 127 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 128 129 // Clear v for transpose operation 130 if (t_mode == CEED_TRANSPOSE) { 131 CeedSize length; 132 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 133 ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); 134 CeedChk_Cu(ceed, ierr); 135 } 136 137 // Apply basis operation 138 switch (eval_mode) { 139 case CEED_EVAL_INTERP: { 140 void *interp_args[] = {(void *) &num_elem, (void *) &transpose, 141 &data->d_interp, &d_u, &d_v 142 }; 143 if (transpose) { 144 ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_nodes, 1, 145 elems_per_block, interp_args); CeedChkBackend(ierr); 146 } else { 147 ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_qpts, 1, 148 elems_per_block, interp_args); CeedChkBackend(ierr); 149 } 150 } break; 151 case CEED_EVAL_GRAD: { 152 void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_grad, 153 &d_u, &d_v 154 }; 155 if (transpose) { 156 ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_nodes, 1, 157 elems_per_block, grad_args); CeedChkBackend(ierr); 158 } else { 159 ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_qpts, 1, 160 elems_per_block, grad_args); CeedChkBackend(ierr); 161 } 162 } break; 163 case CEED_EVAL_WEIGHT: { 164 void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight, &d_v}; 165 ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid, num_qpts, 1, 166 elems_per_block, weight_args); CeedChkBackend(ierr); 167 } break; 168 // LCOV_EXCL_START 169 // Evaluate the divergence to/from the quadrature points 170 case CEED_EVAL_DIV: 171 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 172 // Evaluate the curl to/from the quadrature points 173 case CEED_EVAL_CURL: 174 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 175 // Take no action, BasisApply should not have been called 176 case CEED_EVAL_NONE: 177 return CeedError(ceed, CEED_ERROR_BACKEND, 178 "CEED_EVAL_NONE does not make sense in this context"); 179 // LCOV_EXCL_STOP 180 } 181 182 // Restore vectors 183 if (eval_mode != CEED_EVAL_WEIGHT) { 184 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 185 } 186 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 187 return CEED_ERROR_SUCCESS; 188 } 189 190 //------------------------------------------------------------------------------ 191 // Destroy tensor basis 192 //------------------------------------------------------------------------------ 193 static int CeedBasisDestroy_Cuda(CeedBasis basis) { 194 int ierr; 195 Ceed ceed; 196 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 197 198 CeedBasis_Cuda *data; 199 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 200 201 CeedChk_Cu(ceed, cuModuleUnload(data->module)); 202 203 ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr); 204 ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr); 205 ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr); 206 ierr = CeedFree(&data); CeedChkBackend(ierr); 207 208 return CEED_ERROR_SUCCESS; 209 } 210 211 //------------------------------------------------------------------------------ 212 // Destroy non-tensor basis 213 //------------------------------------------------------------------------------ 214 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 215 int ierr; 216 Ceed ceed; 217 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 218 219 CeedBasisNonTensor_Cuda *data; 220 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 221 222 CeedChk_Cu(ceed, cuModuleUnload(data->module)); 223 224 ierr = cudaFree(data->d_q_weight); CeedChk_Cu(ceed, ierr); 225 ierr = cudaFree(data->d_interp); CeedChk_Cu(ceed, ierr); 226 ierr = cudaFree(data->d_grad); CeedChk_Cu(ceed, ierr); 227 ierr = CeedFree(&data); CeedChkBackend(ierr); 228 229 return CEED_ERROR_SUCCESS; 230 } 231 232 //------------------------------------------------------------------------------ 233 // Create tensor 234 //------------------------------------------------------------------------------ 235 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 236 const CeedScalar *interp_1d, 237 const CeedScalar *grad_1d, 238 const CeedScalar *q_ref_1d, 239 const CeedScalar *q_weight_1d, 240 CeedBasis basis) { 241 int ierr; 242 Ceed ceed; 243 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 244 CeedBasis_Cuda *data; 245 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 246 247 // Copy data to GPU 248 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 249 ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes); 250 CeedChk_Cu(ceed, ierr); 251 ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, 252 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 253 254 const CeedInt interp_bytes = q_bytes * P_1d; 255 ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes); 256 CeedChk_Cu(ceed, ierr); 257 ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, 258 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 259 260 ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes); 261 CeedChk_Cu(ceed, ierr); 262 ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, 263 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 264 265 // Complie basis kernels 266 CeedInt num_comp; 267 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 268 char *basis_kernel_path, *basis_kernel_source; 269 ierr = CeedGetInstalledJitPath(ceed, 270 "ceed-jit-source/cuda/cuda-ref-basis-tensor.h", 271 &basis_kernel_path); CeedChkBackend(ierr); 272 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 273 ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 274 CeedChkBackend(ierr); 275 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 276 ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 7, 277 "BASIS_Q_1D", Q_1d, 278 "BASIS_P_1D", P_1d, 279 "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 280 Q_1d : P_1d, dim), 281 "BASIS_DIM", dim, 282 "BASIS_NUM_COMP", num_comp, 283 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 284 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim) 285 ); CeedChkBackend(ierr); 286 ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); 287 CeedChkBackend(ierr); 288 ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); 289 CeedChkBackend(ierr); 290 ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); 291 CeedChkBackend(ierr); 292 ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 293 ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 294 295 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 296 297 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 298 CeedBasisApply_Cuda); CeedChkBackend(ierr); 299 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 300 CeedBasisDestroy_Cuda); CeedChkBackend(ierr); 301 return CEED_ERROR_SUCCESS; 302 } 303 304 //------------------------------------------------------------------------------ 305 // Create non-tensor 306 //------------------------------------------------------------------------------ 307 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, 308 CeedInt num_nodes, 309 CeedInt num_qpts, const CeedScalar *interp, 310 const CeedScalar *grad, const CeedScalar *qref, 311 const CeedScalar *q_weight, CeedBasis basis) { 312 int ierr; 313 Ceed ceed; 314 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 315 CeedBasisNonTensor_Cuda *data; 316 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 317 318 // Copy basis data to GPU 319 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 320 ierr = cudaMalloc((void **)&data->d_q_weight, q_bytes); CeedChk_Cu(ceed, ierr); 321 ierr = cudaMemcpy(data->d_q_weight, q_weight, q_bytes, 322 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 323 324 const CeedInt interp_bytes = q_bytes * num_nodes; 325 ierr = cudaMalloc((void **)&data->d_interp, interp_bytes); 326 CeedChk_Cu(ceed, ierr); 327 ierr = cudaMemcpy(data->d_interp, interp, interp_bytes, 328 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 329 330 const CeedInt grad_bytes = q_bytes * num_nodes * dim; 331 ierr = cudaMalloc((void **)&data->d_grad, grad_bytes); CeedChk_Cu(ceed, ierr); 332 ierr = cudaMemcpy(data->d_grad, grad, grad_bytes, 333 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 334 335 // Compile basis kernels 336 CeedInt num_comp; 337 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 338 char *basis_kernel_path, *basis_kernel_source; 339 ierr = CeedGetInstalledJitPath(ceed, 340 "ceed-jit-source/cuda/cuda-ref-basis-nontensor.h", 341 &basis_kernel_path); CeedChkBackend(ierr); 342 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 343 ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 344 CeedChkBackend(ierr); 345 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 346 ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 4, 347 "BASIS_Q", num_qpts, 348 "BASIS_P", num_nodes, 349 "BASIS_DIM", dim, 350 "BASIS_NUM_COMP", num_comp 351 ); CeedChk_Cu(ceed, ierr); 352 ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); 353 CeedChk_Cu(ceed, ierr); 354 ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); 355 CeedChk_Cu(ceed, ierr); 356 ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); 357 CeedChk_Cu(ceed, ierr); 358 ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 359 ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 360 361 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 362 363 // Register backend functions 364 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 365 CeedBasisApplyNonTensor_Cuda); CeedChkBackend(ierr); 366 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 367 CeedBasisDestroyNonTensor_Cuda); CeedChkBackend(ierr); 368 return CEED_ERROR_SUCCESS; 369 } 370 //------------------------------------------------------------------------------ 371