1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#include <ceed/ceed.h>
#include <ceed/backend.h>
#include <ceed/jit-tools.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "ceed-cuda-ref.h"
#include "../cuda/ceed-cuda-compile.h"

//------------------------------------------------------------------------------
// Basis apply - tensor
//
// Applies a tensor-product basis action (interp, grad, or quadrature weights)
// for num_elem elements on the device, launching the JIT-compiled kernel
// stored in the CeedBasis_Cuda backend data.
//
//   basis     - tensor-product basis whose backend data holds the CUDA module
//   num_elem  - number of elements to process
//   t_mode    - CEED_NOTRANSPOSE (nodes -> qpts) or CEED_TRANSPOSE
//   eval_mode - CEED_EVAL_INTERP / CEED_EVAL_GRAD / CEED_EVAL_WEIGHT;
//               DIV, CURL, and NONE return a backend error
//   u         - input vector (unused for CEED_EVAL_WEIGHT)
//   v         - output vector
//
// Returns CEED_ERROR_SUCCESS or a backend error code.
//------------------------------------------------------------------------------
int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem,
                        CeedTransposeMode t_mode, CeedEvalMode eval_mode,
                        CeedVector u, CeedVector v) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  Ceed_Cuda *ceed_Cuda;
  ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
  CeedBasis_Cuda *data;
  ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
  // Kernels receive transpose as a CeedInt flag (address is placed in the
  // kernel argument arrays below), so it must stay a CeedInt lvalue.
  const CeedInt transpose = t_mode == CEED_TRANSPOSE;
  // Upper bound on 1-D thread-block size for the interp/grad launches (32
  // threads, i.e. one warp).
  const int max_block_size = 32;

  // Read vectors
  // u is not read in weight mode (weights depend only on the basis data).
  const CeedScalar *d_u;
  CeedScalar *d_v;
  if (eval_mode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
  }
  ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);

  // Clear v for transpose operation
  // GetArrayWrite provides uninitialized memory; in transpose mode the kernels
  // presumably accumulate element contributions into v, so zero it first.
  if (t_mode == CEED_TRANSPOSE) {
    CeedInt length;
    ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
    ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar));
    CeedChk_Cu(ceed, ierr);
  }
  CeedInt Q_1d, dim;
  ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
  ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);

  // Basis action
  // Kernel argument arrays hold the ADDRESSES of the locals above; they must
  // not outlive this switch statement's launches.
  switch (eval_mode) {
  case CEED_EVAL_INTERP: {
    void *interp_args[] = {(void *) &num_elem, (void *) &transpose,
                           &data->d_interp_1d, &d_u, &d_v
                          };
    // One thread per quadrature point (Q_1d^dim), capped at one warp.
    CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

    ierr = CeedRunKernelCuda(ceed, data->Interp, num_elem, block_size, interp_args);
    CeedChkBackend(ierr);
  } break;
  case CEED_EVAL_GRAD: {
    void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d,
                         &data->d_grad_1d, &d_u, &d_v
                        };
    // Grad always launches the full warp-sized block.
    CeedInt block_size = max_block_size;

    ierr = CeedRunKernelCuda(ceed, data->Grad, num_elem, block_size, grad_args);
    CeedChkBackend(ierr);
  } break;
  case CEED_EVAL_WEIGHT: {
    void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v};
    // One block per element; block is Q_1d x Q_1d in 2-D/3-D, Q_1d x 1 in 1-D.
    const int grid_size = num_elem;
    ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid_size,
                                Q_1d, dim >= 2 ? Q_1d : 1, 1,
                                weight_args); CeedChkBackend(ierr);
  } break;
  // LCOV_EXCL_START
  // Evaluate the divergence to/from the quadrature points
  case CEED_EVAL_DIV:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
  // Evaluate the curl to/from the quadrature points
  case CEED_EVAL_CURL:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
  // Take no action, BasisApply should not have been called
  case CEED_EVAL_NONE:
    return CeedError(ceed, CEED_ERROR_BACKEND,
                     "CEED_EVAL_NONE does not make sense in this context");
  // LCOV_EXCL_STOP
  }

  // Restore vectors
  // NOTE(review): the error returns above skip these Restore calls; backend
  // errors appear to be treated as fatal here — confirm against callers.
  if (eval_mode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
  }
  ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Basis apply - non-tensor
//
// Non-tensor analogue of CeedBasisApply_Cuda: applies interp, grad, or
// quadrature weights for num_elem elements using the non-tensor kernels.
// Parameters and return value mirror CeedBasisApply_Cuda; DIV, CURL, and
// NONE return a backend error.
//------------------------------------------------------------------------------
int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem,
                                 CeedTransposeMode t_mode, CeedEvalMode eval_mode,
                                 CeedVector u, CeedVector v) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  Ceed_Cuda *ceed_Cuda;
  ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
  CeedBasisNonTensor_Cuda *data;
  ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
  CeedInt num_nodes, num_qpts;
  ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChkBackend(ierr);
  ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChkBackend(ierr);
  const CeedInt transpose = t_mode == CEED_TRANSPOSE;
  // grid = ceil(num_elem / elems_per_block); with elems_per_block == 1 this
  // is simply num_elem, but the formula keeps the batching knob adjustable.
  int elems_per_block = 1;
  int grid = num_elem / elems_per_block +
             ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);

  // Read vectors
  const CeedScalar *d_u;
  CeedScalar *d_v;
  if (eval_mode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
  }
  ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);

  // Clear v for transpose operation
  // GetArrayWrite yields uninitialized memory; zero it so transpose-mode
  // accumulation starts from zero.
  if (t_mode == CEED_TRANSPOSE) {
    CeedInt length;
    ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
    ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar));
    CeedChk_Cu(ceed, ierr);
  }

  // Apply basis operation
  // Block x-dimension is one thread per output entry: num_nodes outputs in
  // transpose mode, num_qpts otherwise.
  switch (eval_mode) {
  case CEED_EVAL_INTERP: {
    void *interp_args[] = {(void *) &num_elem, (void *) &transpose,
                           &data->d_interp, &d_u, &d_v
                          };
    if (transpose) {
      ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_nodes, 1,
                                  elems_per_block, interp_args); CeedChkBackend(ierr);
    } else {
      ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_qpts, 1,
                                  elems_per_block, interp_args); CeedChkBackend(ierr);
    }
  } break;
  case CEED_EVAL_GRAD: {
    void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_grad,
                         &d_u, &d_v
                        };
    if (transpose) {
      ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_nodes, 1,
                                  elems_per_block, grad_args); CeedChkBackend(ierr);
    } else {
      ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_qpts, 1,
                                  elems_per_block, grad_args); CeedChkBackend(ierr);
    }
  } break;
  case CEED_EVAL_WEIGHT: {
    void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight, &d_v};
    ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid, num_qpts, 1,
                                elems_per_block, weight_args); CeedChkBackend(ierr);
  } break;
  // LCOV_EXCL_START
  // Evaluate the divergence to/from the quadrature points
  case CEED_EVAL_DIV:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
  // Evaluate the curl to/from the quadrature points
  case CEED_EVAL_CURL:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
  // Take no action, BasisApply should not have been called
  case CEED_EVAL_NONE:
    return CeedError(ceed, CEED_ERROR_BACKEND,
                     "CEED_EVAL_NONE does not make sense in this context");
  // LCOV_EXCL_STOP
  }

  // Restore vectors
  if (eval_mode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
  }
  ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Destroy tensor basis
//
// Releases the JIT-compiled CUDA module and the device copies of the 1-D
// quadrature weights, interpolation matrix, and gradient matrix, then frees
// the backend data struct itself.
//------------------------------------------------------------------------------
static int CeedBasisDestroy_Cuda(CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);

  CeedBasis_Cuda *data;
  ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);

  // cuModuleUnload is a driver-API call returning a CUresult.
  CeedChk_Cu(ceed, cuModuleUnload(data->module));

  ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr);
  ierr = CeedFree(&data); CeedChkBackend(ierr);

  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Destroy non-tensor basis
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Destroy non-tensor basis
//
// Releases the JIT-compiled CUDA module and the device copies of the
// quadrature weights, interpolation matrix, and gradient matrix, then frees
// the backend data struct itself.
//------------------------------------------------------------------------------
static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);

  CeedBasisNonTensor_Cuda *data;
  ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);

  // cuModuleUnload is a driver-API call returning a CUresult.
  CeedChk_Cu(ceed, cuModuleUnload(data->module));

  ierr = cudaFree(data->d_q_weight); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_interp); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_grad); CeedChk_Cu(ceed, ierr);
  ierr = CeedFree(&data); CeedChkBackend(ierr);

  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create tensor
//
// Builds the CUDA backend data for a tensor-product H1 basis: copies the 1-D
// quadrature weights, interpolation matrix, and gradient matrix to the
// device, JIT-compiles the tensor basis kernels with the basis sizes baked in
// as macros, and registers the Apply/Destroy backend functions.
//
//   dim         - topological dimension
//   P_1d        - number of 1-D nodes
//   Q_1d        - number of 1-D quadrature points
//   interp_1d   - host Q_1d x P_1d interpolation matrix
//   grad_1d     - host Q_1d x P_1d gradient matrix
//   q_ref_1d    - host 1-D reference coordinates (not copied to device here)
//   q_weight_1d - host 1-D quadrature weights
//   basis       - basis object to populate
//
// Returns CEED_ERROR_SUCCESS or a backend error code.
//------------------------------------------------------------------------------
int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
                                 const CeedScalar *interp_1d,
                                 const CeedScalar *grad_1d,
                                 const CeedScalar *q_ref_1d,
                                 const CeedScalar *q_weight_1d,
                                 CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  CeedBasis_Cuda *data;
  ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);

  // Copy data to GPU
  const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
  ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // Interp and grad matrices are both Q_1d x P_1d.
  const CeedInt interp_bytes = q_bytes * P_1d;
  ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // Compile basis kernels
  // Sizes are injected as preprocessor macros so the JIT can fully unroll;
  // BASIS_BUF_LEN sizes the scratch buffer at max(P_1d, Q_1d)^dim per comp.
  CeedInt num_comp;
  ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
  char *basis_kernel_path, *basis_kernel_source;
  ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-ref-basis-tensor.h",
                             &basis_kernel_path); CeedChkBackend(ierr);
  ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
  CeedChkBackend(ierr);
  ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 7,
                         "BASIS_Q1D", Q_1d,
                         "BASIS_P1D", P_1d,
                         "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ?
                             Q_1d : P_1d, dim),
                         "BASIS_DIM", dim,
                         "BASIS_NCOMP", num_comp,
                         "BASIS_ELEMSIZE", CeedIntPow(P_1d, dim),
                         "BASIS_NQPT", CeedIntPow(Q_1d, dim)
                        ); CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight);
  CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);

  ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);

  // Register backend functions
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
                                CeedBasisApply_Cuda); CeedChkBackend(ierr);
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
                                CeedBasisDestroy_Cuda); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create non-tensor
//
// Non-tensor analogue of CeedBasisCreateTensorH1_Cuda: copies the quadrature
// weights, interpolation matrix (num_qpts x num_nodes), and gradient matrix
// (num_qpts x num_nodes x dim) to the device, JIT-compiles the non-tensor
// basis kernels, and registers the Apply/Destroy backend functions.
//
// Returns CEED_ERROR_SUCCESS or a backend error code.
//------------------------------------------------------------------------------
int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim,
                           CeedInt num_nodes,
                           CeedInt num_qpts, const CeedScalar *interp,
                           const CeedScalar *grad, const CeedScalar *qref,
                           const CeedScalar *q_weight, CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  CeedBasisNonTensor_Cuda *data;
  ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);

  // Copy basis data to GPU
  const CeedInt q_bytes = num_qpts * sizeof(CeedScalar);
  ierr = cudaMalloc((void **)&data->d_q_weight, q_bytes); CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_q_weight, q_weight, q_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  const CeedInt interp_bytes = q_bytes * num_nodes;
  ierr = cudaMalloc((void **)&data->d_interp, interp_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_interp, interp, interp_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  const CeedInt grad_bytes = q_bytes * num_nodes * dim;
  ierr = cudaMalloc((void **)&data->d_grad, grad_bytes); CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_grad, grad, grad_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // Compile basis kernels
  CeedInt num_comp;
  ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
  char *basis_kernel_path, *basis_kernel_source;
  ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-ref-basis-nontensor.h",
                             &basis_kernel_path); CeedChkBackend(ierr);
  ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
  CeedChkBackend(ierr);
  // CeedCompileCuda and CeedGetKernelCuda return libCEED error codes, not
  // CUresult values, so they are checked with CeedChkBackend (matching the
  // tensor create path above), not CeedChk_Cu.
  ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 4,
                         "Q", num_qpts,
                         "P", num_nodes,
                         "BASIS_DIM", dim,
                         "BASIS_NCOMP", num_comp
                        ); CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight);
  CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);

  ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);

  // Register backend functions
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
                                CeedBasisApplyNonTensor_Cuda); CeedChkBackend(ierr);
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
                                CeedBasisDestroyNonTensor_Cuda); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
//------------------------------------------------------------------------------