1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed/ceed.h> 18 #include <ceed/backend.h> 19 #include <ceed/jit-tools.h> 20 #include <cuda.h> 21 #include <cuda_runtime.h> 22 #include <stddef.h> 23 #include "ceed-cuda-shared.h" 24 #include "../cuda/ceed-cuda-compile.h" 25 26 27 //------------------------------------------------------------------------------ 28 // Device initalization 29 //------------------------------------------------------------------------------ 30 int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, 31 CeedScalar **c_B); 32 int CeedCudaInitInterpGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, 33 CeedInt Q_1d, CeedScalar **c_B_ptr, 34 CeedScalar **c_G_ptr); 35 36 //------------------------------------------------------------------------------ 37 // Apply basis 38 //------------------------------------------------------------------------------ 39 int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, 40 CeedTransposeMode t_mode, 41 CeedEvalMode eval_mode, CeedVector u, 42 CeedVector v) { 43 int ierr; 44 Ceed ceed; 45 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 46 Ceed_Cuda *ceed_Cuda; 47 CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 48 CeedBasis_Cuda_shared *data; 49 CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 50 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 51 CeedInt dim, num_comp; 52 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 53 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 54 55 // Read vectors 56 const CeedScalar *d_u; 57 CeedScalar *d_v; 58 if (eval_mode != CEED_EVAL_WEIGHT) { 59 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 60 } 61 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 62 63 // Clear v for transpose mode 64 if (t_mode == CEED_TRANSPOSE) { 65 CeedInt length; 66 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 67 ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChkBackend(ierr); 68 } 69 70 // Apply basis operation 71 switch (eval_mode) { 72 case CEED_EVAL_INTERP: { 73 CeedInt P_1d, Q_1d; 74 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 75 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 76 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 77 ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B); 78 CeedChkBackend(ierr); 79 void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B, 80 &d_u, &d_v 81 }; 82 if (dim == 1) { 83 CeedInt elems_per_block = 32; 84 CeedInt grid = num_elem/elems_per_block + 85 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 86 CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 87 ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1, 88 elems_per_block, shared_mem, 89 interp_args); CeedChkBackend(ierr); 90 } else if (dim == 2) { 91 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 92 // elems_per_block must be at least 1 93 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / 94 num_comp : 1, 1); 95 CeedInt grid = num_elem / elems_per_block + 96 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 97 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 98 CeedScalar); 99 ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 100 thread_1d, 101 num_comp*elems_per_block, shared_mem, 102 interp_args); CeedChkBackend(ierr); 103 } else if (dim == 3) { 104 CeedInt elems_per_block = 1; 105 CeedInt grid = num_elem / elems_per_block + 106 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 107 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 108 CeedScalar); 109 ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 110 thread_1d, 111 num_comp*elems_per_block, shared_mem, 112 interp_args); CeedChkBackend(ierr); 113 } 114 } break; 115 case CEED_EVAL_GRAD: { 116 CeedInt P_1d, Q_1d; 117 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 118 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 119 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 120 ierr = CeedCudaInitInterpGrad(data->d_interp_1d, data->d_grad_1d, P_1d, 121 Q_1d, &data->c_B, &data->c_G); 122 CeedChkBackend(ierr); 123 void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B, 124 &data->c_G, &d_u, &d_v 125 }; 126 if (dim == 1) { 127 CeedInt elems_per_block = 32; 128 CeedInt grid = num_elem / elems_per_block + 129 ((num_elem / elems_per_block*elems_per_block<num_elem) ? 1 : 0 ); 130 CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 131 ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1, 132 elems_per_block, shared_mem, grad_args); 133 CeedChkBackend(ierr); 134 } else if (dim == 2) { 135 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 136 // elems_per_block must be at least 1 137 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / 138 num_comp : 1, 1); 139 CeedInt grid = num_elem / elems_per_block + 140 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 141 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 142 CeedScalar); 143 ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, 144 num_comp*elems_per_block, shared_mem, 145 grad_args); CeedChkBackend(ierr); 146 } else if (dim == 3) { 147 CeedInt elems_per_block = 1; 148 CeedInt grid = num_elem / elems_per_block + 149 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 150 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 151 CeedScalar); 152 ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, 153 num_comp*elems_per_block, shared_mem, 154 grad_args); CeedChkBackend(ierr); 155 } 156 } break; 157 case CEED_EVAL_WEIGHT: { 158 CeedInt Q_1d; 159 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 160 void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 161 if (dim == 1) { 162 const CeedInt elems_per_block = 32 / Q_1d; 163 const CeedInt gridsize = num_elem / elems_per_block + 164 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 165 ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, 166 elems_per_block, 1, weight_args); 167 CeedChkBackend(ierr); 168 } else if (dim == 2) { 169 const CeedInt opt_elems = 32 / (Q_1d * Q_1d); 170 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 171 const CeedInt gridsize = num_elem / elems_per_block + 172 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 173 ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, 174 elems_per_block, weight_args); 175 CeedChkBackend(ierr); 176 } else if (dim == 3) { 177 const CeedInt gridsize = num_elem; 178 ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, Q_1d, 179 weight_args); 180 CeedChkBackend(ierr); 181 } 182 } break; 183 // LCOV_EXCL_START 184 // Evaluate the divergence to/from the quadrature points 185 case CEED_EVAL_DIV: 186 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 187 // Evaluate the curl to/from the quadrature points 188 case CEED_EVAL_CURL: 189 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 190 // Take no action, BasisApply should not have been called 191 case CEED_EVAL_NONE: 192 return CeedError(ceed, CEED_ERROR_BACKEND, 193 "CEED_EVAL_NONE does not make sense in this context"); 194 // LCOV_EXCL_STOP 195 } 196 197 // Restore vectors 198 if (eval_mode != CEED_EVAL_WEIGHT) { 199 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 200 } 201 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 202 return CEED_ERROR_SUCCESS; 203 } 204 205 //------------------------------------------------------------------------------ 206 // Destroy basis 207 //------------------------------------------------------------------------------ 208 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 209 int ierr; 210 Ceed ceed; 211 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 212 213 CeedBasis_Cuda_shared *data; 214 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 215 216 CeedChk_Cu(ceed, cuModuleUnload(data->module)); 217 218 ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr); 219 ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr); 220 ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr); 221 ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr); 222 223 ierr = CeedFree(&data); CeedChkBackend(ierr); 224 225 return CEED_ERROR_SUCCESS; 226 } 227 228 //------------------------------------------------------------------------------ 229 // Create tensor basis 230 //------------------------------------------------------------------------------ 231 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 232 const CeedScalar *interp_1d, 233 const CeedScalar *grad_1d, 234 const CeedScalar *q_ref_1d, 235 const CeedScalar *q_weight_1d, 236 CeedBasis basis) { 237 int ierr; 238 Ceed ceed; 239 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 240 CeedBasis_Cuda_shared *data; 241 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 242 243 // Copy basis data to GPU 244 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 245 ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes); 246 CeedChk_Cu(ceed, ierr); 247 ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, 248 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 249 250 const CeedInt interp_bytes = q_bytes * P_1d; 251 ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes); 252 CeedChk_Cu(ceed, ierr); 253 ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, 254 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 255 256 ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes); 257 CeedChk_Cu(ceed, ierr); 258 ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, 259 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 260 261 // Compute collocated gradient and copy to GPU 262 data->d_collo_grad_1d = NULL; 263 if (dim == 3 && Q_1d >= P_1d) { 264 CeedScalar *collo_grad_1d; 265 ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr); 266 ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr); 267 ierr = cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d); 268 CeedChk_Cu(ceed, ierr); 269 ierr = cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, 270 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 271 ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr); 272 } 273 274 // Compile basis kernels 275 CeedInt num_comp; 276 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 277 char *basis_kernel_path, *basis_kernel_source; 278 ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-shared-basis.h", 279 &basis_kernel_path); CeedChkBackend(ierr); 280 ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 281 CeedChkBackend(ierr); 282 ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8, 283 "Q1D", Q_1d, 284 "P1D", P_1d, 285 "T1D", CeedIntMax(Q_1d, P_1d), 286 "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 287 Q_1d : P_1d, dim), 288 "BASIS_DIM", dim, 289 "BASIS_NCOMP", num_comp, 290 "BASIS_ELEMSIZE", CeedIntPow(P_1d, dim), 291 "BASIS_NQPT", CeedIntPow(Q_1d, dim) 292 ); CeedChkBackend(ierr); 293 ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); 294 CeedChkBackend(ierr); 295 ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); 296 CeedChkBackend(ierr); 297 ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); 298 CeedChkBackend(ierr); 299 ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 300 ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 301 302 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 303 304 // Register backend functions 305 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 306 CeedBasisApplyTensor_Cuda_shared); 307 CeedChkBackend(ierr); 308 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 309 CeedBasisDestroy_Cuda_shared); CeedChkBackend(ierr); 310 return CEED_ERROR_SUCCESS; 311 } 312 //------------------------------------------------------------------------------ 313