1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#include <ceed/ceed.h>
#include <ceed/backend.h>
#include <ceed/jit-tools.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stddef.h>
#include "ceed-cuda-shared.h"
#include "../cuda/ceed-cuda-compile.h"

//------------------------------------------------------------------------------
// Device initialization
//------------------------------------------------------------------------------
// Cache the 1D interp (and grad) matrices for kernel use; defined in the
// companion .cu file for this backend.
int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d,
                       CeedScalar **c_B);
int CeedCudaInitInterpGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d,
                           CeedInt Q_1d, CeedScalar **c_B_ptr,
                           CeedScalar **c_G_ptr);

//------------------------------------------------------------------------------
// Grid size helper: ceiling division, i.e. the number of thread blocks needed
// to cover num_elem elements at elems_per_block elements per block.
// Requires elems_per_block > 0.
//------------------------------------------------------------------------------
static CeedInt CeedCudaGridDim(CeedInt num_elem, CeedInt elems_per_block) {
  return (num_elem + elems_per_block - 1) / elems_per_block;
}

//------------------------------------------------------------------------------
// Apply basis
//
// Dispatches the shared-memory tensor-product basis kernels (Interp, Grad,
// Weight) for 1D/2D/3D bases. Thread-block shape is (thread_1d, [thread_1d,]
// elems_per_block) with dynamic shared memory sized for the per-block work
// buffer. CEED_EVAL_DIV/CURL/NONE are rejected with a backend error.
//------------------------------------------------------------------------------
int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem,
                                     CeedTransposeMode t_mode,
                                     CeedEvalMode eval_mode, CeedVector u,
                                     CeedVector v) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  Ceed_Cuda *ceed_Cuda;
  // Bug fix: the return values of CeedGetData/CeedBasisGetData were previously
  // discarded and a stale ierr was checked instead
  ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
  CeedBasis_Cuda_shared *data;
  ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
  const CeedInt transpose = t_mode == CEED_TRANSPOSE;
  CeedInt dim, num_comp;
  ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
  ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);

  // Read vectors
  const CeedScalar *d_u;
  CeedScalar *d_v;
  if (eval_mode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u);
    CeedChkBackend(ierr);
  }
  ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);

  // Clear v for transpose mode; the transpose kernels accumulate into v
  if (t_mode == CEED_TRANSPOSE) {
    CeedInt length;
    ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
    // cudaMemset returns a cudaError_t, so check it in the CUDA error domain
    // (CeedChk_Cu), not with CeedChkBackend
    ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar));
    CeedChk_Cu(ceed, ierr);
  }

  // Apply basis operation
  switch (eval_mode) {
  case CEED_EVAL_INTERP: {
    CeedInt P_1d, Q_1d;
    ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
    CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
    ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B);
    CeedChkBackend(ierr);
    void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B,
                           &d_u, &d_v
                          };
    if (dim == 1) {
      // Batch elements into blockDim.z, capped so total threads stay <= 512
      // and within the device limit on the z dimension
      CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
                                           CeedIntMax(512 / thread_1d, 1));
      CeedInt grid = CeedCudaGridDim(num_elem, elems_per_block);
      CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1,
                                        elems_per_block, shared_mem,
                                        interp_args); CeedChkBackend(ierr);
    } else if (dim == 2) {
      // Hand-tuned element batch sizes indexed by thread_1d; must be >= 1
      const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ?
                                           opt_elems[thread_1d] / num_comp : 1, 1);
      CeedInt grid = CeedCudaGridDim(num_elem, elems_per_block);
      CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*
                           sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
                                        thread_1d,
                                        num_comp*elems_per_block, shared_mem,
                                        interp_args); CeedChkBackend(ierr);
    } else if (dim == 3) {
      CeedInt elems_per_block = 1;
      CeedInt grid = CeedCudaGridDim(num_elem, elems_per_block);
      CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*
                           sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
                                        thread_1d,
                                        num_comp*elems_per_block, shared_mem,
                                        interp_args); CeedChkBackend(ierr);
    }
  } break;
  case CEED_EVAL_GRAD: {
    CeedInt P_1d, Q_1d;
    ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
    CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
    ierr = CeedCudaInitInterpGrad(data->d_interp_1d, data->d_grad_1d, P_1d,
                                  Q_1d, &data->c_B, &data->c_G);
    CeedChkBackend(ierr);
    void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B,
                         &data->c_G, &d_u, &d_v
                        };
    if (dim == 1) {
      // Same batching strategy as CEED_EVAL_INTERP, dim == 1
      CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
                                           CeedIntMax(512 / thread_1d, 1));
      CeedInt grid = CeedCudaGridDim(num_elem, elems_per_block);
      CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1,
                                        elems_per_block, shared_mem, grad_args);
      CeedChkBackend(ierr);
    } else if (dim == 2) {
      // Hand-tuned element batch sizes indexed by thread_1d; must be >= 1
      const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ?
                                           opt_elems[thread_1d] / num_comp : 1, 1);
      CeedInt grid = CeedCudaGridDim(num_elem, elems_per_block);
      CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*
                           sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d,
                                        thread_1d,
                                        num_comp*elems_per_block, shared_mem,
                                        grad_args); CeedChkBackend(ierr);
    } else if (dim == 3) {
      CeedInt elems_per_block = 1;
      CeedInt grid = CeedCudaGridDim(num_elem, elems_per_block);
      CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*
                           sizeof(CeedScalar);
      ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d,
                                        thread_1d,
                                        num_comp*elems_per_block, shared_mem,
                                        grad_args); CeedChkBackend(ierr);
    }
  } break;
  case CEED_EVAL_WEIGHT: {
    CeedInt Q_1d;
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
    void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d,
                           &d_v
                          };
    if (dim == 1) {
      // Clamp to >= 1: 32 / Q_1d is 0 for Q_1d > 32, which previously caused
      // a division by zero in the grid computation (dim == 2 already guarded)
      const CeedInt elems_per_block = CeedIntMax(32 / Q_1d, 1);
      const CeedInt gridsize = CeedCudaGridDim(num_elem, elems_per_block);
      ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d,
                                  elems_per_block, 1, weight_args);
      CeedChkBackend(ierr);
    } else if (dim == 2) {
      const CeedInt opt_elems = 32 / (Q_1d * Q_1d);
      const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
      const CeedInt gridsize = CeedCudaGridDim(num_elem, elems_per_block);
      ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d,
                                  elems_per_block, weight_args);
      CeedChkBackend(ierr);
    } else if (dim == 3) {
      // One element per block; block is Q_1d^3 threads
      const CeedInt gridsize = num_elem;
      ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, Q_1d,
                                  weight_args);
      CeedChkBackend(ierr);
    }
  } break;
  // LCOV_EXCL_START
  // Evaluate the divergence to/from the quadrature points
  case CEED_EVAL_DIV:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
  // Evaluate the curl to/from the quadrature points
  case CEED_EVAL_CURL:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
  // Take no action, BasisApply should not have been called
  case CEED_EVAL_NONE:
    return CeedError(ceed, CEED_ERROR_BACKEND,
                     "CEED_EVAL_NONE does not make sense in this context");
  // LCOV_EXCL_STOP
  }

  // Restore vectors
  if (eval_mode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
  }
  ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Destroy basis
//
// Unloads the compiled CUDA module and frees all device-side basis data.
//------------------------------------------------------------------------------
static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);

  CeedBasis_Cuda_shared *data;
  ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);

  CeedChk_Cu(ceed, cuModuleUnload(data->module));

  // cudaFree(NULL) is a no-op, so d_collo_grad_1d is safe even when it was
  // never allocated (dim != 3 or Q_1d < P_1d)
  ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr);

  ierr = CeedFree(&data); CeedChkBackend(ierr);

  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create tensor basis
//
// Copies the 1D quadrature weights, interp, and grad matrices to the device,
// optionally builds the collocated gradient (3D with Q_1d >= P_1d), JIT
// compiles the shared-memory basis kernels, and registers the backend
// Apply/Destroy functions on the basis object.
//------------------------------------------------------------------------------
int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
                                        const CeedScalar *interp_1d,
                                        const CeedScalar *grad_1d,
                                        const CeedScalar *q_ref_1d,
                                        const CeedScalar *q_weight_1d,
                                        CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  CeedBasis_Cuda_shared *data;
  ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);

  // Copy basis data to GPU
  const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
  ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // interp_1d and grad_1d are both Q_1d x P_1d
  const CeedInt interp_bytes = q_bytes * P_1d;
  ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // Compute collocated gradient and copy to GPU
  data->d_collo_grad_1d = NULL;
  if (dim == 3 && Q_1d >= P_1d) {
    CeedScalar *collo_grad_1d;
    ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr);
    ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d);
    CeedChkBackend(ierr);
    ierr = cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d);
    CeedChk_Cu(ceed, ierr);
    ierr = cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d,
                      cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
    ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr);
  }

  // Compile basis kernels with per-basis compile-time constants
  CeedInt num_comp;
  ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
  char *basis_kernel_path, *basis_kernel_source;
  ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-shared-basis.h",
                             &basis_kernel_path); CeedChkBackend(ierr);
  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
  ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
  CeedChkBackend(ierr);
  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n");
  ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8,
                         "BASIS_Q_1D", Q_1d,
                         "BASIS_P_1D", P_1d,
                         "BASIS_T_1D", CeedIntMax(Q_1d, P_1d),
                         "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ?
                             Q_1d : P_1d, dim),
                         "BASIS_DIM", dim,
                         "BASIS_NUM_COMP", num_comp,
                         "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                         "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)
                        ); CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight);
  CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
  ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);

  ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);

  // Register backend functions
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
                                CeedBasisApplyTensor_Cuda_shared);
  CeedChkBackend(ierr);
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
                                CeedBasisDestroy_Cuda_shared);
  CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
//------------------------------------------------------------------------------