1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed/ceed.h> 18 #include <ceed/backend.h> 19 #include <ceed/jit-tools.h> 20 #include <cuda.h> 21 #include <cuda_runtime.h> 22 #include <stddef.h> 23 #include "ceed-cuda-shared.h" 24 #include "../cuda/ceed-cuda-compile.h" 25 26 27 //------------------------------------------------------------------------------ 28 // Device initalization 29 //------------------------------------------------------------------------------ 30 int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, 31 CeedScalar **c_B); 32 int CeedCudaInitInterpGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, 33 CeedInt Q_1d, CeedScalar **c_B_ptr, 34 CeedScalar **c_G_ptr); 35 36 //------------------------------------------------------------------------------ 37 // Apply basis 38 //------------------------------------------------------------------------------ 39 int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, 40 CeedTransposeMode t_mode, 41 CeedEvalMode eval_mode, CeedVector u, 42 CeedVector v) { 43 int ierr; 44 Ceed ceed; 45 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 46 Ceed_Cuda *ceed_Cuda; 47 CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 48 CeedBasis_Cuda_shared *data; 49 CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 50 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 51 CeedInt dim, num_comp; 52 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 53 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 54 55 // Read vectors 56 const CeedScalar *d_u; 57 CeedScalar *d_v; 58 if (eval_mode != CEED_EVAL_WEIGHT) { 59 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 60 } 61 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 62 63 // Clear v for transpose mode 64 if (t_mode == CEED_TRANSPOSE) { 65 CeedInt length; 66 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 67 ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChkBackend(ierr); 68 } 69 70 // Apply basis operation 71 switch (eval_mode) { 72 case CEED_EVAL_INTERP: { 73 CeedInt P_1d, Q_1d; 74 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 75 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 76 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 77 ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B); 78 CeedChkBackend(ierr); 79 void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B, 80 &d_u, &d_v 81 }; 82 if (dim == 1) { 83 CeedInt elems_per_block = CeedIntMax(512 / thread_1d, 84 1); // avoid >512 total threads 85 CeedInt grid = num_elem/elems_per_block + 86 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 87 CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 88 ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1, 89 elems_per_block, shared_mem, 90 interp_args); CeedChkBackend(ierr); 91 } else if (dim == 2) { 92 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 93 // elems_per_block must be at least 1 94 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / 95 num_comp : 1, 1); 96 CeedInt grid = num_elem / elems_per_block + 97 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 98 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 99 CeedScalar); 100 ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 101 thread_1d, 102 num_comp*elems_per_block, shared_mem, 103 interp_args); CeedChkBackend(ierr); 104 } else if (dim == 3) { 105 CeedInt elems_per_block = 1; 106 CeedInt grid = num_elem / elems_per_block + 107 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 108 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 109 CeedScalar); 110 ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 111 thread_1d, 112 num_comp*elems_per_block, shared_mem, 113 interp_args); CeedChkBackend(ierr); 114 } 115 } break; 116 case CEED_EVAL_GRAD: { 117 CeedInt P_1d, Q_1d; 118 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 119 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 120 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 121 ierr = CeedCudaInitInterpGrad(data->d_interp_1d, data->d_grad_1d, P_1d, 122 Q_1d, &data->c_B, &data->c_G); 123 CeedChkBackend(ierr); 124 void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B, 125 &data->c_G, &d_u, &d_v 126 }; 127 if (dim == 1) { 128 CeedInt elems_per_block = CeedIntMax(512 / thread_1d, 129 1); // avoid >512 total threads 130 CeedInt grid = num_elem / elems_per_block + 131 ((num_elem / elems_per_block*elems_per_block<num_elem) ? 1 : 0 ); 132 CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 133 ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1, 134 elems_per_block, shared_mem, grad_args); 135 CeedChkBackend(ierr); 136 } else if (dim == 2) { 137 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 138 // elems_per_block must be at least 1 139 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / 140 num_comp : 1, 1); 141 CeedInt grid = num_elem / elems_per_block + 142 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 143 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 144 CeedScalar); 145 ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, 146 num_comp*elems_per_block, shared_mem, 147 grad_args); CeedChkBackend(ierr); 148 } else if (dim == 3) { 149 CeedInt elems_per_block = 1; 150 CeedInt grid = num_elem / elems_per_block + 151 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 152 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 153 CeedScalar); 154 ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, 155 num_comp*elems_per_block, shared_mem, 156 grad_args); CeedChkBackend(ierr); 157 } 158 } break; 159 case CEED_EVAL_WEIGHT: { 160 CeedInt Q_1d; 161 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 162 void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 163 if (dim == 1) { 164 const CeedInt elems_per_block = 32 / Q_1d; 165 const CeedInt gridsize = num_elem / elems_per_block + 166 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 167 ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, 168 elems_per_block, 1, weight_args); 169 CeedChkBackend(ierr); 170 } else if (dim == 2) { 171 const CeedInt opt_elems = 32 / (Q_1d * Q_1d); 172 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 173 const CeedInt gridsize = num_elem / elems_per_block + 174 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 175 ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, 176 elems_per_block, weight_args); 177 CeedChkBackend(ierr); 178 } else if (dim == 3) { 179 const CeedInt gridsize = num_elem; 180 ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, Q_1d, 181 weight_args); 182 CeedChkBackend(ierr); 183 } 184 } break; 185 // LCOV_EXCL_START 186 // Evaluate the divergence to/from the quadrature points 187 case CEED_EVAL_DIV: 188 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 189 // Evaluate the curl to/from the quadrature points 190 case CEED_EVAL_CURL: 191 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 192 // Take no action, BasisApply should not have been called 193 case CEED_EVAL_NONE: 194 return CeedError(ceed, CEED_ERROR_BACKEND, 195 "CEED_EVAL_NONE does not make sense in this context"); 196 // LCOV_EXCL_STOP 197 } 198 199 // Restore vectors 200 if (eval_mode != CEED_EVAL_WEIGHT) { 201 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 202 } 203 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 204 return CEED_ERROR_SUCCESS; 205 } 206 207 //------------------------------------------------------------------------------ 208 // Destroy basis 209 //------------------------------------------------------------------------------ 210 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 211 int ierr; 212 Ceed ceed; 213 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 214 215 CeedBasis_Cuda_shared *data; 216 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 217 218 CeedChk_Cu(ceed, cuModuleUnload(data->module)); 219 220 ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr); 221 ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr); 222 ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr); 223 ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr); 224 225 ierr = CeedFree(&data); CeedChkBackend(ierr); 226 227 return CEED_ERROR_SUCCESS; 228 } 229 230 //------------------------------------------------------------------------------ 231 // Create tensor basis 232 //------------------------------------------------------------------------------ 233 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 234 const CeedScalar *interp_1d, 235 const CeedScalar *grad_1d, 236 const CeedScalar *q_ref_1d, 237 const CeedScalar *q_weight_1d, 238 CeedBasis basis) { 239 int ierr; 240 Ceed ceed; 241 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 242 CeedBasis_Cuda_shared *data; 243 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 244 245 // Copy basis data to GPU 246 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 247 ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes); 248 CeedChk_Cu(ceed, ierr); 249 ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, 250 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 251 252 const CeedInt interp_bytes = q_bytes * P_1d; 253 ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes); 254 CeedChk_Cu(ceed, ierr); 255 ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, 256 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 257 258 ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes); 259 CeedChk_Cu(ceed, ierr); 260 ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, 261 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 262 263 // Compute collocated gradient and copy to GPU 264 data->d_collo_grad_1d = NULL; 265 if (dim == 3 && Q_1d >= P_1d) { 266 CeedScalar *collo_grad_1d; 267 ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr); 268 ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr); 269 ierr = cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d); 270 CeedChk_Cu(ceed, ierr); 271 ierr = cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, 272 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 273 ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr); 274 } 275 276 // Compile basis kernels 277 CeedInt num_comp; 278 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 279 char *basis_kernel_path, *basis_kernel_source; 280 ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-shared-basis.h", 281 &basis_kernel_path); CeedChkBackend(ierr); 282 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 283 ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 284 CeedChkBackend(ierr); 285 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n"); 286 ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8, 287 "BASIS_Q_1D", Q_1d, 288 "BASIS_P_1D", P_1d, 289 "BASIS_T_1D", CeedIntMax(Q_1d, P_1d), 290 "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 291 Q_1d : P_1d, dim), 292 "BASIS_DIM", dim, 293 "BASIS_NUM_COMP", num_comp, 294 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 295 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim) 296 ); CeedChkBackend(ierr); 297 ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); 298 CeedChkBackend(ierr); 299 ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); 300 CeedChkBackend(ierr); 301 ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); 302 CeedChkBackend(ierr); 303 ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 304 ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 305 306 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 307 308 // Register backend functions 309 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 310 CeedBasisApplyTensor_Cuda_shared); 311 CeedChkBackend(ierr); 312 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 313 CeedBasisDestroy_Cuda_shared); CeedChkBackend(ierr); 314 return CEED_ERROR_SUCCESS; 315 } 316 //------------------------------------------------------------------------------ 317