1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed/ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stddef.h> 14 #include "ceed-cuda-shared.h" 15 #include "../cuda/ceed-cuda-compile.h" 16 17 18 //------------------------------------------------------------------------------ 19 // Device initalization 20 //------------------------------------------------------------------------------ 21 int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, 22 CeedScalar **c_B); 23 int CeedCudaInitInterpGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, 24 CeedInt Q_1d, CeedScalar **c_B_ptr, 25 CeedScalar **c_G_ptr); 26 27 //------------------------------------------------------------------------------ 28 // Apply basis 29 //------------------------------------------------------------------------------ 30 int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, 31 CeedTransposeMode t_mode, 32 CeedEvalMode eval_mode, CeedVector u, 33 CeedVector v) { 34 int ierr; 35 Ceed ceed; 36 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 37 Ceed_Cuda *ceed_Cuda; 38 CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 39 CeedBasis_Cuda_shared *data; 40 CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 41 const CeedInt transpose = t_mode == CEED_TRANSPOSE; 42 CeedInt dim, num_comp; 43 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 44 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 45 46 // Read vectors 47 const CeedScalar *d_u; 48 CeedScalar *d_v; 49 if (eval_mode != CEED_EVAL_WEIGHT) { 50 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 51 } 52 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 53 54 // Clear v for transpose mode 55 if (t_mode == CEED_TRANSPOSE) { 56 CeedSize length; 57 ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 58 ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChkBackend(ierr); 59 } 60 61 // Apply basis operation 62 switch (eval_mode) { 63 case CEED_EVAL_INTERP: { 64 CeedInt P_1d, Q_1d; 65 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 66 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 67 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 68 ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B); 69 CeedChkBackend(ierr); 70 void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B, 71 &d_u, &d_v 72 }; 73 if (dim == 1) { 74 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], 75 CeedIntMax(512 / thread_1d, 76 1)); // avoid >512 total threads 77 CeedInt grid = num_elem/elems_per_block + 78 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 79 CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 80 ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1, 81 elems_per_block, shared_mem, 82 interp_args); CeedChkBackend(ierr); 83 } else if (dim == 2) { 84 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 85 // elems_per_block must be at least 1 86 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / 87 num_comp : 1, 1); 88 CeedInt grid = num_elem / elems_per_block + 89 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 90 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 91 CeedScalar); 92 ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 93 thread_1d, 94 num_comp*elems_per_block, shared_mem, 95 interp_args); CeedChkBackend(ierr); 96 } else if (dim == 3) { 97 CeedInt elems_per_block = 1; 98 CeedInt grid = num_elem / elems_per_block + 99 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 100 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 101 CeedScalar); 102 ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 103 thread_1d, 104 num_comp*elems_per_block, shared_mem, 105 interp_args); CeedChkBackend(ierr); 106 } 107 } break; 108 case CEED_EVAL_GRAD: { 109 CeedInt P_1d, Q_1d; 110 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 111 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 112 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 113 ierr = CeedCudaInitInterpGrad(data->d_interp_1d, data->d_grad_1d, P_1d, 114 Q_1d, &data->c_B, &data->c_G); 115 CeedChkBackend(ierr); 116 void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B, 117 &data->c_G, &d_u, &d_v 118 }; 119 if (dim == 1) { 120 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], 121 CeedIntMax(512 / thread_1d, 122 1)); // avoid >512 total threads 123 CeedInt grid = num_elem / elems_per_block + 124 ((num_elem / elems_per_block*elems_per_block<num_elem) ? 1 : 0 ); 125 CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 126 ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1, 127 elems_per_block, shared_mem, grad_args); 128 CeedChkBackend(ierr); 129 } else if (dim == 2) { 130 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 131 // elems_per_block must be at least 1 132 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / 133 num_comp : 1, 1); 134 CeedInt grid = num_elem / elems_per_block + 135 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 136 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 137 CeedScalar); 138 ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, 139 num_comp*elems_per_block, shared_mem, 140 grad_args); CeedChkBackend(ierr); 141 } else if (dim == 3) { 142 CeedInt elems_per_block = 1; 143 CeedInt grid = num_elem / elems_per_block + 144 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 145 CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 146 CeedScalar); 147 ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, 148 num_comp*elems_per_block, shared_mem, 149 grad_args); CeedChkBackend(ierr); 150 } 151 } break; 152 case CEED_EVAL_WEIGHT: { 153 CeedInt Q_1d; 154 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 155 void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 156 if (dim == 1) { 157 const CeedInt elems_per_block = 32 / Q_1d; 158 const CeedInt gridsize = num_elem / elems_per_block + 159 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 160 ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, 161 elems_per_block, 1, weight_args); 162 CeedChkBackend(ierr); 163 } else if (dim == 2) { 164 const CeedInt opt_elems = 32 / (Q_1d * Q_1d); 165 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 166 const CeedInt gridsize = num_elem / elems_per_block + 167 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 168 ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, 169 elems_per_block, weight_args); 170 CeedChkBackend(ierr); 171 } else if (dim == 3) { 172 const CeedInt gridsize = num_elem; 173 ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, Q_1d, 174 weight_args); 175 CeedChkBackend(ierr); 176 } 177 } break; 178 // LCOV_EXCL_START 179 // Evaluate the divergence to/from the quadrature points 180 case CEED_EVAL_DIV: 181 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 182 // Evaluate the curl to/from the quadrature points 183 case CEED_EVAL_CURL: 184 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 185 // Take no action, BasisApply should not have been called 186 case CEED_EVAL_NONE: 187 return CeedError(ceed, CEED_ERROR_BACKEND, 188 "CEED_EVAL_NONE does not make sense in this context"); 189 // LCOV_EXCL_STOP 190 } 191 192 // Restore vectors 193 if (eval_mode != CEED_EVAL_WEIGHT) { 194 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 195 } 196 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 197 return CEED_ERROR_SUCCESS; 198 } 199 200 //------------------------------------------------------------------------------ 201 // Destroy basis 202 //------------------------------------------------------------------------------ 203 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 204 int ierr; 205 Ceed ceed; 206 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 207 208 CeedBasis_Cuda_shared *data; 209 ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 210 211 CeedChk_Cu(ceed, cuModuleUnload(data->module)); 212 213 ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr); 214 ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr); 215 ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr); 216 ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr); 217 218 ierr = CeedFree(&data); CeedChkBackend(ierr); 219 220 return CEED_ERROR_SUCCESS; 221 } 222 223 //------------------------------------------------------------------------------ 224 // Create tensor basis 225 //------------------------------------------------------------------------------ 226 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 227 const CeedScalar *interp_1d, 228 const CeedScalar *grad_1d, 229 const CeedScalar *q_ref_1d, 230 const CeedScalar *q_weight_1d, 231 CeedBasis basis) { 232 int ierr; 233 Ceed ceed; 234 ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 235 CeedBasis_Cuda_shared *data; 236 ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 237 238 // Copy basis data to GPU 239 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 240 ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes); 241 CeedChk_Cu(ceed, ierr); 242 ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, 243 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 244 245 const CeedInt interp_bytes = q_bytes * P_1d; 246 ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes); 247 CeedChk_Cu(ceed, ierr); 248 ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, 249 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 250 251 ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes); 252 CeedChk_Cu(ceed, ierr); 253 ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, 254 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 255 256 // Compute collocated gradient and copy to GPU 257 data->d_collo_grad_1d = NULL; 258 if (dim == 3 && Q_1d >= P_1d) { 259 CeedScalar *collo_grad_1d; 260 ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr); 261 ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr); 262 ierr = cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d); 263 CeedChk_Cu(ceed, ierr); 264 ierr = cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, 265 cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 266 ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr); 267 } 268 269 // Compile basis kernels 270 CeedInt num_comp; 271 ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 272 char *basis_kernel_path, *basis_kernel_source; 273 ierr = CeedGetJitAbsolutePath(ceed, 274 "ceed/jit-source/cuda/cuda-shared-basis.h", 275 &basis_kernel_path); CeedChkBackend(ierr); 276 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 277 ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 278 CeedChkBackend(ierr); 279 CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n"); 280 ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8, 281 "BASIS_Q_1D", Q_1d, 282 "BASIS_P_1D", P_1d, 283 "BASIS_T_1D", CeedIntMax(Q_1d, P_1d), 284 "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 285 Q_1d : P_1d, dim), 286 "BASIS_DIM", dim, 287 "BASIS_NUM_COMP", num_comp, 288 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 289 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim) 290 ); CeedChkBackend(ierr); 291 ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); 292 CeedChkBackend(ierr); 293 ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); 294 CeedChkBackend(ierr); 295 ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); 296 CeedChkBackend(ierr); 297 ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 298 ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 299 300 ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 301 302 // Register backend functions 303 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 304 CeedBasisApplyTensor_Cuda_shared); 305 CeedChkBackend(ierr); 306 ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 307 CeedBasisDestroy_Cuda_shared); CeedChkBackend(ierr); 308 return CEED_ERROR_SUCCESS; 309 } 310 //------------------------------------------------------------------------------ 311