// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed/ceed.h>
#include <ceed/backend.h>
#include <ceed/jit-tools.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stddef.h>
#include "ceed-cuda-shared.h"
#include "../cuda/ceed-cuda-compile.h"


//------------------------------------------------------------------------------
// Device initialization
//------------------------------------------------------------------------------
// Helpers (defined elsewhere in this backend) that make the 1D basis matrices
// available to the kernels via the cached c_B/c_G pointers.
int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d,
                       CeedScalar **c_B);
int CeedCudaInitGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d,
                     CeedInt Q_1d, CeedScalar **c_B_ptr,
                     CeedScalar **c_G_ptr);
int CeedCudaInitCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d,
                               CeedInt Q_1d, CeedScalar **c_B_ptr,
                               CeedScalar **c_G_ptr);

//------------------------------------------------------------------------------
// Apply basis
//------------------------------------------------------------------------------
// Apply a tensor-product basis operation (interp/grad/weight) to num_elem
// elements, reading from u and writing to v (both on device memory).
// Launch shape: threads are (thread_1d [, thread_1d], elems_per_block) with one
// or more elements per block; shared memory holds one slice of size
// elems_per_block * thread_1d [* thread_1d] scalars.
int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem,
                                     CeedTransposeMode t_mode,
                                     CeedEvalMode eval_mode, CeedVector u,
                                     CeedVector v) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  Ceed_Cuda *ceed_Cuda;
  // Fixed: return values were previously discarded, so CeedChkBackend checked
  // a stale ierr and errors from these two calls were silently ignored
  ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
  CeedBasis_Cuda_shared *data;
  ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
  CeedInt dim, num_comp;
  ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
  ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);

  // Read vectors (u is not used for quadrature weights)
  const CeedScalar *d_u = NULL;
  CeedScalar *d_v = NULL;
  if (eval_mode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
  }
  ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);

  // Apply basis operation
  switch (eval_mode) {
  case CEED_EVAL_INTERP: {
    CeedInt P_1d, Q_1d;
    ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
    // Threads per element along each tensor direction must cover both P and Q
    CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
    ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B);
    CeedChkBackend(ierr);
    void *interp_args[] = {(void *) &num_elem, &data->c_B,
                           &d_u, &d_v
                          };
    if (dim == 1) {
      CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
                                           CeedIntMax(512 / thread_1d,
                                               1)); // avoid >512 total threads
      // Ceiling division: one extra block for the remainder elements
      CeedInt grid = num_elem/elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
      if (t_mode == CEED_TRANSPOSE) {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d,
                                          1,
                                          elems_per_block, shared_mem,
                                          interp_args); CeedChkBackend(ierr);
      } else {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1,
                                          elems_per_block, shared_mem,
                                          interp_args); CeedChkBackend(ierr);
      }
    } else if (dim == 2) {
      // Check if required threads is small enough to do multiple elems
      const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
      // elems_per_block must be at least 1
      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] /
                                           num_comp : 1, 1);
      CeedInt grid = num_elem / elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
                             CeedScalar);
      if (t_mode == CEED_TRANSPOSE) {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d,
                                          thread_1d,
                                          elems_per_block, shared_mem,
                                          interp_args); CeedChkBackend(ierr);
      } else {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
                                          thread_1d,
                                          elems_per_block, shared_mem,
                                          interp_args); CeedChkBackend(ierr);
      }
    } else if (dim == 3) {
      CeedInt elems_per_block = 1;
      CeedInt grid = num_elem / elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
                             CeedScalar);
      if (t_mode == CEED_TRANSPOSE) {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d,
                                          thread_1d,
                                          elems_per_block, shared_mem,
                                          interp_args); CeedChkBackend(ierr);
      } else {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
                                          thread_1d,
                                          elems_per_block, shared_mem,
                                          interp_args); CeedChkBackend(ierr);
      }
    }
  } break;
  case CEED_EVAL_GRAD: {
    CeedInt P_1d, Q_1d;
    ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
    CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
    // Use collocated gradient when available (set up at basis creation)
    if (data->d_collo_grad_1d) {
      ierr = CeedCudaInitCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d,
                                        P_1d,
                                        Q_1d, &data->c_B, &data->c_G);
      CeedChkBackend(ierr);
    } else {
      ierr = CeedCudaInitGrad(data->d_interp_1d, data->d_grad_1d, P_1d,
                              Q_1d, &data->c_B, &data->c_G);
      CeedChkBackend(ierr);
    }
    void *grad_args[] = {(void *) &num_elem, &data->c_B,
                         &data->c_G, &d_u, &d_v
                        };
    if (dim == 1) {
      CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
                                           CeedIntMax(512 / thread_1d,
                                               1)); // avoid >512 total threads
      CeedInt grid = num_elem / elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block<num_elem) ? 1 : 0 );
      CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
      if (t_mode == CEED_TRANSPOSE) {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->GradTranspose, grid, thread_1d, 1,
                                          elems_per_block, shared_mem, grad_args);
        CeedChkBackend(ierr);
      } else {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1,
                                          elems_per_block, shared_mem, grad_args);
        CeedChkBackend(ierr);
      }
    } else if (dim == 2) {
      const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
      // elems_per_block must be at least 1
      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] /
                                           num_comp : 1, 1);
      CeedInt grid = num_elem / elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
                             CeedScalar);
      if (t_mode == CEED_TRANSPOSE) {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->GradTranspose, grid, thread_1d,
                                          thread_1d,
                                          elems_per_block, shared_mem,
                                          grad_args); CeedChkBackend(ierr);
      } else {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d,
                                          elems_per_block, shared_mem,
                                          grad_args); CeedChkBackend(ierr);
      }
    } else if (dim == 3) {
      CeedInt elems_per_block = 1;
      CeedInt grid = num_elem / elems_per_block +
                     ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
                             CeedScalar);
      if (t_mode == CEED_TRANSPOSE) {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->GradTranspose, grid, thread_1d,
                                          thread_1d,
                                          elems_per_block, shared_mem,
                                          grad_args); CeedChkBackend(ierr);
      } else {
        ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d,
                                          elems_per_block, shared_mem,
                                          grad_args); CeedChkBackend(ierr);
      }
    }
  } break;
  case CEED_EVAL_WEIGHT: {
    CeedInt Q_1d;
    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
    void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v};
    if (dim == 1) {
      // Fixed: guard against Q_1d > 32, where 32 / Q_1d == 0 would cause a
      // division by zero in the grid computation (2D/3D branches already guard)
      const CeedInt elems_per_block = CeedIntMax(32 / Q_1d, 1);
      const CeedInt gridsize = num_elem / elems_per_block +
                               ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d,
                                  elems_per_block, 1, weight_args);
      CeedChkBackend(ierr);
    } else if (dim == 2) {
      const CeedInt opt_elems = 32 / (Q_1d * Q_1d);
      const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
      const CeedInt gridsize = num_elem / elems_per_block +
                               ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d,
                                  elems_per_block, weight_args);
      CeedChkBackend(ierr);
    } else if (dim == 3) {
      const CeedInt opt_elems = 32 / (Q_1d * Q_1d);
      const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
      const CeedInt gridsize = num_elem / elems_per_block +
                               ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
      ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d,
                                  elems_per_block, weight_args);
      CeedChkBackend(ierr);
    }
  } break;
  // LCOV_EXCL_START
  // Evaluate the divergence to/from the quadrature points
  case CEED_EVAL_DIV:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
  // Evaluate the curl to/from the quadrature points
  case CEED_EVAL_CURL:
    return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
  // Take no action, BasisApply should not have been called
  case CEED_EVAL_NONE:
    return CeedError(ceed, CEED_ERROR_BACKEND,
                     "CEED_EVAL_NONE does not make sense in this context");
    // LCOV_EXCL_STOP
  }

  // Restore vectors
  if (eval_mode != CEED_EVAL_WEIGHT) {
    ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
  }
  ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Destroy basis
//------------------------------------------------------------------------------
// Release the JIT-compiled module and all device arrays owned by the basis data
static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);

  CeedBasis_Cuda_shared *data;
  ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);

  CeedChk_Cu(ceed, cuModuleUnload(data->module));

  // cudaFree(NULL) is a no-op, so d_collo_grad_1d may legitimately be NULL
  ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr);
  ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr);

  ierr = CeedFree(&data); CeedChkBackend(ierr);

  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create tensor basis
//------------------------------------------------------------------------------
// Build a tensor-product H1 basis for the shared-memory CUDA backend: copies
// the 1D quadrature weights, interpolation, and gradient matrices to the
// device, optionally builds a collocated gradient (3D, Q_1d >= P_1d), JIT
// compiles the basis kernels, and registers Apply/Destroy with the Ceed object.
int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
                                        const CeedScalar *interp_1d,
                                        const CeedScalar *grad_1d,
                                        const CeedScalar *q_ref_1d,
                                        const CeedScalar *q_weight_1d,
                                        CeedBasis basis) {
  int ierr;
  Ceed ceed;
  ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
  CeedBasis_Cuda_shared *impl;
  ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);

  // Move the 1D quadrature weights to the device
  const CeedInt qpt_bytes = Q_1d * sizeof(CeedScalar);
  ierr = cudaMalloc((void **)&impl->d_q_weight_1d, qpt_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(impl->d_q_weight_1d, q_weight_1d, qpt_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // Interp and grad matrices are both Q_1d x P_1d
  const CeedInt mat_bytes = qpt_bytes * P_1d;
  ierr = cudaMalloc((void **)&impl->d_interp_1d, mat_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(impl->d_interp_1d, interp_1d, mat_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
  ierr = cudaMalloc((void **)&impl->d_grad_1d, mat_bytes);
  CeedChk_Cu(ceed, ierr);
  ierr = cudaMemcpy(impl->d_grad_1d, grad_1d, mat_bytes,
                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);

  // Collocated gradient (Q_1d x Q_1d), only used for 3D with Q_1d >= P_1d
  impl->d_collo_grad_1d = NULL;
  bool has_collocated_grad = dim == 3 && Q_1d >= P_1d;
  if (has_collocated_grad) {
    CeedScalar *h_collo_grad_1d;
    ierr = CeedMalloc(Q_1d*Q_1d, &h_collo_grad_1d); CeedChkBackend(ierr);
    ierr = CeedBasisGetCollocatedGrad(basis, h_collo_grad_1d);
    CeedChkBackend(ierr);
    ierr = cudaMalloc((void **)&impl->d_collo_grad_1d, qpt_bytes * Q_1d);
    CeedChk_Cu(ceed, ierr);
    ierr = cudaMemcpy(impl->d_collo_grad_1d, h_collo_grad_1d, qpt_bytes * Q_1d,
                      cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
    ierr = CeedFree(&h_collo_grad_1d); CeedChkBackend(ierr);
  }

  // JIT compile the basis kernels with sizes baked in as macros
  CeedInt num_comp;
  ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
  char *kernel_path, *kernel_source;
  ierr = CeedGetJitAbsolutePath(ceed,
                                "ceed/jit-source/cuda/cuda-shared-basis-tensor.h",
                                &kernel_path); CeedChkBackend(ierr);
  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
  ierr = CeedLoadSourceToBuffer(ceed, kernel_path, &kernel_source);
  CeedChkBackend(ierr);
  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n");
  ierr = CeedCompileCuda(ceed, kernel_source, &impl->module, 8,
                         "BASIS_Q_1D", Q_1d,
                         "BASIS_P_1D", P_1d,
                         "T_1D", CeedIntMax(Q_1d, P_1d),
                         "BASIS_DIM", dim,
                         "BASIS_NUM_COMP", num_comp,
                         "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                         "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim),
                         "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad
                        ); CeedChkBackend(ierr);

  // Fetch kernel handles from the compiled module
  ierr = CeedGetKernelCuda(ceed, impl->module, "Interp", &impl->Interp);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, impl->module, "InterpTranspose",
                           &impl->InterpTranspose);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, impl->module, "Grad", &impl->Grad);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, impl->module, "GradTranspose",
                           &impl->GradTranspose);
  CeedChkBackend(ierr);
  ierr = CeedGetKernelCuda(ceed, impl->module, "Weight", &impl->Weight);
  CeedChkBackend(ierr);
  ierr = CeedFree(&kernel_path); CeedChkBackend(ierr);
  ierr = CeedFree(&kernel_source); CeedChkBackend(ierr);

  ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr);

  // Register backend functions
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
                                CeedBasisApplyTensor_Cuda_shared);
  CeedChkBackend(ierr);
  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
                                CeedBasisDestroy_Cuda_shared); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------