1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <stddef.h> 15 #include <string.h> 16 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #include "ceed-cuda-shared.h" 20 21 //------------------------------------------------------------------------------ 22 // Device initalization 23 //------------------------------------------------------------------------------ 24 int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B); 25 int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr); 26 int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr); 27 28 //------------------------------------------------------------------------------ 29 // Apply tensor basis 30 //------------------------------------------------------------------------------ 31 static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 32 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 33 Ceed ceed; 34 Ceed_Cuda *ceed_Cuda; 35 CeedInt dim, num_comp; 36 const CeedScalar *d_u; 37 CeedScalar *d_v; 38 CeedBasis_Cuda_shared *data; 39 40 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 41 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 42 CeedCallBackend(CeedBasisGetData(basis, &data)); 43 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 44 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 45 46 // Get read/write access to u, v 47 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 48 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 49 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 50 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 51 52 // Apply basis operation 53 switch (eval_mode) { 54 case CEED_EVAL_INTERP: { 55 CeedInt P_1d, Q_1d; 56 57 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 58 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 59 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 60 61 CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B)); 62 void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v}; 63 64 if (dim == 1) { 65 // avoid >512 total threads 66 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 67 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 68 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 69 70 if (t_mode == CEED_TRANSPOSE) { 71 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1, 72 elems_per_block, shared_mem, interp_args)); 73 } else { 74 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); 75 } 76 } else if (dim == 2) { 77 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 78 // elems_per_block must be at least 1 79 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 80 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 81 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 82 83 if (t_mode == CEED_TRANSPOSE) { 84 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, 85 elems_per_block, shared_mem, interp_args)); 86 } else { 87 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 88 } 89 } else if (dim == 3) { 90 CeedInt elems_per_block = 1; 91 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 92 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 93 94 if (t_mode == CEED_TRANSPOSE) { 95 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, 96 elems_per_block, shared_mem, interp_args)); 97 } else { 98 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 99 } 100 } 101 } break; 102 case CEED_EVAL_GRAD: { 103 CeedInt P_1d, Q_1d; 104 105 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 106 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 107 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 108 109 if (data->d_collo_grad_1d) { 110 CeedCallBackend(CeedInit_CudaCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G)); 111 } else { 112 CeedCallBackend(CeedInit_CudaGrad(data->d_interp_1d, data->d_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G)); 113 } 114 void *grad_args[] = {(void *)&num_elem, &data->c_B, &data->c_G, &d_u, &d_v}; 115 if (dim == 1) { 116 // avoid >512 total threads 117 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 118 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 119 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 120 121 if (t_mode == CEED_TRANSPOSE) { 122 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1, 123 elems_per_block, shared_mem, grad_args)); 124 } else { 125 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); 126 } 127 } else if (dim == 2) { 128 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 129 // elems_per_block must be at least 1 130 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 131 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 132 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 133 134 if (t_mode == CEED_TRANSPOSE) { 135 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, 136 elems_per_block, shared_mem, grad_args)); 137 } else { 138 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 139 } 140 } else if (dim == 3) { 141 CeedInt elems_per_block = 1; 142 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 143 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 144 145 if (t_mode == CEED_TRANSPOSE) { 146 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, 147 elems_per_block, shared_mem, grad_args)); 148 } else { 149 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 150 } 151 } 152 } break; 153 case CEED_EVAL_WEIGHT: { 154 CeedInt Q_1d; 155 CeedInt block_size = 32; 156 157 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 158 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 159 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 160 if (dim == 1) { 161 const CeedInt elems_per_block = block_size / Q_1d; 162 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 163 164 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); 165 } else if (dim == 2) { 166 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 167 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 168 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 169 170 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 171 } else if (dim == 3) { 172 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 173 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 174 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 175 176 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 177 } 178 } break; 179 case CEED_EVAL_NONE: /* handled separately below */ 180 break; 181 // LCOV_EXCL_START 182 case CEED_EVAL_DIV: 183 case CEED_EVAL_CURL: 184 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 185 // LCOV_EXCL_STOP 186 } 187 188 // Restore vectors, cover CEED_EVAL_NONE 189 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 190 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 191 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 192 CeedCallBackend(CeedDestroy(&ceed)); 193 return CEED_ERROR_SUCCESS; 194 } 195 196 static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 197 CeedVector v) { 198 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 199 return CEED_ERROR_SUCCESS; 200 } 201 202 static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 203 CeedVector u, CeedVector v) { 204 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 205 return CEED_ERROR_SUCCESS; 206 } 207 208 //------------------------------------------------------------------------------ 209 // Basis apply - tensor AtPoints 210 //------------------------------------------------------------------------------ 211 static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 212 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 213 Ceed ceed; 214 Ceed_Cuda *ceed_Cuda; 215 CeedInt Q_1d, dim, num_comp, max_num_points = num_points[0]; 216 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 217 const CeedScalar *d_x, *d_u; 218 CeedScalar *d_v; 219 CeedBasis_Cuda_shared *data; 220 221 CeedCallBackend(CeedBasisGetData(basis, &data)); 222 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 223 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 224 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 225 226 // Weight handled separately 227 if (eval_mode == CEED_EVAL_WEIGHT) { 228 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 229 return CEED_ERROR_SUCCESS; 230 } 231 232 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 233 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 234 235 // Check padded to uniform number of points per elem 236 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 237 { 238 CeedInt q_comp; 239 CeedSize len, len_required; 240 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 241 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 242 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 243 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 244 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 245 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 246 len, len_required); 247 } 248 249 // Move num_points array to device 250 if (is_transpose) { 251 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 252 253 if (num_elem != data->num_elem_at_points) { 254 data->num_elem_at_points = num_elem; 255 256 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 257 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 258 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 259 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 260 } 261 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 262 memcpy(data->h_points_per_elem, num_points, num_bytes); 263 CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 264 } 265 } 266 267 // Build kernels if needed 268 if (data->num_points != max_num_points) { 269 CeedInt P_1d; 270 271 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 272 data->num_points = max_num_points; 273 274 // -- Create interp matrix to Chebyshev coefficients 275 if (!data->d_chebyshev_interp_1d) { 276 CeedSize interp_bytes; 277 CeedScalar *chebyshev_interp_1d; 278 279 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 280 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 281 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 282 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 283 CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 284 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 285 } 286 287 // -- Compile kernels 288 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h>\n"; 289 CeedInt num_comp; 290 291 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 292 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 293 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", 294 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 295 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points)); 296 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 297 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 298 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 299 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 300 } 301 302 // Get read/write access to u, v 303 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 304 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 305 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 306 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 307 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 308 309 // Clear v for transpose operation 310 if (is_transpose && !apply_add) { 311 CeedInt num_comp, q_comp, num_nodes; 312 CeedSize length; 313 314 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 315 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 316 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 317 length = 318 (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp)); 319 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 320 } 321 322 // Basis action 323 switch (eval_mode) { 324 case CEED_EVAL_INTERP: { 325 CeedInt P_1d, Q_1d; 326 327 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 328 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 329 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 330 331 CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B)); 332 void *interp_args[] = {(void *)&num_elem, &data->c_B, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 333 334 if (dim == 1) { 335 // avoid >512 total threads 336 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 337 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 338 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 339 340 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1, 341 elems_per_block, shared_mem, interp_args)); 342 } else if (dim == 2) { 343 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 344 // elems_per_block must be at least 1 345 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 346 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 347 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 348 349 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 350 thread_1d, elems_per_block, shared_mem, interp_args)); 351 } else if (dim == 3) { 352 CeedInt elems_per_block = 1; 353 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 354 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 355 356 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 357 thread_1d, elems_per_block, shared_mem, interp_args)); 358 } 359 } break; 360 case CEED_EVAL_GRAD: { 361 CeedInt P_1d, Q_1d; 362 363 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 364 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 365 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 366 367 CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B)); 368 void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 369 370 if (dim == 1) { 371 // avoid >512 total threads 372 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 373 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 374 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 375 376 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1, 377 elems_per_block, shared_mem, grad_args)); 378 } else if (dim == 2) { 379 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 380 // elems_per_block must be at least 1 381 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 382 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 383 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 384 385 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, 386 elems_per_block, shared_mem, grad_args)); 387 } else if (dim == 3) { 388 CeedInt elems_per_block = 1; 389 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 390 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 391 392 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, 393 elems_per_block, shared_mem, grad_args)); 394 } 395 } break; 396 case CEED_EVAL_WEIGHT: 397 case CEED_EVAL_NONE: /* handled separately below */ 398 break; 399 // LCOV_EXCL_START 400 case CEED_EVAL_DIV: 401 case CEED_EVAL_CURL: 402 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 403 // LCOV_EXCL_STOP 404 } 405 406 // Restore vectors, cover CEED_EVAL_NONE 407 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 408 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 409 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 410 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 411 CeedCallBackend(CeedDestroy(&ceed)); 412 return CEED_ERROR_SUCCESS; 413 } 414 415 static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 416 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 417 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 418 return CEED_ERROR_SUCCESS; 419 } 420 421 static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 422 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 423 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 424 return CEED_ERROR_SUCCESS; 425 } 426 427 //------------------------------------------------------------------------------ 428 // Apply non-tensor basis 429 //------------------------------------------------------------------------------ 430 static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 431 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 432 Ceed ceed; 433 Ceed_Cuda *ceed_Cuda; 434 CeedInt dim; 435 const CeedScalar *d_u; 436 CeedScalar *d_v; 437 CeedBasis_Cuda_shared *data; 438 439 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 440 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 441 CeedCallBackend(CeedBasisGetData(basis, &data)); 442 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 443 444 // Get read/write access to u, v 445 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 446 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 447 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 448 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 449 450 // Apply basis operation 451 switch (eval_mode) { 452 case CEED_EVAL_INTERP: { 453 CeedInt P, Q; 454 455 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 456 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 457 CeedInt thread = CeedIntMax(Q, P); 458 459 CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P, Q, &data->c_B)); 460 void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v}; 461 462 { 463 // avoid >512 total threads 464 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 465 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 466 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 467 468 if (t_mode == CEED_TRANSPOSE) { 469 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread, 1, 470 elems_per_block, shared_mem, interp_args)); 471 } else { 472 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread, 1, elems_per_block, shared_mem, interp_args)); 473 } 474 } 475 } break; 476 case CEED_EVAL_GRAD: { 477 CeedInt P, Q; 478 479 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 480 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 481 CeedInt thread = CeedIntMax(Q, P); 482 483 CeedCallBackend(CeedInit_CudaInterp(data->d_grad_1d, P, Q * dim, &data->c_G)); 484 void *grad_args[] = {(void *)&num_elem, &data->c_G, &d_u, &d_v}; 485 486 { 487 // avoid >512 total threads 488 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 489 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 490 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 491 492 if (t_mode == CEED_TRANSPOSE) { 493 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread, 1, 494 elems_per_block, shared_mem, grad_args)); 495 } else { 496 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread, 1, elems_per_block, shared_mem, grad_args)); 497 } 498 } 499 } break; 500 case CEED_EVAL_WEIGHT: { 501 CeedInt Q; 502 503 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 504 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 505 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 506 507 { 508 // avoid >512 total threads 509 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / Q, 1)); 510 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 511 512 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, Q, elems_per_block, 1, weight_args)); 513 } 514 } break; 515 case CEED_EVAL_NONE: /* handled separately below */ 516 break; 517 // LCOV_EXCL_START 518 case CEED_EVAL_DIV: 519 case CEED_EVAL_CURL: 520 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 521 // LCOV_EXCL_STOP 522 } 523 524 // Restore vectors, cover CEED_EVAL_NONE 525 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 526 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 527 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 528 CeedCallBackend(CeedDestroy(&ceed)); 529 return CEED_ERROR_SUCCESS; 530 } 531 532 static int CeedBasisApplyNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 533 CeedVector u, CeedVector v) { 534 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 535 return CEED_ERROR_SUCCESS; 536 } 537 538 static int CeedBasisApplyAddNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 539 CeedVector u, CeedVector v) { 540 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 541 return CEED_ERROR_SUCCESS; 542 } 543 544 //------------------------------------------------------------------------------ 545 // Destroy basis 546 //------------------------------------------------------------------------------ 547 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 548 Ceed ceed; 549 CeedBasis_Cuda_shared *data; 550 551 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 552 CeedCallBackend(CeedBasisGetData(basis, &data)); 553 CeedCallCuda(ceed, cuModuleUnload(data->module)); 554 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 555 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 556 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 557 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 558 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 559 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 560 CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); 561 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 562 CeedCallBackend(CeedFree(&data)); 563 CeedCallBackend(CeedDestroy(&ceed)); 564 return CEED_ERROR_SUCCESS; 565 } 566 567 //------------------------------------------------------------------------------ 568 // Create tensor basis 569 //------------------------------------------------------------------------------ 570 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 571 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 572 Ceed ceed; 573 CeedInt num_comp; 574 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 575 const CeedInt interp_bytes = q_bytes * P_1d; 576 CeedBasis_Cuda_shared *data; 577 578 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 579 CeedCallBackend(CeedCalloc(1, &data)); 580 581 // Copy basis data to GPU 582 if (q_weight_1d) { 583 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 584 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 585 } 586 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 587 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 588 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 589 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 590 591 // Compute collocated gradient and copy to GPU 592 data->d_collo_grad_1d = NULL; 593 bool has_collocated_grad = dim == 3 && Q_1d >= P_1d; 594 595 if (has_collocated_grad) { 596 CeedScalar *collo_grad_1d; 597 598 CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); 599 CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); 600 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); 601 CeedCallCuda(ceed, cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, cudaMemcpyHostToDevice)); 602 CeedCallBackend(CeedFree(&collo_grad_1d)); 603 } 604 605 // Compile basis kernels 606 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor.h>\n"; 607 608 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 609 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", 610 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 611 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); 612 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 613 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 614 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 615 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 616 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 617 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 618 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 619 620 CeedCallBackend(CeedBasisSetData(basis, data)); 621 622 // Register backend functions 623 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); 624 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared)); 625 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared)); 626 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda_shared)); 627 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 628 CeedCallBackend(CeedDestroy(&ceed)); 629 return CEED_ERROR_SUCCESS; 630 } 631 632 //------------------------------------------------------------------------------ 633 // Create non-tensor basis 634 //------------------------------------------------------------------------------ 635 int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 636 const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 637 Ceed ceed; 638 CeedInt num_comp, q_comp_interp, q_comp_grad; 639 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 640 CeedBasis_Cuda_shared *data; 641 642 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 643 CeedCallBackend(CeedCalloc(1, &data)); 644 645 // Copy basis data to GPU 646 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 647 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 648 if (q_weight) { 649 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 650 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight, q_bytes, cudaMemcpyHostToDevice)); 651 } 652 if (interp) { 653 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 654 655 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 656 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp, interp_bytes, cudaMemcpyHostToDevice)); 657 } 658 if (grad) { 659 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 660 661 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, grad_bytes)); 662 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad, grad_bytes, cudaMemcpyHostToDevice)); 663 } 664 665 // Compile basis kernels 666 const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor.h>\n"; 667 668 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 669 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D", 670 CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp)); 671 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 672 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 673 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 674 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 675 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 676 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 677 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 678 679 CeedCallBackend(CeedBasisSetData(basis, data)); 680 681 // Register backend functions 682 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda_shared)); 683 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda_shared)); 684 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 685 CeedCallBackend(CeedDestroy(&ceed)); 686 return CEED_ERROR_SUCCESS; 687 } 688 689 //------------------------------------------------------------------------------ 690