1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <stddef.h> 15 #include <string.h> 16 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #include "ceed-cuda-shared.h" 20 21 //------------------------------------------------------------------------------ 22 // Apply tensor basis 23 //------------------------------------------------------------------------------ 24 static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 25 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 26 Ceed ceed; 27 Ceed_Cuda *ceed_Cuda; 28 CeedInt dim, num_comp; 29 const CeedScalar *d_u; 30 CeedScalar *d_v; 31 CeedBasis_Cuda_shared *data; 32 33 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 34 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 35 CeedCallBackend(CeedBasisGetData(basis, &data)); 36 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 37 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 38 39 // Get read/write access to u, v 40 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 41 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 42 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 43 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 44 45 // Apply basis operation 46 switch (eval_mode) { 47 case CEED_EVAL_INTERP: { 48 CeedInt P_1d, Q_1d; 49 50 CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]); 51 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 52 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 53 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 54 55 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 56 57 if (dim == 1) { 58 // avoid >512 total threads 59 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 60 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 61 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 62 63 if (t_mode == CEED_TRANSPOSE) { 64 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1, 65 elems_per_block, shared_mem, interp_args)); 66 } else { 67 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); 68 } 69 } else if (dim == 2) { 70 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 71 // elems_per_block must be at least 1 72 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 73 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 74 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 75 76 if (t_mode == CEED_TRANSPOSE) { 77 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, 78 elems_per_block, shared_mem, interp_args)); 79 } else { 80 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 81 } 82 } else if (dim == 3) { 83 CeedInt elems_per_block = 1; 84 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 85 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 86 87 if (t_mode == CEED_TRANSPOSE) { 88 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, 89 elems_per_block, shared_mem, interp_args)); 90 } else { 91 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 92 } 93 } 94 } break; 95 case CEED_EVAL_GRAD: { 96 CeedInt P_1d, Q_1d; 97 98 CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]); 99 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 100 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 101 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 102 CeedScalar *d_grad_1d = data->d_grad_1d; 103 104 if (data->d_collo_grad_1d) { 105 d_grad_1d = data->d_collo_grad_1d; 106 } 107 void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v}; 108 109 if (dim == 1) { 110 // avoid >512 total threads 111 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 112 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 113 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 114 115 if (t_mode == CEED_TRANSPOSE) { 116 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1, 117 elems_per_block, shared_mem, grad_args)); 118 } else { 119 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); 120 } 121 } else if (dim == 2) { 122 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 123 // elems_per_block must be at least 1 124 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 125 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 126 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 127 128 if (t_mode == CEED_TRANSPOSE) { 129 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, 130 elems_per_block, shared_mem, grad_args)); 131 } else { 132 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 133 } 134 } else if (dim == 3) { 135 CeedInt elems_per_block = 1; 136 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 137 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 138 139 if (t_mode == CEED_TRANSPOSE) { 140 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, 141 elems_per_block, shared_mem, grad_args)); 142 } else { 143 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 144 } 145 } 146 } break; 147 case CEED_EVAL_WEIGHT: { 148 CeedInt Q_1d; 149 CeedInt block_size = 32; 150 151 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 152 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 153 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 154 if (dim == 1) { 155 const CeedInt elems_per_block = block_size / Q_1d; 156 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 157 158 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); 159 } else if (dim == 2) { 160 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 161 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 162 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 163 164 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 165 } else if (dim == 3) { 166 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 167 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 168 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 169 170 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 171 } 172 } break; 173 case CEED_EVAL_NONE: /* handled separately below */ 174 break; 175 // LCOV_EXCL_START 176 case CEED_EVAL_DIV: 177 case CEED_EVAL_CURL: 178 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 179 // LCOV_EXCL_STOP 180 } 181 182 // Restore vectors, cover CEED_EVAL_NONE 183 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 184 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 185 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 186 CeedCallBackend(CeedDestroy(&ceed)); 187 return CEED_ERROR_SUCCESS; 188 } 189 190 static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 191 CeedVector v) { 192 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 193 return CEED_ERROR_SUCCESS; 194 } 195 196 static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 197 CeedVector u, CeedVector v) { 198 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 199 return CEED_ERROR_SUCCESS; 200 } 201 202 //------------------------------------------------------------------------------ 203 // Basis apply - tensor AtPoints 204 //------------------------------------------------------------------------------ 205 static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 206 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 207 Ceed ceed; 208 Ceed_Cuda *ceed_Cuda; 209 CeedInt Q_1d, dim, num_comp, max_num_points = num_points[0]; 210 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 211 const CeedScalar *d_x, *d_u; 212 CeedScalar *d_v; 213 CeedBasis_Cuda_shared *data; 214 215 CeedCallBackend(CeedBasisGetData(basis, &data)); 216 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 217 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 218 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 219 220 // Weight handled separately 221 if (eval_mode == CEED_EVAL_WEIGHT) { 222 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 223 return CEED_ERROR_SUCCESS; 224 } 225 226 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 227 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 228 229 // Check padded to uniform number of points per elem 230 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 231 { 232 CeedInt q_comp; 233 CeedSize len, len_required; 234 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 235 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 236 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 237 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 238 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 239 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 240 len, len_required); 241 } 242 243 // Move num_points array to device 244 if (is_transpose) { 245 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 246 247 if (num_elem != data->num_elem_at_points) { 248 data->num_elem_at_points = num_elem; 249 250 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 251 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 252 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 253 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 254 } 255 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 256 memcpy(data->h_points_per_elem, num_points, num_bytes); 257 CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 258 } 259 } 260 261 // Build kernels if needed 262 if (data->num_points != max_num_points) { 263 CeedInt P_1d; 264 265 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 266 data->num_points = max_num_points; 267 268 // -- Create interp matrix to Chebyshev coefficients 269 if (!data->d_chebyshev_interp_1d) { 270 CeedSize interp_bytes; 271 CeedScalar *chebyshev_interp_1d; 272 273 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 274 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 275 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 276 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 277 CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 278 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 279 } 280 281 // -- Compile kernels 282 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h>\n"; 283 CeedInt num_comp; 284 285 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 286 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 287 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", 288 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 289 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points)); 290 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 291 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 292 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 293 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 294 } 295 296 // Get read/write access to u, v 297 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 298 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 299 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 300 if (apply_add) { 301 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 302 } else { 303 // Clear v for transpose operation 304 if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); 305 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 306 } 307 308 // Basis action 309 switch (eval_mode) { 310 case CEED_EVAL_INTERP: { 311 CeedInt P_1d, Q_1d; 312 313 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 314 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 315 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 316 317 void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 318 319 if (dim == 1) { 320 // avoid >512 total threads 321 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 322 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 323 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 324 325 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1, 326 elems_per_block, shared_mem, interp_args)); 327 } else if (dim == 2) { 328 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 329 // elems_per_block must be at least 1 330 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 331 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 332 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 333 334 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 335 thread_1d, elems_per_block, shared_mem, interp_args)); 336 } else if (dim == 3) { 337 CeedInt elems_per_block = 1; 338 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 339 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 340 341 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 342 thread_1d, elems_per_block, shared_mem, interp_args)); 343 } 344 } break; 345 case CEED_EVAL_GRAD: { 346 CeedInt P_1d, Q_1d; 347 348 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 349 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 350 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 351 352 void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 353 354 if (dim == 1) { 355 // avoid >512 total threads 356 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 357 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 358 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 359 360 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1, 361 elems_per_block, shared_mem, grad_args)); 362 } else if (dim == 2) { 363 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 364 // elems_per_block must be at least 1 365 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 366 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 367 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 368 369 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, 370 elems_per_block, shared_mem, grad_args)); 371 } else if (dim == 3) { 372 CeedInt elems_per_block = 1; 373 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 374 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 375 376 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, 377 elems_per_block, shared_mem, grad_args)); 378 } 379 } break; 380 case CEED_EVAL_WEIGHT: 381 case CEED_EVAL_NONE: /* handled separately below */ 382 break; 383 // LCOV_EXCL_START 384 case CEED_EVAL_DIV: 385 case CEED_EVAL_CURL: 386 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 387 // LCOV_EXCL_STOP 388 } 389 390 // Restore vectors, cover CEED_EVAL_NONE 391 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 392 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 393 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 394 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 395 CeedCallBackend(CeedDestroy(&ceed)); 396 return CEED_ERROR_SUCCESS; 397 } 398 399 static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 400 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 401 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 402 return CEED_ERROR_SUCCESS; 403 } 404 405 static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 406 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 407 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 408 return CEED_ERROR_SUCCESS; 409 } 410 411 //------------------------------------------------------------------------------ 412 // Apply non-tensor basis 413 //------------------------------------------------------------------------------ 414 static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 415 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 416 Ceed ceed; 417 Ceed_Cuda *ceed_Cuda; 418 CeedInt dim; 419 const CeedScalar *d_u; 420 CeedScalar *d_v; 421 CeedBasis_Cuda_shared *data; 422 423 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 424 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 425 CeedCallBackend(CeedBasisGetData(basis, &data)); 426 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 427 428 // Get read/write access to u, v 429 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 430 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 431 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 432 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 433 434 // Apply basis operation 435 switch (eval_mode) { 436 case CEED_EVAL_INTERP: { 437 CeedInt P, Q; 438 439 CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]); 440 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 441 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 442 CeedInt thread = CeedIntMax(Q, P); 443 444 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 445 446 { 447 // avoid >512 total threads 448 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 449 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 450 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 451 452 if (t_mode == CEED_TRANSPOSE) { 453 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread, 1, 454 elems_per_block, shared_mem, interp_args)); 455 } else { 456 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread, 1, elems_per_block, shared_mem, interp_args)); 457 } 458 } 459 } break; 460 case CEED_EVAL_GRAD: { 461 CeedInt P, Q; 462 463 CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]); 464 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 465 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 466 CeedInt thread = CeedIntMax(Q, P); 467 468 void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v}; 469 470 { 471 // avoid >512 total threads 472 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 473 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 474 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 475 476 if (t_mode == CEED_TRANSPOSE) { 477 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread, 1, 478 elems_per_block, shared_mem, grad_args)); 479 } else { 480 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread, 1, elems_per_block, shared_mem, grad_args)); 481 } 482 } 483 } break; 484 case CEED_EVAL_WEIGHT: { 485 CeedInt P, Q; 486 487 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 488 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 489 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 490 CeedInt thread = CeedIntMax(Q, P); 491 492 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 493 494 { 495 // avoid >512 total threads 496 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 497 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 498 499 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight_args)); 500 } 501 } break; 502 case CEED_EVAL_NONE: /* handled separately below */ 503 break; 504 // LCOV_EXCL_START 505 case CEED_EVAL_DIV: 506 case CEED_EVAL_CURL: 507 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 508 // LCOV_EXCL_STOP 509 } 510 511 // Restore vectors, cover CEED_EVAL_NONE 512 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 513 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 514 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 515 CeedCallBackend(CeedDestroy(&ceed)); 516 return CEED_ERROR_SUCCESS; 517 } 518 519 static int CeedBasisApplyNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 520 CeedVector u, CeedVector v) { 521 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 522 return CEED_ERROR_SUCCESS; 523 } 524 525 static int CeedBasisApplyAddNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 526 CeedVector u, CeedVector v) { 527 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 528 return CEED_ERROR_SUCCESS; 529 } 530 531 //------------------------------------------------------------------------------ 532 // Destroy basis 533 //------------------------------------------------------------------------------ 534 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 535 Ceed ceed; 536 CeedBasis_Cuda_shared *data; 537 538 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 539 CeedCallBackend(CeedBasisGetData(basis, &data)); 540 CeedCallCuda(ceed, cuModuleUnload(data->module)); 541 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 542 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 543 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 544 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 545 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 546 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 547 CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); 548 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 549 CeedCallBackend(CeedFree(&data)); 550 CeedCallBackend(CeedDestroy(&ceed)); 551 return CEED_ERROR_SUCCESS; 552 } 553 554 //------------------------------------------------------------------------------ 555 // Create tensor basis 556 //------------------------------------------------------------------------------ 557 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 558 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 559 Ceed ceed; 560 CeedInt num_comp; 561 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 562 const CeedInt interp_bytes = q_bytes * P_1d; 563 CeedBasis_Cuda_shared *data; 564 565 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 566 CeedCallBackend(CeedCalloc(1, &data)); 567 568 // Copy basis data to GPU 569 if (q_weight_1d) { 570 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 571 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 572 } 573 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 574 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 575 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 576 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 577 578 // Compute collocated gradient and copy to GPU 579 data->d_collo_grad_1d = NULL; 580 bool has_collocated_grad = dim == 3 && Q_1d >= P_1d; 581 582 if (has_collocated_grad) { 583 CeedScalar *collo_grad_1d; 584 585 CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); 586 CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); 587 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); 588 CeedCallCuda(ceed, cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, cudaMemcpyHostToDevice)); 589 CeedCallBackend(CeedFree(&collo_grad_1d)); 590 } 591 592 // Compile basis kernels 593 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor.h>\n"; 594 595 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 596 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", 597 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 598 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); 599 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 600 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 601 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 602 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 603 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 604 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 605 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 606 607 CeedCallBackend(CeedBasisSetData(basis, data)); 608 609 // Register backend functions 610 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); 611 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared)); 612 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared)); 613 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda_shared)); 614 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 615 CeedCallBackend(CeedDestroy(&ceed)); 616 return CEED_ERROR_SUCCESS; 617 } 618 619 //------------------------------------------------------------------------------ 620 // Create non-tensor basis 621 //------------------------------------------------------------------------------ 622 int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 623 const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 624 Ceed ceed; 625 CeedInt num_comp, q_comp_interp, q_comp_grad; 626 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 627 CeedBasis_Cuda_shared *data; 628 629 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 630 631 // Check shared memory size 632 { 633 Ceed_Cuda *cuda_data; 634 635 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 636 if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) > 637 cuda_data->device_prop.sharedMemPerBlock) { 638 CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis)); 639 CeedCallBackend(CeedDestroy(&ceed)); 640 return CEED_ERROR_SUCCESS; 641 } 642 } 643 644 CeedCallBackend(CeedCalloc(1, &data)); 645 646 // Copy basis data to GPU 647 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 648 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 649 if (q_weight) { 650 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 651 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight, q_bytes, cudaMemcpyHostToDevice)); 652 } 653 if (interp) { 654 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 655 656 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 657 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp, interp_bytes, cudaMemcpyHostToDevice)); 658 } 659 if (grad) { 660 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 661 662 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, grad_bytes)); 663 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad, grad_bytes, cudaMemcpyHostToDevice)); 664 } 665 666 // Compile basis kernels 667 const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor.h>\n"; 668 669 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 670 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D", 671 CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp)); 672 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 673 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 674 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 675 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 676 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 677 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 678 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 679 680 CeedCallBackend(CeedBasisSetData(basis, data)); 681 682 // Register backend functions 683 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda_shared)); 684 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda_shared)); 685 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 686 CeedCallBackend(CeedDestroy(&ceed)); 687 return CEED_ERROR_SUCCESS; 688 } 689 690 //------------------------------------------------------------------------------ 691