1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <stddef.h> 15 #include <string.h> 16 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #include "ceed-cuda-shared.h" 20 21 //------------------------------------------------------------------------------ 22 // Apply tensor basis 23 //------------------------------------------------------------------------------ 24 static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 25 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 26 Ceed ceed; 27 Ceed_Cuda *ceed_Cuda; 28 CeedInt dim, num_comp; 29 const CeedScalar *d_u; 30 CeedScalar *d_v; 31 CeedBasis_Cuda_shared *data; 32 33 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 34 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 35 CeedCallBackend(CeedBasisGetData(basis, &data)); 36 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 37 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 38 39 // Get read/write access to u, v 40 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 41 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 42 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 43 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 44 45 // Apply basis operation 46 switch (eval_mode) { 47 case CEED_EVAL_INTERP: { 48 CeedInt P_1d, Q_1d; 49 50 CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]); 51 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 52 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 53 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 54 55 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 56 57 if (dim == 1) { 58 // avoid >512 total threads 59 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 60 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 61 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 62 63 if (t_mode == CEED_TRANSPOSE) { 64 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1, 65 elems_per_block, shared_mem, interp_args)); 66 } else { 67 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); 68 } 69 } else if (dim == 2) { 70 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 71 // elems_per_block must be at least 1 72 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 73 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 74 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 75 76 if (t_mode == CEED_TRANSPOSE) { 77 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, 78 elems_per_block, shared_mem, interp_args)); 79 } else { 80 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 81 } 82 } else if (dim == 3) { 83 CeedInt elems_per_block = 1; 84 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 85 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 86 87 if (t_mode == CEED_TRANSPOSE) { 88 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, 89 elems_per_block, shared_mem, interp_args)); 90 } else { 91 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 92 } 93 } 94 } break; 95 case CEED_EVAL_GRAD: { 96 CeedInt P_1d, Q_1d; 97 98 CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]); 99 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 100 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 101 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 102 CeedScalar *d_grad_1d = data->d_grad_1d; 103 104 if (data->d_collo_grad_1d) { 105 d_grad_1d = data->d_collo_grad_1d; 106 } 107 void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v}; 108 109 if (dim == 1) { 110 // avoid >512 total threads 111 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 112 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 113 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 114 115 if (t_mode == CEED_TRANSPOSE) { 116 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1, 117 elems_per_block, shared_mem, grad_args)); 118 } else { 119 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); 120 } 121 } else if (dim == 2) { 122 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 123 // elems_per_block must be at least 1 124 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 125 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 126 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 127 128 if (t_mode == CEED_TRANSPOSE) { 129 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, 130 elems_per_block, shared_mem, grad_args)); 131 } else { 132 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 133 } 134 } else if (dim == 3) { 135 CeedInt elems_per_block = 1; 136 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 137 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 138 139 if (t_mode == CEED_TRANSPOSE) { 140 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, 141 elems_per_block, shared_mem, grad_args)); 142 } else { 143 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 144 } 145 } 146 } break; 147 case CEED_EVAL_WEIGHT: { 148 CeedInt Q_1d; 149 CeedInt block_size = 32; 150 151 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 152 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 153 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 154 if (dim == 1) { 155 const CeedInt elems_per_block = block_size / Q_1d; 156 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 157 158 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); 159 } else if (dim == 2) { 160 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 161 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 162 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 163 164 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 165 } else if (dim == 3) { 166 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 167 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 168 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 169 170 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 171 } 172 } break; 173 case CEED_EVAL_NONE: /* handled separately below */ 174 break; 175 // LCOV_EXCL_START 176 case CEED_EVAL_DIV: 177 case CEED_EVAL_CURL: 178 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 179 // LCOV_EXCL_STOP 180 } 181 182 // Restore vectors, cover CEED_EVAL_NONE 183 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 184 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 185 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 186 CeedCallBackend(CeedDestroy(&ceed)); 187 return CEED_ERROR_SUCCESS; 188 } 189 190 static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 191 CeedVector v) { 192 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 193 return CEED_ERROR_SUCCESS; 194 } 195 196 static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 197 CeedVector u, CeedVector v) { 198 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 199 return CEED_ERROR_SUCCESS; 200 } 201 202 //------------------------------------------------------------------------------ 203 // Basis apply - tensor AtPoints 204 //------------------------------------------------------------------------------ 205 static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 206 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 207 Ceed ceed; 208 Ceed_Cuda *ceed_Cuda; 209 CeedInt Q_1d, dim, num_comp, max_num_points = num_points[0]; 210 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 211 const CeedScalar *d_x, *d_u; 212 CeedScalar *d_v; 213 CeedBasis_Cuda_shared *data; 214 215 CeedCallBackend(CeedBasisGetData(basis, &data)); 216 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 217 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 218 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 219 220 // Weight handled separately 221 if (eval_mode == CEED_EVAL_WEIGHT) { 222 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 223 return CEED_ERROR_SUCCESS; 224 } 225 226 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 227 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 228 229 // Check padded to uniform number of points per elem 230 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 231 { 232 CeedInt q_comp; 233 CeedSize len, len_required; 234 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 235 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 236 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 237 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 238 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 239 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 240 len, len_required); 241 } 242 243 // Move num_points array to device 244 if (is_transpose) { 245 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 246 247 if (num_elem != data->num_elem_at_points) { 248 data->num_elem_at_points = num_elem; 249 250 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 251 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 252 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 253 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 254 } 255 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 256 memcpy(data->h_points_per_elem, num_points, num_bytes); 257 CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 258 } 259 } 260 261 // Build kernels if needed 262 if (data->num_points != max_num_points) { 263 CeedInt P_1d; 264 265 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 266 data->num_points = max_num_points; 267 268 // -- Create interp matrix to Chebyshev coefficients 269 if (!data->d_chebyshev_interp_1d) { 270 CeedSize interp_bytes; 271 CeedScalar *chebyshev_interp_1d; 272 273 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 274 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 275 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 276 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 277 CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 278 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 279 } 280 281 // -- Compile kernels 282 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h>\n"; 283 CeedInt num_comp; 284 285 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 286 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 287 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", 288 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 289 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points)); 290 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 291 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 292 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 293 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 294 } 295 296 // Get read/write access to u, v 297 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 298 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 299 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 300 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 301 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 302 303 // Clear v for transpose operation 304 if (is_transpose && !apply_add) { 305 CeedInt num_comp, q_comp, num_nodes; 306 CeedSize length; 307 308 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 309 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 310 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 311 length = 312 (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp)); 313 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 314 } 315 316 // Basis action 317 switch (eval_mode) { 318 case CEED_EVAL_INTERP: { 319 CeedInt P_1d, Q_1d; 320 321 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 322 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 323 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 324 325 void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 326 327 if (dim == 1) { 328 // avoid >512 total threads 329 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 330 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 331 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 332 333 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1, 334 elems_per_block, shared_mem, interp_args)); 335 } else if (dim == 2) { 336 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 337 // elems_per_block must be at least 1 338 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 339 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 340 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 341 342 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 343 thread_1d, elems_per_block, shared_mem, interp_args)); 344 } else if (dim == 3) { 345 CeedInt elems_per_block = 1; 346 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 347 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 348 349 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 350 thread_1d, elems_per_block, shared_mem, interp_args)); 351 } 352 } break; 353 case CEED_EVAL_GRAD: { 354 CeedInt P_1d, Q_1d; 355 356 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 357 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 358 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 359 360 void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 361 362 if (dim == 1) { 363 // avoid >512 total threads 364 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 365 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 366 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 367 368 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1, 369 elems_per_block, shared_mem, grad_args)); 370 } else if (dim == 2) { 371 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 372 // elems_per_block must be at least 1 373 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 374 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 375 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 376 377 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, 378 elems_per_block, shared_mem, grad_args)); 379 } else if (dim == 3) { 380 CeedInt elems_per_block = 1; 381 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 382 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 383 384 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, 385 elems_per_block, shared_mem, grad_args)); 386 } 387 } break; 388 case CEED_EVAL_WEIGHT: 389 case CEED_EVAL_NONE: /* handled separately below */ 390 break; 391 // LCOV_EXCL_START 392 case CEED_EVAL_DIV: 393 case CEED_EVAL_CURL: 394 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 395 // LCOV_EXCL_STOP 396 } 397 398 // Restore vectors, cover CEED_EVAL_NONE 399 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 400 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 401 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 402 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 403 CeedCallBackend(CeedDestroy(&ceed)); 404 return CEED_ERROR_SUCCESS; 405 } 406 407 static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 408 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 409 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 410 return CEED_ERROR_SUCCESS; 411 } 412 413 static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 414 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 415 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 416 return CEED_ERROR_SUCCESS; 417 } 418 419 //------------------------------------------------------------------------------ 420 // Apply non-tensor basis 421 //------------------------------------------------------------------------------ 422 static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 423 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 424 Ceed ceed; 425 Ceed_Cuda *ceed_Cuda; 426 CeedInt dim; 427 const CeedScalar *d_u; 428 CeedScalar *d_v; 429 CeedBasis_Cuda_shared *data; 430 431 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 432 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 433 CeedCallBackend(CeedBasisGetData(basis, &data)); 434 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 435 436 // Get read/write access to u, v 437 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 438 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 439 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 440 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 441 442 // Apply basis operation 443 switch (eval_mode) { 444 case CEED_EVAL_INTERP: { 445 CeedInt P, Q; 446 447 CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]); 448 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 449 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 450 CeedInt thread = CeedIntMax(Q, P); 451 452 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 453 454 { 455 // avoid >512 total threads 456 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 457 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 458 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 459 460 if (t_mode == CEED_TRANSPOSE) { 461 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread, 1, 462 elems_per_block, shared_mem, interp_args)); 463 } else { 464 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread, 1, elems_per_block, shared_mem, interp_args)); 465 } 466 } 467 } break; 468 case CEED_EVAL_GRAD: { 469 CeedInt P, Q; 470 471 CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]); 472 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 473 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 474 CeedInt thread = CeedIntMax(Q, P); 475 476 void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v}; 477 478 { 479 // avoid >512 total threads 480 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 481 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 482 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 483 484 if (t_mode == CEED_TRANSPOSE) { 485 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread, 1, 486 elems_per_block, shared_mem, grad_args)); 487 } else { 488 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread, 1, elems_per_block, shared_mem, grad_args)); 489 } 490 } 491 } break; 492 case CEED_EVAL_WEIGHT: { 493 CeedInt P, Q; 494 495 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 496 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 497 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 498 CeedInt thread = CeedIntMax(Q, P); 499 500 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 501 502 { 503 // avoid >512 total threads 504 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 505 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 506 507 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight_args)); 508 } 509 } break; 510 case CEED_EVAL_NONE: /* handled separately below */ 511 break; 512 // LCOV_EXCL_START 513 case CEED_EVAL_DIV: 514 case CEED_EVAL_CURL: 515 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 516 // LCOV_EXCL_STOP 517 } 518 519 // Restore vectors, cover CEED_EVAL_NONE 520 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 521 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 522 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 523 CeedCallBackend(CeedDestroy(&ceed)); 524 return CEED_ERROR_SUCCESS; 525 } 526 527 static int CeedBasisApplyNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 528 CeedVector u, CeedVector v) { 529 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 530 return CEED_ERROR_SUCCESS; 531 } 532 533 static int CeedBasisApplyAddNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 534 CeedVector u, CeedVector v) { 535 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 536 return CEED_ERROR_SUCCESS; 537 } 538 539 //------------------------------------------------------------------------------ 540 // Destroy basis 541 //------------------------------------------------------------------------------ 542 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 543 Ceed ceed; 544 CeedBasis_Cuda_shared *data; 545 546 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 547 CeedCallBackend(CeedBasisGetData(basis, &data)); 548 CeedCallCuda(ceed, cuModuleUnload(data->module)); 549 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 550 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 551 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 552 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 553 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 554 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 555 CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); 556 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 557 CeedCallBackend(CeedFree(&data)); 558 CeedCallBackend(CeedDestroy(&ceed)); 559 return CEED_ERROR_SUCCESS; 560 } 561 562 //------------------------------------------------------------------------------ 563 // Create tensor basis 564 //------------------------------------------------------------------------------ 565 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 566 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 567 Ceed ceed; 568 CeedInt num_comp; 569 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 570 const CeedInt interp_bytes = q_bytes * P_1d; 571 CeedBasis_Cuda_shared *data; 572 573 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 574 CeedCallBackend(CeedCalloc(1, &data)); 575 576 // Copy basis data to GPU 577 if (q_weight_1d) { 578 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 579 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 580 } 581 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 582 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 583 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 584 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 585 586 // Compute collocated gradient and copy to GPU 587 data->d_collo_grad_1d = NULL; 588 bool has_collocated_grad = dim == 3 && Q_1d >= P_1d; 589 590 if (has_collocated_grad) { 591 CeedScalar *collo_grad_1d; 592 593 CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); 594 CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); 595 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); 596 CeedCallCuda(ceed, cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, cudaMemcpyHostToDevice)); 597 CeedCallBackend(CeedFree(&collo_grad_1d)); 598 } 599 600 // Compile basis kernels 601 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor.h>\n"; 602 603 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 604 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", 605 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 606 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); 607 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 608 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 609 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 610 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 611 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 612 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 613 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 614 615 CeedCallBackend(CeedBasisSetData(basis, data)); 616 617 // Register backend functions 618 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); 619 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared)); 620 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared)); 621 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda_shared)); 622 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 623 CeedCallBackend(CeedDestroy(&ceed)); 624 return CEED_ERROR_SUCCESS; 625 } 626 627 //------------------------------------------------------------------------------ 628 // Create non-tensor basis 629 //------------------------------------------------------------------------------ 630 int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 631 const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 632 Ceed ceed; 633 CeedInt num_comp, q_comp_interp, q_comp_grad; 634 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 635 CeedBasis_Cuda_shared *data; 636 637 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 638 639 // Check shared memory size 640 { 641 Ceed_Cuda *cuda_data; 642 643 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 644 if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) > 645 cuda_data->device_prop.sharedMemPerBlock) { 646 CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis)); 647 CeedCallBackend(CeedDestroy(&ceed)); 648 return CEED_ERROR_SUCCESS; 649 } 650 } 651 652 CeedCallBackend(CeedCalloc(1, &data)); 653 654 // Copy basis data to GPU 655 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 656 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 657 if (q_weight) { 658 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 659 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight, q_bytes, cudaMemcpyHostToDevice)); 660 } 661 if (interp) { 662 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 663 664 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 665 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp, interp_bytes, cudaMemcpyHostToDevice)); 666 } 667 if (grad) { 668 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 669 670 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, grad_bytes)); 671 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad, grad_bytes, cudaMemcpyHostToDevice)); 672 } 673 674 // Compile basis kernels 675 const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor.h>\n"; 676 677 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 678 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D", 679 CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp)); 680 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 681 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 682 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 683 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 684 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 685 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 686 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 687 688 CeedCallBackend(CeedBasisSetData(basis, data)); 689 690 // Register backend functions 691 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda_shared)); 692 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda_shared)); 693 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 694 CeedCallBackend(CeedDestroy(&ceed)); 695 return CEED_ERROR_SUCCESS; 696 } 697 698 //------------------------------------------------------------------------------ 699