1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <stddef.h> 15 #include <string.h> 16 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #include "ceed-cuda-shared.h" 20 21 //------------------------------------------------------------------------------ 22 // Apply tensor basis 23 //------------------------------------------------------------------------------ 24 static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 25 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 26 Ceed ceed; 27 Ceed_Cuda *ceed_Cuda; 28 CeedInt dim, num_comp; 29 const CeedScalar *d_u; 30 CeedScalar *d_v; 31 CeedBasis_Cuda_shared *data; 32 33 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 34 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 35 CeedCallBackend(CeedBasisGetData(basis, &data)); 36 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 37 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 38 39 // Get read/write access to u, v 40 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 41 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 42 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 43 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 44 45 // Apply basis operation 46 switch (eval_mode) { 47 case CEED_EVAL_INTERP: { 48 CeedInt P_1d, Q_1d; 49 50 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 51 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 52 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 53 54 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 55 56 if (dim == 1) { 57 // avoid >512 total threads 58 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 59 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 60 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 61 62 if (t_mode == CEED_TRANSPOSE) { 63 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1, 64 elems_per_block, shared_mem, interp_args)); 65 } else { 66 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); 67 } 68 } else if (dim == 2) { 69 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 70 // elems_per_block must be at least 1 71 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 72 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 73 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 74 75 if (t_mode == CEED_TRANSPOSE) { 76 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, 77 elems_per_block, shared_mem, interp_args)); 78 } else { 79 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 80 } 81 } else if (dim == 3) { 82 CeedInt elems_per_block = 1; 83 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 84 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 85 86 if (t_mode == CEED_TRANSPOSE) { 87 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, 88 elems_per_block, shared_mem, interp_args)); 89 } else { 90 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 91 } 92 } 93 } break; 94 case CEED_EVAL_GRAD: { 95 CeedInt P_1d, Q_1d; 96 97 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 98 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 99 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 100 CeedScalar *d_grad_1d = data->d_grad_1d; 101 102 if (data->d_collo_grad_1d) { 103 d_grad_1d = data->d_collo_grad_1d; 104 } 105 void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v}; 106 107 if (dim == 1) { 108 // avoid >512 total threads 109 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 110 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 111 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 112 113 if (t_mode == CEED_TRANSPOSE) { 114 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1, 115 elems_per_block, shared_mem, grad_args)); 116 } else { 117 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); 118 } 119 } else if (dim == 2) { 120 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 121 // elems_per_block must be at least 1 122 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 123 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 124 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 125 126 if (t_mode == CEED_TRANSPOSE) { 127 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, 128 elems_per_block, shared_mem, grad_args)); 129 } else { 130 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 131 } 132 } else if (dim == 3) { 133 CeedInt elems_per_block = 1; 134 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 135 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 136 137 if (t_mode == CEED_TRANSPOSE) { 138 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, 139 elems_per_block, shared_mem, grad_args)); 140 } else { 141 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 142 } 143 } 144 } break; 145 case CEED_EVAL_WEIGHT: { 146 CeedInt Q_1d; 147 CeedInt block_size = 32; 148 149 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 150 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 151 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 152 if (dim == 1) { 153 const CeedInt elems_per_block = block_size / Q_1d; 154 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 155 156 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); 157 } else if (dim == 2) { 158 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 159 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 160 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 161 162 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 163 } else if (dim == 3) { 164 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 165 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 166 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 167 168 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 169 } 170 } break; 171 case CEED_EVAL_NONE: /* handled separately below */ 172 break; 173 // LCOV_EXCL_START 174 case CEED_EVAL_DIV: 175 case CEED_EVAL_CURL: 176 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 177 // LCOV_EXCL_STOP 178 } 179 180 // Restore vectors, cover CEED_EVAL_NONE 181 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 182 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 183 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 184 CeedCallBackend(CeedDestroy(&ceed)); 185 return CEED_ERROR_SUCCESS; 186 } 187 188 static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 189 CeedVector v) { 190 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 191 return CEED_ERROR_SUCCESS; 192 } 193 194 static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 195 CeedVector u, CeedVector v) { 196 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 197 return CEED_ERROR_SUCCESS; 198 } 199 200 //------------------------------------------------------------------------------ 201 // Basis apply - tensor AtPoints 202 //------------------------------------------------------------------------------ 203 static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 204 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 205 Ceed ceed; 206 Ceed_Cuda *ceed_Cuda; 207 CeedInt Q_1d, dim, num_comp, max_num_points = num_points[0]; 208 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 209 const CeedScalar *d_x, *d_u; 210 CeedScalar *d_v; 211 CeedBasis_Cuda_shared *data; 212 213 CeedCallBackend(CeedBasisGetData(basis, &data)); 214 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 215 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 216 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 217 218 // Weight handled separately 219 if (eval_mode == CEED_EVAL_WEIGHT) { 220 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 221 return CEED_ERROR_SUCCESS; 222 } 223 224 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 225 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 226 227 // Check padded to uniform number of points per elem 228 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 229 { 230 CeedInt q_comp; 231 CeedSize len, len_required; 232 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 233 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 234 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 235 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 236 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 237 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 238 len, len_required); 239 } 240 241 // Move num_points array to device 242 if (is_transpose) { 243 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 244 245 if (num_elem != data->num_elem_at_points) { 246 data->num_elem_at_points = num_elem; 247 248 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 249 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 250 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 251 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 252 } 253 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 254 memcpy(data->h_points_per_elem, num_points, num_bytes); 255 CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 256 } 257 } 258 259 // Build kernels if needed 260 if (data->num_points != max_num_points) { 261 CeedInt P_1d; 262 263 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 264 data->num_points = max_num_points; 265 266 // -- Create interp matrix to Chebyshev coefficients 267 if (!data->d_chebyshev_interp_1d) { 268 CeedSize interp_bytes; 269 CeedScalar *chebyshev_interp_1d; 270 271 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 272 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 273 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 274 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 275 CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 276 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 277 } 278 279 // -- Compile kernels 280 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h>\n"; 281 CeedInt num_comp; 282 283 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 284 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 285 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", 286 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 287 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points)); 288 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 289 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 290 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 291 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 292 } 293 294 // Get read/write access to u, v 295 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 296 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 297 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 298 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 299 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 300 301 // Clear v for transpose operation 302 if (is_transpose && !apply_add) { 303 CeedInt num_comp, q_comp, num_nodes; 304 CeedSize length; 305 306 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 307 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 308 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 309 length = 310 (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp)); 311 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 312 } 313 314 // Basis action 315 switch (eval_mode) { 316 case CEED_EVAL_INTERP: { 317 CeedInt P_1d, Q_1d; 318 319 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 320 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 321 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 322 323 void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 324 325 if (dim == 1) { 326 // avoid >512 total threads 327 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 328 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 329 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 330 331 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1, 332 elems_per_block, shared_mem, interp_args)); 333 } else if (dim == 2) { 334 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 335 // elems_per_block must be at least 1 336 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 337 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 338 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 339 340 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 341 thread_1d, elems_per_block, shared_mem, interp_args)); 342 } else if (dim == 3) { 343 CeedInt elems_per_block = 1; 344 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 345 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 346 347 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 348 thread_1d, elems_per_block, shared_mem, interp_args)); 349 } 350 } break; 351 case CEED_EVAL_GRAD: { 352 CeedInt P_1d, Q_1d; 353 354 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 355 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 356 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 357 358 void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 359 360 if (dim == 1) { 361 // avoid >512 total threads 362 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 363 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 364 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 365 366 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1, 367 elems_per_block, shared_mem, grad_args)); 368 } else if (dim == 2) { 369 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 370 // elems_per_block must be at least 1 371 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 372 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 373 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 374 375 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, 376 elems_per_block, shared_mem, grad_args)); 377 } else if (dim == 3) { 378 CeedInt elems_per_block = 1; 379 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 380 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 381 382 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, 383 elems_per_block, shared_mem, grad_args)); 384 } 385 } break; 386 case CEED_EVAL_WEIGHT: 387 case CEED_EVAL_NONE: /* handled separately below */ 388 break; 389 // LCOV_EXCL_START 390 case CEED_EVAL_DIV: 391 case CEED_EVAL_CURL: 392 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 393 // LCOV_EXCL_STOP 394 } 395 396 // Restore vectors, cover CEED_EVAL_NONE 397 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 398 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 399 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 400 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 401 CeedCallBackend(CeedDestroy(&ceed)); 402 return CEED_ERROR_SUCCESS; 403 } 404 405 static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 406 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 407 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 408 return CEED_ERROR_SUCCESS; 409 } 410 411 static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 412 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 413 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 414 return CEED_ERROR_SUCCESS; 415 } 416 417 //------------------------------------------------------------------------------ 418 // Apply non-tensor basis 419 //------------------------------------------------------------------------------ 420 static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 421 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 422 Ceed ceed; 423 Ceed_Cuda *ceed_Cuda; 424 CeedInt dim; 425 const CeedScalar *d_u; 426 CeedScalar *d_v; 427 CeedBasis_Cuda_shared *data; 428 429 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 430 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 431 CeedCallBackend(CeedBasisGetData(basis, &data)); 432 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 433 434 // Get read/write access to u, v 435 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 436 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 437 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 438 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 439 440 // Apply basis operation 441 switch (eval_mode) { 442 case CEED_EVAL_INTERP: { 443 CeedInt P, Q; 444 445 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 446 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 447 CeedInt thread = CeedIntMax(Q, P); 448 449 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 450 451 { 452 // avoid >512 total threads 453 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 454 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 455 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 456 457 if (t_mode == CEED_TRANSPOSE) { 458 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread, 1, 459 elems_per_block, shared_mem, interp_args)); 460 } else { 461 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread, 1, elems_per_block, shared_mem, interp_args)); 462 } 463 } 464 } break; 465 case CEED_EVAL_GRAD: { 466 CeedInt P, Q; 467 468 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 469 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 470 CeedInt thread = CeedIntMax(Q, P); 471 472 void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v}; 473 474 { 475 // avoid >512 total threads 476 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 477 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 478 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 479 480 if (t_mode == CEED_TRANSPOSE) { 481 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread, 1, 482 elems_per_block, shared_mem, grad_args)); 483 } else { 484 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread, 1, elems_per_block, shared_mem, grad_args)); 485 } 486 } 487 } break; 488 case CEED_EVAL_WEIGHT: { 489 CeedInt Q; 490 491 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 492 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 493 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 494 495 { 496 // avoid >512 total threads 497 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / Q, 1)); 498 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 499 500 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, Q, elems_per_block, 1, weight_args)); 501 } 502 } break; 503 case CEED_EVAL_NONE: /* handled separately below */ 504 break; 505 // LCOV_EXCL_START 506 case CEED_EVAL_DIV: 507 case CEED_EVAL_CURL: 508 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 509 // LCOV_EXCL_STOP 510 } 511 512 // Restore vectors, cover CEED_EVAL_NONE 513 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 514 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 515 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 516 CeedCallBackend(CeedDestroy(&ceed)); 517 return CEED_ERROR_SUCCESS; 518 } 519 520 static int CeedBasisApplyNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 521 CeedVector u, CeedVector v) { 522 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 523 return CEED_ERROR_SUCCESS; 524 } 525 526 static int CeedBasisApplyAddNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 527 CeedVector u, CeedVector v) { 528 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 529 return CEED_ERROR_SUCCESS; 530 } 531 532 //------------------------------------------------------------------------------ 533 // Destroy basis 534 //------------------------------------------------------------------------------ 535 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 536 Ceed ceed; 537 CeedBasis_Cuda_shared *data; 538 539 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 540 CeedCallBackend(CeedBasisGetData(basis, &data)); 541 CeedCallCuda(ceed, cuModuleUnload(data->module)); 542 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 543 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 544 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 545 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 546 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 547 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 548 CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); 549 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 550 CeedCallBackend(CeedFree(&data)); 551 CeedCallBackend(CeedDestroy(&ceed)); 552 return CEED_ERROR_SUCCESS; 553 } 554 555 //------------------------------------------------------------------------------ 556 // Create tensor basis 557 //------------------------------------------------------------------------------ 558 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 559 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 560 Ceed ceed; 561 CeedInt num_comp; 562 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 563 const CeedInt interp_bytes = q_bytes * P_1d; 564 CeedBasis_Cuda_shared *data; 565 566 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 567 CeedCallBackend(CeedCalloc(1, &data)); 568 569 // Copy basis data to GPU 570 if (q_weight_1d) { 571 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 572 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 573 } 574 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 575 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 576 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 577 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 578 579 // Compute collocated gradient and copy to GPU 580 data->d_collo_grad_1d = NULL; 581 bool has_collocated_grad = dim == 3 && Q_1d >= P_1d; 582 583 if (has_collocated_grad) { 584 CeedScalar *collo_grad_1d; 585 586 CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); 587 CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); 588 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); 589 CeedCallCuda(ceed, cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, cudaMemcpyHostToDevice)); 590 CeedCallBackend(CeedFree(&collo_grad_1d)); 591 } 592 593 // Compile basis kernels 594 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor.h>\n"; 595 596 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 597 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", 598 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 599 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); 600 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 601 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 602 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 603 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 604 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 605 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 606 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 607 608 CeedCallBackend(CeedBasisSetData(basis, data)); 609 610 // Register backend functions 611 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); 612 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared)); 613 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared)); 614 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda_shared)); 615 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 616 CeedCallBackend(CeedDestroy(&ceed)); 617 return CEED_ERROR_SUCCESS; 618 } 619 620 //------------------------------------------------------------------------------ 621 // Create non-tensor basis 622 //------------------------------------------------------------------------------ 623 int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 624 const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 625 Ceed ceed; 626 CeedInt num_comp, q_comp_interp, q_comp_grad; 627 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 628 CeedBasis_Cuda_shared *data; 629 630 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 631 632 // Check shared memory size 633 { 634 Ceed_Cuda *cuda_data; 635 636 CeedCallBackend(CeedGetData(ceed, &cuda_data)); 637 if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) > 638 cuda_data->device_prop.sharedMemPerBlock) { 639 CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis)); 640 CeedCallBackend(CeedDestroy(&ceed)); 641 return CEED_ERROR_SUCCESS; 642 } 643 } 644 645 CeedCallBackend(CeedCalloc(1, &data)); 646 647 // Copy basis data to GPU 648 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 649 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 650 if (q_weight) { 651 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 652 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight, q_bytes, cudaMemcpyHostToDevice)); 653 } 654 if (interp) { 655 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 656 657 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 658 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp, interp_bytes, cudaMemcpyHostToDevice)); 659 } 660 if (grad) { 661 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 662 663 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, grad_bytes)); 664 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad, grad_bytes, cudaMemcpyHostToDevice)); 665 } 666 667 // Compile basis kernels 668 const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor.h>\n"; 669 670 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 671 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D", 672 CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp)); 673 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 674 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 675 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 676 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 677 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 678 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 679 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 680 681 CeedCallBackend(CeedBasisSetData(basis, data)); 682 683 // Register backend functions 684 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda_shared)); 685 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda_shared)); 686 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 687 CeedCallBackend(CeedDestroy(&ceed)); 688 return CEED_ERROR_SUCCESS; 689 } 690 691 //------------------------------------------------------------------------------ 692