1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <stddef.h> 15 #include <string.h> 16 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #include "ceed-cuda-shared.h" 20 21 //------------------------------------------------------------------------------ 22 // Apply tensor basis 23 //------------------------------------------------------------------------------ 24 static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 25 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 26 Ceed ceed; 27 Ceed_Cuda *ceed_Cuda; 28 CeedInt dim, num_comp; 29 const CeedScalar *d_u; 30 CeedScalar *d_v; 31 CeedBasis_Cuda_shared *data; 32 33 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 34 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 35 CeedCallBackend(CeedBasisGetData(basis, &data)); 36 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 37 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 38 39 // Get read/write access to u, v 40 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 41 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 42 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 43 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 44 45 // Apply basis operation 46 switch (eval_mode) { 47 case CEED_EVAL_INTERP: { 48 CeedInt P_1d, Q_1d; 49 50 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 51 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 52 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 53 54 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 55 56 if (dim == 1) { 57 // avoid >512 total threads 58 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 59 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 60 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 61 62 if (t_mode == CEED_TRANSPOSE) { 63 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1, 64 elems_per_block, shared_mem, interp_args)); 65 } else { 66 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); 67 } 68 } else if (dim == 2) { 69 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 70 // elems_per_block must be at least 1 71 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 72 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 73 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 74 75 if (t_mode == CEED_TRANSPOSE) { 76 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, 77 elems_per_block, shared_mem, interp_args)); 78 } else { 79 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 80 } 81 } else if (dim == 3) { 82 CeedInt elems_per_block = 1; 83 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 84 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 85 86 if (t_mode == CEED_TRANSPOSE) { 87 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, 88 elems_per_block, shared_mem, interp_args)); 89 } else { 90 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 91 } 92 } 93 } break; 94 case CEED_EVAL_GRAD: { 95 CeedInt P_1d, Q_1d; 96 97 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 98 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 99 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 100 CeedScalar *d_grad_1d = data->d_grad_1d; 101 102 if (data->d_collo_grad_1d) { 103 d_grad_1d = data->d_collo_grad_1d; 104 } 105 void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v}; 106 107 if (dim == 1) { 108 // avoid >512 total threads 109 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 110 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 111 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 112 113 if (t_mode == CEED_TRANSPOSE) { 114 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1, 115 elems_per_block, shared_mem, grad_args)); 116 } else { 117 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); 118 } 119 } else if (dim == 2) { 120 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 121 // elems_per_block must be at least 1 122 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 123 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 124 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 125 126 if (t_mode == CEED_TRANSPOSE) { 127 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, 128 elems_per_block, shared_mem, grad_args)); 129 } else { 130 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 131 } 132 } else if (dim == 3) { 133 CeedInt elems_per_block = 1; 134 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 135 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 136 137 if (t_mode == CEED_TRANSPOSE) { 138 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, 139 elems_per_block, shared_mem, grad_args)); 140 } else { 141 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 142 } 143 } 144 } break; 145 case CEED_EVAL_WEIGHT: { 146 CeedInt Q_1d; 147 CeedInt block_size = 32; 148 149 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 150 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 151 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 152 if (dim == 1) { 153 const CeedInt elems_per_block = block_size / Q_1d; 154 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 155 156 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); 157 } else if (dim == 2) { 158 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 159 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 160 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 161 162 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 163 } else if (dim == 3) { 164 const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 165 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 166 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 167 168 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 169 } 170 } break; 171 case CEED_EVAL_NONE: /* handled separately below */ 172 break; 173 // LCOV_EXCL_START 174 case CEED_EVAL_DIV: 175 case CEED_EVAL_CURL: 176 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 177 // LCOV_EXCL_STOP 178 } 179 180 // Restore vectors, cover CEED_EVAL_NONE 181 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 182 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 183 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 184 CeedCallBackend(CeedDestroy(&ceed)); 185 return CEED_ERROR_SUCCESS; 186 } 187 188 static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 189 CeedVector v) { 190 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 191 return CEED_ERROR_SUCCESS; 192 } 193 194 static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 195 CeedVector u, CeedVector v) { 196 CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 197 return CEED_ERROR_SUCCESS; 198 } 199 200 //------------------------------------------------------------------------------ 201 // Basis apply - tensor AtPoints 202 //------------------------------------------------------------------------------ 203 static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 204 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 205 Ceed ceed; 206 Ceed_Cuda *ceed_Cuda; 207 CeedInt Q_1d, dim, num_comp, max_num_points = num_points[0]; 208 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 209 const CeedScalar *d_x, *d_u; 210 CeedScalar *d_v; 211 CeedBasis_Cuda_shared *data; 212 213 CeedCallBackend(CeedBasisGetData(basis, &data)); 214 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 215 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 216 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 217 218 // Weight handled separately 219 if (eval_mode == CEED_EVAL_WEIGHT) { 220 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 221 return CEED_ERROR_SUCCESS; 222 } 223 224 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 225 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 226 227 // Check padded to uniform number of points per elem 228 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 229 { 230 CeedInt q_comp; 231 CeedSize len, len_required; 232 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 233 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 234 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 235 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 236 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 237 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 238 len, len_required); 239 } 240 241 // Move num_points array to device 242 if (is_transpose) { 243 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 244 245 if (num_elem != data->num_elem_at_points) { 246 data->num_elem_at_points = num_elem; 247 248 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 249 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 250 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 251 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 252 } 253 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 254 memcpy(data->h_points_per_elem, num_points, num_bytes); 255 CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 256 } 257 } 258 259 // Build kernels if needed 260 if (data->num_points != max_num_points) { 261 CeedInt P_1d; 262 263 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 264 data->num_points = max_num_points; 265 266 // -- Create interp matrix to Chebyshev coefficients 267 if (!data->d_chebyshev_interp_1d) { 268 CeedSize interp_bytes; 269 CeedScalar *chebyshev_interp_1d; 270 271 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 272 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 273 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 274 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 275 CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 276 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 277 } 278 279 // -- Compile kernels 280 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h>\n"; 281 CeedInt num_comp; 282 283 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 284 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 285 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", 286 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 287 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points)); 288 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 289 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 290 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 291 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 292 } 293 294 // Get read/write access to u, v 295 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 296 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 297 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 298 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 299 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 300 301 // Clear v for transpose operation 302 if (is_transpose && !apply_add) { 303 CeedInt num_comp, q_comp, num_nodes; 304 CeedSize length; 305 306 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 307 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 308 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 309 length = 310 (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp)); 311 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 312 } 313 314 // Basis action 315 switch (eval_mode) { 316 case CEED_EVAL_INTERP: { 317 CeedInt P_1d, Q_1d; 318 319 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 320 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 321 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 322 323 void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 324 325 if (dim == 1) { 326 // avoid >512 total threads 327 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 328 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 329 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 330 331 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1, 332 elems_per_block, shared_mem, interp_args)); 333 } else if (dim == 2) { 334 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 335 // elems_per_block must be at least 1 336 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 337 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 338 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 339 340 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 341 thread_1d, elems_per_block, shared_mem, interp_args)); 342 } else if (dim == 3) { 343 CeedInt elems_per_block = 1; 344 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 345 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 346 347 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 348 thread_1d, elems_per_block, shared_mem, interp_args)); 349 } 350 } break; 351 case CEED_EVAL_GRAD: { 352 CeedInt P_1d, Q_1d; 353 354 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 355 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 356 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 357 358 void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 359 360 if (dim == 1) { 361 // avoid >512 total threads 362 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); 363 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 364 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 365 366 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1, 367 elems_per_block, shared_mem, grad_args)); 368 } else if (dim == 2) { 369 const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 370 // elems_per_block must be at least 1 371 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); 372 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 373 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 374 375 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, 376 elems_per_block, shared_mem, grad_args)); 377 } else if (dim == 3) { 378 CeedInt elems_per_block = 1; 379 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 380 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 381 382 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, 383 elems_per_block, shared_mem, grad_args)); 384 } 385 } break; 386 case CEED_EVAL_WEIGHT: 387 case CEED_EVAL_NONE: /* handled separately below */ 388 break; 389 // LCOV_EXCL_START 390 case CEED_EVAL_DIV: 391 case CEED_EVAL_CURL: 392 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 393 // LCOV_EXCL_STOP 394 } 395 396 // Restore vectors, cover CEED_EVAL_NONE 397 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 398 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 399 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 400 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 401 CeedCallBackend(CeedDestroy(&ceed)); 402 return CEED_ERROR_SUCCESS; 403 } 404 405 static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 406 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 407 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 408 return CEED_ERROR_SUCCESS; 409 } 410 411 static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 412 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 413 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 414 return CEED_ERROR_SUCCESS; 415 } 416 417 //------------------------------------------------------------------------------ 418 // Apply non-tensor basis 419 //------------------------------------------------------------------------------ 420 static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 421 CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 422 Ceed ceed; 423 Ceed_Cuda *ceed_Cuda; 424 CeedInt dim; 425 const CeedScalar *d_u; 426 CeedScalar *d_v; 427 CeedBasis_Cuda_shared *data; 428 429 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 430 CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 431 CeedCallBackend(CeedBasisGetData(basis, &data)); 432 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 433 434 // Get read/write access to u, v 435 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 436 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 437 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 438 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 439 440 // Apply basis operation 441 switch (eval_mode) { 442 case CEED_EVAL_INTERP: { 443 CeedInt P, Q; 444 445 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 446 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 447 CeedInt thread = CeedIntMax(Q, P); 448 449 void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 450 451 { 452 // avoid >512 total threads 453 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 454 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 455 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 456 457 if (t_mode == CEED_TRANSPOSE) { 458 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread, 1, 459 elems_per_block, shared_mem, interp_args)); 460 } else { 461 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread, 1, elems_per_block, shared_mem, interp_args)); 462 } 463 } 464 } break; 465 case CEED_EVAL_GRAD: { 466 CeedInt P, Q; 467 468 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 469 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 470 CeedInt thread = CeedIntMax(Q, P); 471 472 void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v}; 473 474 { 475 // avoid >512 total threads 476 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 477 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 478 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 479 480 if (t_mode == CEED_TRANSPOSE) { 481 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread, 1, 482 elems_per_block, shared_mem, grad_args)); 483 } else { 484 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread, 1, elems_per_block, shared_mem, grad_args)); 485 } 486 } 487 } break; 488 case CEED_EVAL_WEIGHT: { 489 CeedInt P, Q; 490 491 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 492 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 493 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 494 CeedInt thread = CeedIntMax(Q, P); 495 496 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 497 498 { 499 // avoid >512 total threads 500 CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); 501 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 502 503 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight_args)); 504 } 505 } break; 506 case CEED_EVAL_NONE: /* handled separately below */ 507 break; 508 // LCOV_EXCL_START 509 case CEED_EVAL_DIV: 510 case CEED_EVAL_CURL: 511 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 512 // LCOV_EXCL_STOP 513 } 514 515 // Restore vectors, cover CEED_EVAL_NONE 516 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 517 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 518 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 519 CeedCallBackend(CeedDestroy(&ceed)); 520 return CEED_ERROR_SUCCESS; 521 } 522 523 static int CeedBasisApplyNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 524 CeedVector u, CeedVector v) { 525 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 526 return CEED_ERROR_SUCCESS; 527 } 528 529 static int CeedBasisApplyAddNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 530 CeedVector u, CeedVector v) { 531 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 532 return CEED_ERROR_SUCCESS; 533 } 534 535 //------------------------------------------------------------------------------ 536 // Destroy basis 537 //------------------------------------------------------------------------------ 538 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 539 Ceed ceed; 540 CeedBasis_Cuda_shared *data; 541 542 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 543 CeedCallBackend(CeedBasisGetData(basis, &data)); 544 CeedCallCuda(ceed, cuModuleUnload(data->module)); 545 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 546 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 547 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 548 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 549 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 550 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 551 CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); 552 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 553 CeedCallBackend(CeedFree(&data)); 554 CeedCallBackend(CeedDestroy(&ceed)); 555 return CEED_ERROR_SUCCESS; 556 } 557 558 //------------------------------------------------------------------------------ 559 // Create tensor basis 560 //------------------------------------------------------------------------------ 561 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 562 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 563 Ceed ceed; 564 CeedInt num_comp; 565 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 566 const CeedInt interp_bytes = q_bytes * P_1d; 567 CeedBasis_Cuda_shared *data; 568 569 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 570 CeedCallBackend(CeedCalloc(1, &data)); 571 572 // Copy basis data to GPU 573 if (q_weight_1d) { 574 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 575 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 576 } 577 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 578 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 579 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 580 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 581 582 // Compute collocated gradient and copy to GPU 583 data->d_collo_grad_1d = NULL; 584 bool has_collocated_grad = dim == 3 && Q_1d >= P_1d; 585 586 if (has_collocated_grad) { 587 CeedScalar *collo_grad_1d; 588 589 CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); 590 CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); 591 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); 592 CeedCallCuda(ceed, cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, cudaMemcpyHostToDevice)); 593 CeedCallBackend(CeedFree(&collo_grad_1d)); 594 } 595 596 // Compile basis kernels 597 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor.h>\n"; 598 599 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 600 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", 601 CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 602 "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); 603 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 604 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 605 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 606 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 607 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 608 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 609 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 610 611 CeedCallBackend(CeedBasisSetData(basis, data)); 612 613 // Register backend functions 614 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); 615 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared)); 616 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared)); 617 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda_shared)); 618 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 619 CeedCallBackend(CeedDestroy(&ceed)); 620 return CEED_ERROR_SUCCESS; 621 } 622 623 //------------------------------------------------------------------------------ 624 // Create non-tensor basis 625 //------------------------------------------------------------------------------ 626 int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 627 const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 628 Ceed ceed; 629 CeedInt num_comp, q_comp_interp, q_comp_grad; 630 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 631 CeedBasis_Cuda_shared *data; 632 633 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 634 CeedCallBackend(CeedCalloc(1, &data)); 635 636 // Check max sizes 637 CeedCheck(dim <= 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with dim > 3"); 638 CeedCheck(num_nodes * num_qpts * dim < 52 * 52 * 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with P * Q this large"); 639 640 // Copy basis data to GPU 641 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 642 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 643 if (q_weight) { 644 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 645 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight, q_bytes, cudaMemcpyHostToDevice)); 646 } 647 if (interp) { 648 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 649 650 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 651 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp, interp_bytes, cudaMemcpyHostToDevice)); 652 } 653 if (grad) { 654 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 655 656 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, grad_bytes)); 657 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad, grad_bytes, cudaMemcpyHostToDevice)); 658 } 659 660 // Compile basis kernels 661 const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor.h>\n"; 662 663 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 664 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D", 665 CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp)); 666 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 667 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 668 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 669 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 670 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); 671 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 672 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 673 674 CeedCallBackend(CeedBasisSetData(basis, data)); 675 676 // Register backend functions 677 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda_shared)); 678 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda_shared)); 679 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); 680 CeedCallBackend(CeedDestroy(&ceed)); 681 return CEED_ERROR_SUCCESS; 682 } 683 684 //------------------------------------------------------------------------------ 685