1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <string.h> 14 15 #include "../cuda/ceed-cuda-common.h" 16 #include "../cuda/ceed-cuda-compile.h" 17 #include "ceed-cuda-ref.h" 18 19 //------------------------------------------------------------------------------ 20 // Basis apply - tensor 21 //------------------------------------------------------------------------------ 22 static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 23 CeedVector u, CeedVector v) { 24 Ceed ceed; 25 CeedInt Q_1d, dim; 26 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 27 const int max_block_size = 32; 28 const CeedScalar *d_u; 29 CeedScalar *d_v; 30 CeedBasis_Cuda *data; 31 32 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 33 CeedCallBackend(CeedBasisGetData(basis, &data)); 34 35 // Get read/write access to u, v 36 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 37 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 38 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 39 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 40 41 // Clear v for transpose operation 42 if (is_transpose && !apply_add) { 43 CeedInt num_comp, q_comp, num_nodes, num_qpts; 44 CeedSize length; 45 46 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 47 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 48 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 49 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 50 length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 51 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 52 } 53 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 54 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 55 56 // Basis action 57 switch (eval_mode) { 58 case CEED_EVAL_INTERP: { 59 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 60 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 61 62 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args)); 63 } break; 64 case CEED_EVAL_GRAD: { 65 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 66 const CeedInt block_size = max_block_size; 67 68 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); 69 } break; 70 case CEED_EVAL_WEIGHT: { 71 CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 72 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 73 const int block_size_x = Q_1d; 74 const int block_size_y = dim >= 2 ? Q_1d : 1; 75 76 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 77 } break; 78 case CEED_EVAL_NONE: /* handled separately below */ 79 break; 80 // LCOV_EXCL_START 81 case CEED_EVAL_DIV: 82 case CEED_EVAL_CURL: 83 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 84 // LCOV_EXCL_STOP 85 } 86 87 // Restore vectors, cover CEED_EVAL_NONE 88 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 89 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 90 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 91 return CEED_ERROR_SUCCESS; 92 } 93 94 static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 95 CeedVector v) { 96 CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 97 return CEED_ERROR_SUCCESS; 98 } 99 100 static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 101 CeedVector v) { 102 CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 103 return CEED_ERROR_SUCCESS; 104 } 105 106 //------------------------------------------------------------------------------ 107 // Basis apply - tensor AtPoints 108 //------------------------------------------------------------------------------ 109 static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 110 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 111 Ceed ceed; 112 CeedInt Q_1d, dim, max_num_points = num_points[0]; 113 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 114 const int max_block_size = 32; 115 const CeedScalar *d_x, *d_u; 116 CeedScalar *d_v; 117 CeedBasis_Cuda *data; 118 119 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 120 CeedCallBackend(CeedBasisGetData(basis, &data)); 121 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 122 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 123 124 // Weight handled separately 125 if (eval_mode == CEED_EVAL_WEIGHT) { 126 CeedCallBackend(CeedVectorSetValue(v, 1.0)); 127 return CEED_ERROR_SUCCESS; 128 } 129 130 // Check padded to uniform number of points per elem 131 for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 132 { 133 CeedInt num_comp, q_comp; 134 CeedSize len, len_required; 135 136 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 137 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 138 CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 139 len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 140 CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 141 "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 142 " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 143 len, len_required); 144 } 145 146 // Move num_points array to device 147 if (is_transpose) { 148 const CeedInt num_bytes = num_elem * sizeof(CeedInt); 149 150 if (num_elem != data->num_elem_at_points) { 151 data->num_elem_at_points = num_elem; 152 153 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 154 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 155 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 156 CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 157 } 158 if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 159 memcpy(data->h_points_per_elem, num_points, num_bytes); 160 CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 161 } 162 } 163 164 // Build kernels if needed 165 if (data->num_points != max_num_points) { 166 CeedInt P_1d; 167 168 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 169 data->num_points = max_num_points; 170 171 // -- Create interp matrix to Chebyshev coefficients 172 if (!data->d_chebyshev_interp_1d) { 173 CeedSize interp_bytes; 174 CeedScalar *chebyshev_interp_1d; 175 176 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 177 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 178 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 179 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 180 CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 181 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 182 } 183 184 // -- Compile kernels 185 const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n"; 186 CeedInt num_comp; 187 188 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 189 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 190 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 191 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 192 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 193 max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 194 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 195 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 196 } 197 198 // Get read/write access to u, v 199 CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 200 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 201 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 202 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 203 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 204 205 // Clear v for transpose operation 206 if (is_transpose && !apply_add) { 207 CeedInt num_comp, q_comp, num_nodes; 208 CeedSize length; 209 210 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 211 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 212 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 213 length = 214 (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp)); 215 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 216 } 217 218 // Basis action 219 switch (eval_mode) { 220 case CEED_EVAL_INTERP: { 221 void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 222 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 223 224 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); 225 } break; 226 case CEED_EVAL_GRAD: { 227 void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 228 const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 229 230 CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); 231 } break; 232 case CEED_EVAL_WEIGHT: 233 case CEED_EVAL_NONE: /* handled separately below */ 234 break; 235 // LCOV_EXCL_START 236 case CEED_EVAL_DIV: 237 case CEED_EVAL_CURL: 238 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 239 // LCOV_EXCL_STOP 240 } 241 242 // Restore vectors, cover CEED_EVAL_NONE 243 CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 244 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 245 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 246 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 247 return CEED_ERROR_SUCCESS; 248 } 249 250 static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 251 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 252 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 253 return CEED_ERROR_SUCCESS; 254 } 255 256 static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 257 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 258 CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 259 return CEED_ERROR_SUCCESS; 260 } 261 262 //------------------------------------------------------------------------------ 263 // Basis apply - non-tensor 264 //------------------------------------------------------------------------------ 265 static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 266 CeedVector u, CeedVector v) { 267 Ceed ceed; 268 CeedInt num_nodes, num_qpts; 269 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 270 const int elems_per_block = 1; 271 const int grid = CeedDivUpInt(num_elem, elems_per_block); 272 const CeedScalar *d_u; 273 CeedScalar *d_v; 274 CeedBasisNonTensor_Cuda *data; 275 276 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 277 CeedCallBackend(CeedBasisGetData(basis, &data)); 278 CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 279 CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 280 281 // Get read/write access to u, v 282 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 283 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 284 if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 285 else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 286 287 // Clear v for transpose operation 288 if (is_transpose && !apply_add) { 289 CeedInt num_comp, q_comp; 290 CeedSize length; 291 292 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 293 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 294 length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 295 CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 296 } 297 298 // Apply basis operation 299 switch (eval_mode) { 300 case CEED_EVAL_INTERP: { 301 void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 302 const int block_size_x = is_transpose ? num_nodes : num_qpts; 303 304 if (is_transpose) { 305 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 306 } else { 307 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 308 } 309 } break; 310 case CEED_EVAL_GRAD: { 311 void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 312 const int block_size_x = is_transpose ? num_nodes : num_qpts; 313 314 if (is_transpose) { 315 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 316 } else { 317 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 318 } 319 } break; 320 case CEED_EVAL_DIV: { 321 void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 322 const int block_size_x = is_transpose ? num_nodes : num_qpts; 323 324 if (is_transpose) { 325 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 326 } else { 327 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 328 } 329 } break; 330 case CEED_EVAL_CURL: { 331 void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 332 const int block_size_x = is_transpose ? num_nodes : num_qpts; 333 334 if (is_transpose) { 335 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 336 } else { 337 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 338 } 339 } break; 340 case CEED_EVAL_WEIGHT: { 341 CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 342 void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 343 344 CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 345 } break; 346 case CEED_EVAL_NONE: /* handled separately below */ 347 break; 348 } 349 350 // Restore vectors, cover CEED_EVAL_NONE 351 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 352 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 353 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 354 return CEED_ERROR_SUCCESS; 355 } 356 357 static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 358 CeedVector v) { 359 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 360 return CEED_ERROR_SUCCESS; 361 } 362 363 static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 364 CeedVector v) { 365 CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 366 return CEED_ERROR_SUCCESS; 367 } 368 369 //------------------------------------------------------------------------------ 370 // Destroy tensor basis 371 //------------------------------------------------------------------------------ 372 static int CeedBasisDestroy_Cuda(CeedBasis basis) { 373 Ceed ceed; 374 CeedBasis_Cuda *data; 375 376 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 377 CeedCallBackend(CeedBasisGetData(basis, &data)); 378 CeedCallCuda(ceed, cuModuleUnload(data->module)); 379 if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 380 if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 381 CeedCallBackend(CeedFree(&data->h_points_per_elem)); 382 if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 383 CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 384 CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 385 CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 386 CeedCallBackend(CeedFree(&data)); 387 return CEED_ERROR_SUCCESS; 388 } 389 390 //------------------------------------------------------------------------------ 391 // Destroy non-tensor basis 392 //------------------------------------------------------------------------------ 393 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 394 Ceed ceed; 395 CeedBasisNonTensor_Cuda *data; 396 397 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 398 CeedCallBackend(CeedBasisGetData(basis, &data)); 399 CeedCallCuda(ceed, cuModuleUnload(data->module)); 400 if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 401 CeedCallCuda(ceed, cudaFree(data->d_interp)); 402 CeedCallCuda(ceed, cudaFree(data->d_grad)); 403 CeedCallCuda(ceed, cudaFree(data->d_div)); 404 CeedCallCuda(ceed, cudaFree(data->d_curl)); 405 CeedCallBackend(CeedFree(&data)); 406 return CEED_ERROR_SUCCESS; 407 } 408 409 //------------------------------------------------------------------------------ 410 // Create tensor 411 //------------------------------------------------------------------------------ 412 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 413 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 414 Ceed ceed; 415 CeedInt num_comp; 416 const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 417 const CeedInt interp_bytes = q_bytes * P_1d; 418 CeedBasis_Cuda *data; 419 420 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 421 CeedCallBackend(CeedCalloc(1, &data)); 422 423 // Copy data to GPU 424 if (q_weight_1d) { 425 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 426 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 427 } 428 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 429 CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 430 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 431 CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 432 433 // Compile basis kernels 434 const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor.h>\n"; 435 436 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 437 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 438 Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 439 "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 440 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 441 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 442 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 443 444 CeedCallBackend(CeedBasisSetData(basis, data)); 445 446 // Register backend functions 447 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 448 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); 449 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); 450 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); 451 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 452 return CEED_ERROR_SUCCESS; 453 } 454 455 //------------------------------------------------------------------------------ 456 // Create non-tensor H^1 457 //------------------------------------------------------------------------------ 458 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 459 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 460 Ceed ceed; 461 CeedInt num_comp, q_comp_interp, q_comp_grad; 462 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 463 CeedBasisNonTensor_Cuda *data; 464 465 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 466 CeedCallBackend(CeedCalloc(1, &data)); 467 468 // Copy basis data to GPU 469 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 470 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 471 if (q_weight) { 472 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 473 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 474 } 475 if (interp) { 476 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 477 478 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 479 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 480 } 481 if (grad) { 482 const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 483 484 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 485 CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 486 } 487 488 // Compile basis kernels 489 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 490 491 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 492 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 493 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 494 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 495 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 496 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 497 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 498 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 499 500 CeedCallBackend(CeedBasisSetData(basis, data)); 501 502 // Register backend functions 503 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 504 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 505 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 506 return CEED_ERROR_SUCCESS; 507 } 508 509 //------------------------------------------------------------------------------ 510 // Create non-tensor H(div) 511 //------------------------------------------------------------------------------ 512 int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 513 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 514 Ceed ceed; 515 CeedInt num_comp, q_comp_interp, q_comp_div; 516 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 517 CeedBasisNonTensor_Cuda *data; 518 519 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 520 CeedCallBackend(CeedCalloc(1, &data)); 521 522 // Copy basis data to GPU 523 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 524 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 525 if (q_weight) { 526 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 527 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 528 } 529 if (interp) { 530 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 531 532 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 533 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 534 } 535 if (div) { 536 const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 537 538 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes)); 539 CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice)); 540 } 541 542 // Compile basis kernels 543 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 544 545 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 546 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 547 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 548 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 549 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 550 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 551 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 552 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 553 554 CeedCallBackend(CeedBasisSetData(basis, data)); 555 556 // Register backend functions 557 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 558 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 559 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 560 return CEED_ERROR_SUCCESS; 561 } 562 563 //------------------------------------------------------------------------------ 564 // Create non-tensor H(curl) 565 //------------------------------------------------------------------------------ 566 int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 567 const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 568 Ceed ceed; 569 CeedInt num_comp, q_comp_interp, q_comp_curl; 570 const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 571 CeedBasisNonTensor_Cuda *data; 572 573 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 574 CeedCallBackend(CeedCalloc(1, &data)); 575 576 // Copy basis data to GPU 577 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 578 CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 579 if (q_weight) { 580 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 581 CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 582 } 583 if (interp) { 584 const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 585 586 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 587 CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 588 } 589 if (curl) { 590 const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 591 592 CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes)); 593 CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice)); 594 } 595 596 // Compile basis kernels 597 const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 598 599 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 600 CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 601 q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 602 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 603 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 604 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 605 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 606 CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 607 608 CeedCallBackend(CeedBasisSetData(basis, data)); 609 610 // Register backend functions 611 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 612 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 613 CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 614 return CEED_ERROR_SUCCESS; 615 } 616 617 //------------------------------------------------------------------------------ 618