15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 90d0321e0SJeremy L Thompson #include <ceed/backend.h> 10437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 110d0321e0SJeremy L Thompson #include <cuda.h> 120d0321e0SJeremy L Thompson #include <cuda_runtime.h> 132b730f8bSJeremy L Thompson 1449aac155SJeremy L Thompson #include "../cuda/ceed-cuda-common.h" 150d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 162b730f8bSJeremy L Thompson #include "ceed-cuda-ref.h" 170d0321e0SJeremy L Thompson 180d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 190d0321e0SJeremy L Thompson // Basis apply - tensor 200d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 21*db2becc9SJeremy L Thompson static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 22*db2becc9SJeremy L Thompson CeedVector u, CeedVector v) { 230d0321e0SJeremy L Thompson Ceed ceed; 24ca735530SJeremy L Thompson CeedInt Q_1d, dim; 257bbbfca3SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 26437930d1SJeremy L Thompson const int max_block_size = 32; 270d0321e0SJeremy L Thompson const CeedScalar *d_u; 280d0321e0SJeremy L Thompson CeedScalar *d_v; 29ca735530SJeremy L Thompson CeedBasis_Cuda *data; 30ca735530SJeremy L Thompson 31ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 32ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 33ca735530SJeremy L Thompson 349ea2cfd9SJeremy L Thompson // Get read/write access to u, v 356574a04fSJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 366574a04fSJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 37*db2becc9SJeremy L Thompson if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 38*db2becc9SJeremy L Thompson else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 390d0321e0SJeremy L Thompson 400d0321e0SJeremy L Thompson // Clear v for transpose operation 41*db2becc9SJeremy L Thompson if (is_transpose && !apply_add) { 421f9221feSJeremy L Thompson CeedSize length; 43ca735530SJeremy L Thompson 442b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(v, &length)); 452b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 460d0321e0SJeremy L Thompson } 472b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 482b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 490d0321e0SJeremy L Thompson 500d0321e0SJeremy L Thompson // Basis action 51437930d1SJeremy L Thompson switch (eval_mode) { 520d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 537bbbfca3SJeremy L Thompson void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 54b2165e7aSSebastian Grimberg const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 550d0321e0SJeremy L Thompson 56eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args)); 570d0321e0SJeremy L Thompson } break; 580d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 597bbbfca3SJeremy L Thompson void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 60b2165e7aSSebastian Grimberg const CeedInt block_size = max_block_size; 610d0321e0SJeremy L Thompson 62eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); 630d0321e0SJeremy L Thompson } break; 640d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 65097cc795SJames Wright CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 66437930d1SJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 67b2165e7aSSebastian Grimberg const int block_size_x = Q_1d; 68b2165e7aSSebastian Grimberg const int block_size_y = dim >= 2 ? Q_1d : 1; 69b2165e7aSSebastian Grimberg 70b2165e7aSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 710d0321e0SJeremy L Thompson } break; 729ea2cfd9SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 739ea2cfd9SJeremy L Thompson break; 740d0321e0SJeremy L Thompson // LCOV_EXCL_START 750d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 760d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 77bcbe1c99SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 780d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 790d0321e0SJeremy L Thompson } 800d0321e0SJeremy L Thompson 819ea2cfd9SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 822b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 839ea2cfd9SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 849ea2cfd9SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 850d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 860d0321e0SJeremy L Thompson } 870d0321e0SJeremy L Thompson 88*db2becc9SJeremy L Thompson static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 89*db2becc9SJeremy L Thompson CeedVector v) { 90*db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 91*db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 92*db2becc9SJeremy L Thompson } 93*db2becc9SJeremy L Thompson 94*db2becc9SJeremy L Thompson static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 95*db2becc9SJeremy L Thompson CeedVector v) { 96*db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 97*db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 98*db2becc9SJeremy L Thompson } 99*db2becc9SJeremy L Thompson 1000d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 10134d14614SJeremy L Thompson // Basis apply - tensor AtPoints 10234d14614SJeremy L Thompson //------------------------------------------------------------------------------ 103*db2becc9SJeremy L Thompson static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 104*db2becc9SJeremy L Thompson CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 10534d14614SJeremy L Thompson Ceed ceed; 10634d14614SJeremy L Thompson CeedInt Q_1d, dim, max_num_points = num_points[0]; 10734d14614SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 10834d14614SJeremy L Thompson const int max_block_size = 32; 10934d14614SJeremy L Thompson const CeedScalar *d_x, *d_u; 11034d14614SJeremy L Thompson CeedScalar *d_v; 11134d14614SJeremy L Thompson CeedBasis_Cuda *data; 11234d14614SJeremy L Thompson 11334d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 11434d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 11534d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 11634d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 11734d14614SJeremy L Thompson 11834d14614SJeremy L Thompson // Check uniform number of points per elem 11934d14614SJeremy L Thompson for (CeedInt i = 1; i < num_elem; i++) { 12034d14614SJeremy L Thompson CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND, 12134d14614SJeremy L Thompson "BasisApplyAtPoints only supported for the same number of points in each element"); 12234d14614SJeremy L Thompson } 12334d14614SJeremy L Thompson 12434d14614SJeremy L Thompson // Weight handled separately 12534d14614SJeremy L Thompson if (eval_mode == CEED_EVAL_WEIGHT) { 12634d14614SJeremy L Thompson CeedCall(CeedVectorSetValue(v, 1.0)); 12734d14614SJeremy L Thompson return CEED_ERROR_SUCCESS; 12834d14614SJeremy L Thompson } 12934d14614SJeremy L Thompson 13034d14614SJeremy L Thompson // Build kernels if needed 13134d14614SJeremy L Thompson if (data->num_points != max_num_points) { 13234d14614SJeremy L Thompson CeedInt P_1d; 13334d14614SJeremy L Thompson 13434d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 13534d14614SJeremy L Thompson data->num_points = max_num_points; 13634d14614SJeremy L Thompson 13734d14614SJeremy L Thompson // -- Create interp matrix to Chebyshev coefficients 13834d14614SJeremy L Thompson if (!data->d_chebyshev_interp_1d) { 13934d14614SJeremy L Thompson CeedSize interp_bytes; 14034d14614SJeremy L Thompson CeedScalar *chebyshev_interp_1d; 14134d14614SJeremy L Thompson 14234d14614SJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 14334d14614SJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 14434d14614SJeremy L Thompson CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 14534d14614SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 14634d14614SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 14734d14614SJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 14834d14614SJeremy L Thompson } 14934d14614SJeremy L Thompson 15034d14614SJeremy L Thompson // -- Compile kernels 15134d14614SJeremy L Thompson char *basis_kernel_source; 15234d14614SJeremy L Thompson const char *basis_kernel_path; 15334d14614SJeremy L Thompson CeedInt num_comp; 15434d14614SJeremy L Thompson 15534d14614SJeremy L Thompson if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 15634d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 15734d14614SJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path)); 15834d14614SJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 15934d14614SJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 16034d14614SJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 16134d14614SJeremy L Thompson CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 162f7c9815fSJeremy L Thompson Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 16334d14614SJeremy L Thompson "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 164f7c9815fSJeremy L Thompson max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 16534d14614SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 16634d14614SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 16734d14614SJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_path)); 16834d14614SJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 16934d14614SJeremy L Thompson } 17034d14614SJeremy L Thompson 17134d14614SJeremy L Thompson // Get read/write access to u, v 17234d14614SJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 17334d14614SJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 17434d14614SJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 175*db2becc9SJeremy L Thompson if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 176*db2becc9SJeremy L Thompson else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 17734d14614SJeremy L Thompson 17834d14614SJeremy L Thompson // Clear v for transpose operation 179*db2becc9SJeremy L Thompson if (is_transpose && !apply_add) { 18034d14614SJeremy L Thompson CeedSize length; 18134d14614SJeremy L Thompson 18234d14614SJeremy L Thompson CeedCallBackend(CeedVectorGetLength(v, &length)); 18334d14614SJeremy L Thompson CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 18434d14614SJeremy L Thompson } 18534d14614SJeremy L Thompson 18634d14614SJeremy L Thompson // Basis action 18734d14614SJeremy L Thompson switch (eval_mode) { 18834d14614SJeremy L Thompson case CEED_EVAL_INTERP: { 18934d14614SJeremy L Thompson void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v}; 19034d14614SJeremy L Thompson const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 19134d14614SJeremy L Thompson 19234d14614SJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); 19334d14614SJeremy L Thompson } break; 19434d14614SJeremy L Thompson case CEED_EVAL_GRAD: { 19534d14614SJeremy L Thompson void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v}; 1962d10e82cSJeremy L Thompson const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 19734d14614SJeremy L Thompson 19834d14614SJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); 19934d14614SJeremy L Thompson } break; 20034d14614SJeremy L Thompson case CEED_EVAL_WEIGHT: 20134d14614SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 20234d14614SJeremy L Thompson break; 20334d14614SJeremy L Thompson // LCOV_EXCL_START 20434d14614SJeremy L Thompson case CEED_EVAL_DIV: 20534d14614SJeremy L Thompson case CEED_EVAL_CURL: 20634d14614SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 20734d14614SJeremy L Thompson // LCOV_EXCL_STOP 20834d14614SJeremy L Thompson } 20934d14614SJeremy L Thompson 21034d14614SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 21134d14614SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 21234d14614SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 21334d14614SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 21434d14614SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 21534d14614SJeremy L Thompson return CEED_ERROR_SUCCESS; 21634d14614SJeremy L Thompson } 21734d14614SJeremy L Thompson 218*db2becc9SJeremy L Thompson static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 219*db2becc9SJeremy L Thompson CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 220*db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 221*db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 222*db2becc9SJeremy L Thompson } 223*db2becc9SJeremy L Thompson 224*db2becc9SJeremy L Thompson static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 225*db2becc9SJeremy L Thompson CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 226*db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 227*db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 228*db2becc9SJeremy L Thompson } 229*db2becc9SJeremy L Thompson 23034d14614SJeremy L Thompson //------------------------------------------------------------------------------ 2310d0321e0SJeremy L Thompson // Basis apply - non-tensor 2320d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 233*db2becc9SJeremy L Thompson static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 234*db2becc9SJeremy L Thompson CeedVector u, CeedVector v) { 2350d0321e0SJeremy L Thompson Ceed ceed; 236437930d1SJeremy L Thompson CeedInt num_nodes, num_qpts; 2377bbbfca3SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 238d075f50bSSebastian Grimberg const int elems_per_block = 1; 239d075f50bSSebastian Grimberg const int grid = CeedDivUpInt(num_elem, elems_per_block); 2400d0321e0SJeremy L Thompson const CeedScalar *d_u; 2410d0321e0SJeremy L Thompson CeedScalar *d_v; 242ca735530SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 243ca735530SJeremy L Thompson 244ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 245ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 246ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 247ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 248ca735530SJeremy L Thompson 2499ea2cfd9SJeremy L Thompson // Get read/write access to u, v 2509ea2cfd9SJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 2519ea2cfd9SJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 252*db2becc9SJeremy L Thompson if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 253*db2becc9SJeremy L Thompson else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 2540d0321e0SJeremy L Thompson 2550d0321e0SJeremy L Thompson // Clear v for transpose operation 256*db2becc9SJeremy L Thompson if (is_transpose && !apply_add) { 2571f9221feSJeremy L Thompson CeedSize length; 258ca735530SJeremy L Thompson 2592b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(v, &length)); 2602b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 2610d0321e0SJeremy L Thompson } 2620d0321e0SJeremy L Thompson 2630d0321e0SJeremy L Thompson // Apply basis operation 264437930d1SJeremy L Thompson switch (eval_mode) { 2650d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 266d075f50bSSebastian Grimberg void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 2677bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 268b2165e7aSSebastian Grimberg 2697bbbfca3SJeremy L Thompson if (is_transpose) { 270d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 271d075f50bSSebastian Grimberg } else { 272b2165e7aSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 273d075f50bSSebastian Grimberg } 2740d0321e0SJeremy L Thompson } break; 2750d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 276d075f50bSSebastian Grimberg void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 2777bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 278b2165e7aSSebastian Grimberg 2797bbbfca3SJeremy L Thompson if (is_transpose) { 280d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 281d075f50bSSebastian Grimberg } else { 282d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 283d075f50bSSebastian Grimberg } 284d075f50bSSebastian Grimberg } break; 285d075f50bSSebastian Grimberg case CEED_EVAL_DIV: { 286d075f50bSSebastian Grimberg void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 2877bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 288d075f50bSSebastian Grimberg 2897bbbfca3SJeremy L Thompson if (is_transpose) { 290d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 291d075f50bSSebastian Grimberg } else { 292d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 293d075f50bSSebastian Grimberg } 294d075f50bSSebastian Grimberg } break; 295d075f50bSSebastian Grimberg case CEED_EVAL_CURL: { 296d075f50bSSebastian Grimberg void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 2977bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 298d075f50bSSebastian Grimberg 2997bbbfca3SJeremy L Thompson if (is_transpose) { 300d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 301d075f50bSSebastian Grimberg } else { 302d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 303d075f50bSSebastian Grimberg } 3040d0321e0SJeremy L Thompson } break; 3050d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 306097cc795SJames Wright CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 307437930d1SJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 308b2165e7aSSebastian Grimberg 309eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 3100d0321e0SJeremy L Thompson } break; 3119ea2cfd9SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 3129ea2cfd9SJeremy L Thompson break; 3130d0321e0SJeremy L Thompson } 3140d0321e0SJeremy L Thompson 3159ea2cfd9SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 3162b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 3179ea2cfd9SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 3189ea2cfd9SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 3190d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3200d0321e0SJeremy L Thompson } 3210d0321e0SJeremy L Thompson 322*db2becc9SJeremy L Thompson static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 323*db2becc9SJeremy L Thompson CeedVector v) { 324*db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 325*db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 326*db2becc9SJeremy L Thompson } 327*db2becc9SJeremy L Thompson 328*db2becc9SJeremy L Thompson static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 329*db2becc9SJeremy L Thompson CeedVector v) { 330*db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 331*db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 332*db2becc9SJeremy L Thompson } 333*db2becc9SJeremy L Thompson 3340d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3350d0321e0SJeremy L Thompson // Destroy tensor basis 3360d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3370d0321e0SJeremy L Thompson static int CeedBasisDestroy_Cuda(CeedBasis basis) { 3380d0321e0SJeremy L Thompson Ceed ceed; 3390d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 340ca735530SJeremy L Thompson 341ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 3422b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 3432b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(data->module)); 34434d14614SJeremy L Thompson if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 345097cc795SJames Wright if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 3462b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 3472b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 34834d14614SJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 3492b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&data)); 3500d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3510d0321e0SJeremy L Thompson } 3520d0321e0SJeremy L Thompson 3530d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3540d0321e0SJeremy L Thompson // Destroy non-tensor basis 3550d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3560d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 3570d0321e0SJeremy L Thompson Ceed ceed; 3580d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 359ca735530SJeremy L Thompson 360ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 3612b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 3622b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(data->module)); 363097cc795SJames Wright if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 3642b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_interp)); 3652b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_grad)); 366d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaFree(data->d_div)); 367d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaFree(data->d_curl)); 3682b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&data)); 3690d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3700d0321e0SJeremy L Thompson } 3710d0321e0SJeremy L Thompson 3720d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3730d0321e0SJeremy L Thompson // Create tensor 3740d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3752b730f8bSJeremy L Thompson int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 3762b730f8bSJeremy L Thompson const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 3770d0321e0SJeremy L Thompson Ceed ceed; 37822070f95SJeremy L Thompson char *basis_kernel_source; 37922070f95SJeremy L Thompson const char *basis_kernel_path; 380ca735530SJeremy L Thompson CeedInt num_comp; 381ca735530SJeremy L Thompson const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 382ca735530SJeremy L Thompson const CeedInt interp_bytes = q_bytes * P_1d; 3830d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 384ca735530SJeremy L Thompson 385ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 3862b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 3870d0321e0SJeremy L Thompson 3880d0321e0SJeremy L Thompson // Copy data to GPU 389097cc795SJames Wright if (q_weight_1d) { 3902b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 3912b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 392097cc795SJames Wright } 3932b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 3942b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 3952b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 3962b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 3970d0321e0SJeremy L Thompson 398ecc88aebSJeremy L Thompson // Compile basis kernels 3992b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 4002b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path)); 40123d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 4022b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 40323d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 404eb7e6cafSJeremy L Thompson CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 405f7c9815fSJeremy L Thompson Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 4062b730f8bSJeremy L Thompson "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 407eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 408eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 409eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 4102b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_path)); 4112b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 412437930d1SJeremy L Thompson 4132b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 4140d0321e0SJeremy L Thompson 415d075f50bSSebastian Grimberg // Register backend functions 4162b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 417*db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); 41834d14614SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); 419*db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); 4202b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 4210d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 4220d0321e0SJeremy L Thompson } 4230d0321e0SJeremy L Thompson 4240d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 425d075f50bSSebastian Grimberg // Create non-tensor H^1 4260d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 4272b730f8bSJeremy L Thompson int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 42851475c7cSJeremy L Thompson const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 4290d0321e0SJeremy L Thompson Ceed ceed; 43022070f95SJeremy L Thompson char *basis_kernel_source; 43122070f95SJeremy L Thompson const char *basis_kernel_path; 432d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_grad; 433ca735530SJeremy L Thompson const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 4340d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 435ca735530SJeremy L Thompson 436ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 4372b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 4380d0321e0SJeremy L Thompson 4390d0321e0SJeremy L Thompson // Copy basis data to GPU 440d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 441d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 442097cc795SJames Wright if (q_weight) { 4432b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 4442b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 445097cc795SJames Wright } 446d075f50bSSebastian Grimberg if (interp) { 447d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 448d075f50bSSebastian Grimberg 4492b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 4502b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 451d075f50bSSebastian Grimberg } 452d075f50bSSebastian Grimberg if (grad) { 453d075f50bSSebastian Grimberg const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 454d075f50bSSebastian Grimberg 4552b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 4562b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 457d075f50bSSebastian Grimberg } 4580d0321e0SJeremy L Thompson 4590d0321e0SJeremy L Thompson // Compile basis kernels 4602b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 4612b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 46223d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 4632b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 46423d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 465d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 466d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 467d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 468d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 469d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 470d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 471d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 472d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_path)); 473d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 474d075f50bSSebastian Grimberg 475d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, data)); 476d075f50bSSebastian Grimberg 477d075f50bSSebastian Grimberg // Register backend functions 478d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 479*db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 480d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 481d075f50bSSebastian Grimberg return CEED_ERROR_SUCCESS; 482d075f50bSSebastian Grimberg } 483d075f50bSSebastian Grimberg 484d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 485d075f50bSSebastian Grimberg // Create non-tensor H(div) 486d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 487d075f50bSSebastian Grimberg int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 488d075f50bSSebastian Grimberg const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 489d075f50bSSebastian Grimberg Ceed ceed; 49022070f95SJeremy L Thompson char *basis_kernel_source; 49122070f95SJeremy L Thompson const char *basis_kernel_path; 492d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_div; 493d075f50bSSebastian Grimberg const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 494d075f50bSSebastian Grimberg CeedBasisNonTensor_Cuda *data; 495d075f50bSSebastian Grimberg 496d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 497d075f50bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &data)); 498d075f50bSSebastian Grimberg 499d075f50bSSebastian Grimberg // Copy basis data to GPU 500d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 501d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 502097cc795SJames Wright if (q_weight) { 503d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 504d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 505097cc795SJames Wright } 506d075f50bSSebastian Grimberg if (interp) { 507d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 508d075f50bSSebastian Grimberg 509d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 510d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 511d075f50bSSebastian Grimberg } 512d075f50bSSebastian Grimberg if (div) { 513d075f50bSSebastian Grimberg const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 514d075f50bSSebastian Grimberg 515d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes)); 516d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice)); 517d075f50bSSebastian Grimberg } 518d075f50bSSebastian Grimberg 519d075f50bSSebastian Grimberg // Compile basis kernels 520d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 521d075f50bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 522d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 523d075f50bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 524d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 525d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 526d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 527d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 528d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 529d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 530d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 531d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 532d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_path)); 533d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 534d075f50bSSebastian Grimberg 535d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, data)); 536d075f50bSSebastian Grimberg 537d075f50bSSebastian Grimberg // Register backend functions 538d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 539*db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 540d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 541d075f50bSSebastian Grimberg return CEED_ERROR_SUCCESS; 542d075f50bSSebastian Grimberg } 543d075f50bSSebastian Grimberg 544d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 545d075f50bSSebastian Grimberg // Create non-tensor H(curl) 546d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 547d075f50bSSebastian Grimberg int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 548d075f50bSSebastian Grimberg const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 549d075f50bSSebastian Grimberg Ceed ceed; 55022070f95SJeremy L Thompson char *basis_kernel_source; 55122070f95SJeremy L Thompson const char *basis_kernel_path; 552d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_curl; 553d075f50bSSebastian Grimberg const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 554d075f50bSSebastian Grimberg CeedBasisNonTensor_Cuda *data; 555d075f50bSSebastian Grimberg 556d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 557d075f50bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &data)); 558d075f50bSSebastian Grimberg 559d075f50bSSebastian Grimberg // Copy basis data to GPU 560d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 561d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 562097cc795SJames Wright if (q_weight) { 563d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 564d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 565097cc795SJames Wright } 566d075f50bSSebastian Grimberg if (interp) { 567d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 568d075f50bSSebastian Grimberg 569d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 570d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 571d075f50bSSebastian Grimberg } 572d075f50bSSebastian Grimberg if (curl) { 573d075f50bSSebastian Grimberg const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 574d075f50bSSebastian Grimberg 575d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes)); 576d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice)); 577d075f50bSSebastian Grimberg } 578d075f50bSSebastian Grimberg 579d075f50bSSebastian Grimberg // Compile basis kernels 580d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 581d075f50bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 582d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 583d075f50bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 584d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 585d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 586d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 587d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 588d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 589d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 590d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 591d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 5922b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_path)); 5932b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 5940d0321e0SJeremy L Thompson 5952b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 5960d0321e0SJeremy L Thompson 5970d0321e0SJeremy L Thompson // Register backend functions 5982b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 599*db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 6002b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 6010d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 6020d0321e0SJeremy L Thompson } 6032a86cc9dSSebastian Grimberg 6040d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 605