15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 90d0321e0SJeremy L Thompson #include <ceed/backend.h> 10437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 110d0321e0SJeremy L Thompson #include <cuda.h> 120d0321e0SJeremy L Thompson #include <cuda_runtime.h> 13111870feSJeremy L Thompson #include <string.h> 142b730f8bSJeremy L Thompson 1549aac155SJeremy L Thompson #include "../cuda/ceed-cuda-common.h" 160d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 172b730f8bSJeremy L Thompson #include "ceed-cuda-ref.h" 180d0321e0SJeremy L Thompson 190d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 200d0321e0SJeremy L Thompson // Basis apply - tensor 210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 22db2becc9SJeremy L Thompson static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 23db2becc9SJeremy L Thompson CeedVector u, CeedVector v) { 240d0321e0SJeremy L Thompson Ceed ceed; 25ca735530SJeremy L Thompson CeedInt Q_1d, dim; 267bbbfca3SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 27437930d1SJeremy L Thompson const int max_block_size = 32; 280d0321e0SJeremy L Thompson const CeedScalar *d_u; 290d0321e0SJeremy L Thompson CeedScalar *d_v; 30ca735530SJeremy L Thompson CeedBasis_Cuda *data; 31ca735530SJeremy L Thompson 32ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 33ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 34ca735530SJeremy L Thompson 359ea2cfd9SJeremy L Thompson // Get read/write access to u, v 366574a04fSJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 376574a04fSJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 38*9dafd6dfSZach Atkins if (apply_add) { 39*9dafd6dfSZach Atkins CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 40*9dafd6dfSZach Atkins } else { 410d0321e0SJeremy L Thompson // Clear v for transpose operation 42*9dafd6dfSZach Atkins if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); 43*9dafd6dfSZach Atkins CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 440d0321e0SJeremy L Thompson } 452b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 462b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 470d0321e0SJeremy L Thompson 480d0321e0SJeremy L Thompson // Basis action 49437930d1SJeremy L Thompson switch (eval_mode) { 500d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 517bbbfca3SJeremy L Thompson void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 52b2165e7aSSebastian Grimberg const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 530d0321e0SJeremy L Thompson 54eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args)); 550d0321e0SJeremy L Thompson } break; 560d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 577bbbfca3SJeremy L Thompson void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 58b2165e7aSSebastian Grimberg const CeedInt block_size = max_block_size; 590d0321e0SJeremy L Thompson 60eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); 610d0321e0SJeremy L Thompson } break; 620d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 63097cc795SJames Wright CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 64437930d1SJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 65b2165e7aSSebastian Grimberg const int block_size_x = Q_1d; 66b2165e7aSSebastian Grimberg const int block_size_y = dim >= 2 ? Q_1d : 1; 67b2165e7aSSebastian Grimberg 68b2165e7aSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 690d0321e0SJeremy L Thompson } break; 709ea2cfd9SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 719ea2cfd9SJeremy L Thompson break; 720d0321e0SJeremy L Thompson // LCOV_EXCL_START 730d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 740d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 75bcbe1c99SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 760d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 770d0321e0SJeremy L Thompson } 780d0321e0SJeremy L Thompson 799ea2cfd9SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 802b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 819ea2cfd9SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 829ea2cfd9SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 839bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 840d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 850d0321e0SJeremy L Thompson } 860d0321e0SJeremy L Thompson 87db2becc9SJeremy L Thompson static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 88db2becc9SJeremy L Thompson CeedVector v) { 89db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 90db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 91db2becc9SJeremy L Thompson } 92db2becc9SJeremy L Thompson 93db2becc9SJeremy L Thompson static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 94db2becc9SJeremy L Thompson CeedVector v) { 95db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 96db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 97db2becc9SJeremy L Thompson } 98db2becc9SJeremy L Thompson 990d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 10034d14614SJeremy L Thompson // Basis apply - tensor AtPoints 10134d14614SJeremy L Thompson //------------------------------------------------------------------------------ 102db2becc9SJeremy L Thompson static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 103db2becc9SJeremy L Thompson CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 10434d14614SJeremy L Thompson Ceed ceed; 10534d14614SJeremy L Thompson CeedInt Q_1d, dim, max_num_points = num_points[0]; 10634d14614SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 10734d14614SJeremy L Thompson const int max_block_size = 32; 10834d14614SJeremy L Thompson const CeedScalar *d_x, *d_u; 10934d14614SJeremy L Thompson CeedScalar *d_v; 11034d14614SJeremy L Thompson CeedBasis_Cuda *data; 11134d14614SJeremy L Thompson 11234d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 11334d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 11434d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 11534d14614SJeremy L Thompson 11634d14614SJeremy L Thompson // Weight handled separately 11734d14614SJeremy L Thompson if (eval_mode == CEED_EVAL_WEIGHT) { 1185a5594ffSJeremy L Thompson CeedCallBackend(CeedVectorSetValue(v, 1.0)); 11934d14614SJeremy L Thompson return CEED_ERROR_SUCCESS; 12034d14614SJeremy L Thompson } 12134d14614SJeremy L Thompson 1229bc66399SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 1239bc66399SJeremy L Thompson 124111870feSJeremy L Thompson // Check padded to uniform number of points per elem 125111870feSJeremy L Thompson for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 126111870feSJeremy L Thompson { 127111870feSJeremy L Thompson CeedInt num_comp, q_comp; 128111870feSJeremy L Thompson CeedSize len, len_required; 129111870feSJeremy L Thompson 130111870feSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 131111870feSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 132111870feSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 133111870feSJeremy L Thompson len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 134111870feSJeremy L Thompson CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 135111870feSJeremy L Thompson "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 136111870feSJeremy L Thompson " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 137111870feSJeremy L Thompson len, len_required); 138111870feSJeremy L Thompson } 139111870feSJeremy L Thompson 140111870feSJeremy L Thompson // Move num_points array to device 141111870feSJeremy L Thompson if (is_transpose) { 142111870feSJeremy L Thompson const CeedInt num_bytes = num_elem * sizeof(CeedInt); 143111870feSJeremy L Thompson 144111870feSJeremy L Thompson if (num_elem != data->num_elem_at_points) { 145111870feSJeremy L Thompson data->num_elem_at_points = num_elem; 146111870feSJeremy L Thompson 147111870feSJeremy L Thompson if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 148111870feSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 149111870feSJeremy L Thompson CeedCallBackend(CeedFree(&data->h_points_per_elem)); 150111870feSJeremy L Thompson CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 151111870feSJeremy L Thompson } 1529e511c80SJeremy L Thompson if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 153111870feSJeremy L Thompson memcpy(data->h_points_per_elem, num_points, num_bytes); 154111870feSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 155111870feSJeremy L Thompson } 156111870feSJeremy L Thompson } 157111870feSJeremy L Thompson 15834d14614SJeremy L Thompson // Build kernels if needed 15934d14614SJeremy L Thompson if (data->num_points != max_num_points) { 16034d14614SJeremy L Thompson CeedInt P_1d; 16134d14614SJeremy L Thompson 16234d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 16334d14614SJeremy L Thompson data->num_points = max_num_points; 16434d14614SJeremy L Thompson 16534d14614SJeremy L Thompson // -- Create interp matrix to Chebyshev coefficients 16634d14614SJeremy L Thompson if (!data->d_chebyshev_interp_1d) { 16734d14614SJeremy L Thompson CeedSize interp_bytes; 16834d14614SJeremy L Thompson CeedScalar *chebyshev_interp_1d; 16934d14614SJeremy L Thompson 17034d14614SJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 17134d14614SJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 1725a5594ffSJeremy L Thompson CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 17334d14614SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 17434d14614SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 17534d14614SJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 17634d14614SJeremy L Thompson } 17734d14614SJeremy L Thompson 17834d14614SJeremy L Thompson // -- Compile kernels 1799c25dd66SJeremy L Thompson const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n"; 18034d14614SJeremy L Thompson CeedInt num_comp; 18134d14614SJeremy L Thompson 18234d14614SJeremy L Thompson if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 18334d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 18434d14614SJeremy L Thompson CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 185f7c9815fSJeremy L Thompson Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 18634d14614SJeremy L Thompson "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 187f7c9815fSJeremy L Thompson max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 18834d14614SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 18981ae6159SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 19034d14614SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 19181ae6159SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 19234d14614SJeremy L Thompson } 19334d14614SJeremy L Thompson 19434d14614SJeremy L Thompson // Get read/write access to u, v 19534d14614SJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 19634d14614SJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 19734d14614SJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 198*9dafd6dfSZach Atkins if (apply_add) { 199*9dafd6dfSZach Atkins CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 200*9dafd6dfSZach Atkins } else { 20134d14614SJeremy L Thompson // Clear v for transpose operation 202*9dafd6dfSZach Atkins if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); 203*9dafd6dfSZach Atkins CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 20434d14614SJeremy L Thompson } 20534d14614SJeremy L Thompson 20634d14614SJeremy L Thompson // Basis action 20734d14614SJeremy L Thompson switch (eval_mode) { 20834d14614SJeremy L Thompson case CEED_EVAL_INTERP: { 20981ae6159SJeremy L Thompson void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 21034d14614SJeremy L Thompson const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 21134d14614SJeremy L Thompson 21281ae6159SJeremy L Thompson CeedCallBackend( 21381ae6159SJeremy L Thompson CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); 21434d14614SJeremy L Thompson } break; 21534d14614SJeremy L Thompson case CEED_EVAL_GRAD: { 21681ae6159SJeremy L Thompson void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 2172d10e82cSJeremy L Thompson const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 21834d14614SJeremy L Thompson 21981ae6159SJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); 22034d14614SJeremy L Thompson } break; 22134d14614SJeremy L Thompson case CEED_EVAL_WEIGHT: 22234d14614SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 22334d14614SJeremy L Thompson break; 22434d14614SJeremy L Thompson // LCOV_EXCL_START 22534d14614SJeremy L Thompson case CEED_EVAL_DIV: 22634d14614SJeremy L Thompson case CEED_EVAL_CURL: 22734d14614SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 22834d14614SJeremy L Thompson // LCOV_EXCL_STOP 22934d14614SJeremy L Thompson } 23034d14614SJeremy L Thompson 23134d14614SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 23234d14614SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 23334d14614SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 23434d14614SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 23534d14614SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 2369bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 23734d14614SJeremy L Thompson return CEED_ERROR_SUCCESS; 23834d14614SJeremy L Thompson } 23934d14614SJeremy L Thompson 240db2becc9SJeremy L Thompson static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 241db2becc9SJeremy L Thompson CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 242db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 243db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 244db2becc9SJeremy L Thompson } 245db2becc9SJeremy L Thompson 246db2becc9SJeremy L Thompson static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 247db2becc9SJeremy L Thompson CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 248db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 249db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 250db2becc9SJeremy L Thompson } 251db2becc9SJeremy L Thompson 25234d14614SJeremy L Thompson //------------------------------------------------------------------------------ 2530d0321e0SJeremy L Thompson // Basis apply - non-tensor 2540d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 255db2becc9SJeremy L Thompson static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 256db2becc9SJeremy L Thompson CeedVector u, CeedVector v) { 2570d0321e0SJeremy L Thompson Ceed ceed; 258437930d1SJeremy L Thompson CeedInt num_nodes, num_qpts; 2597bbbfca3SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 260d075f50bSSebastian Grimberg const int elems_per_block = 1; 261d075f50bSSebastian Grimberg const int grid = CeedDivUpInt(num_elem, elems_per_block); 2620d0321e0SJeremy L Thompson const CeedScalar *d_u; 2630d0321e0SJeremy L Thompson CeedScalar *d_v; 264ca735530SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 265ca735530SJeremy L Thompson 266ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 267ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 268ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 269ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 270ca735530SJeremy L Thompson 2719ea2cfd9SJeremy L Thompson // Get read/write access to u, v 2729ea2cfd9SJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 2739ea2cfd9SJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 274*9dafd6dfSZach Atkins if (apply_add) { 275*9dafd6dfSZach Atkins CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 276*9dafd6dfSZach Atkins } else { 2770d0321e0SJeremy L Thompson // Clear v for transpose operation 278*9dafd6dfSZach Atkins if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0)); 279*9dafd6dfSZach Atkins CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 2800d0321e0SJeremy L Thompson } 2810d0321e0SJeremy L Thompson 2820d0321e0SJeremy L Thompson // Apply basis operation 283437930d1SJeremy L Thompson switch (eval_mode) { 2840d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 285d075f50bSSebastian Grimberg void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 2867bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 287b2165e7aSSebastian Grimberg 2887bbbfca3SJeremy L Thompson if (is_transpose) { 289d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 290d075f50bSSebastian Grimberg } else { 291b2165e7aSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 292d075f50bSSebastian Grimberg } 2930d0321e0SJeremy L Thompson } break; 2940d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 295d075f50bSSebastian Grimberg void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 2967bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 297b2165e7aSSebastian Grimberg 2987bbbfca3SJeremy L Thompson if (is_transpose) { 299d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 300d075f50bSSebastian Grimberg } else { 301d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 302d075f50bSSebastian Grimberg } 303d075f50bSSebastian Grimberg } break; 304d075f50bSSebastian Grimberg case CEED_EVAL_DIV: { 305d075f50bSSebastian Grimberg void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 3067bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 307d075f50bSSebastian Grimberg 3087bbbfca3SJeremy L Thompson if (is_transpose) { 309d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 310d075f50bSSebastian Grimberg } else { 311d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 312d075f50bSSebastian Grimberg } 313d075f50bSSebastian Grimberg } break; 314d075f50bSSebastian Grimberg case CEED_EVAL_CURL: { 315d075f50bSSebastian Grimberg void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 3167bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 317d075f50bSSebastian Grimberg 3187bbbfca3SJeremy L Thompson if (is_transpose) { 319d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 320d075f50bSSebastian Grimberg } else { 321d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 322d075f50bSSebastian Grimberg } 3230d0321e0SJeremy L Thompson } break; 3240d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 325097cc795SJames Wright CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 326437930d1SJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 327b2165e7aSSebastian Grimberg 328eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 3290d0321e0SJeremy L Thompson } break; 3309ea2cfd9SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 3319ea2cfd9SJeremy L Thompson break; 3320d0321e0SJeremy L Thompson } 3330d0321e0SJeremy L Thompson 3349ea2cfd9SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 3352b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 3369ea2cfd9SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 3379ea2cfd9SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 3389bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 3390d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3400d0321e0SJeremy L Thompson } 3410d0321e0SJeremy L Thompson 342db2becc9SJeremy L Thompson static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 343db2becc9SJeremy L Thompson CeedVector v) { 344db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 345db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 346db2becc9SJeremy L Thompson } 347db2becc9SJeremy L Thompson 348db2becc9SJeremy L Thompson static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 349db2becc9SJeremy L Thompson CeedVector v) { 350db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 351db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 352db2becc9SJeremy L Thompson } 353db2becc9SJeremy L Thompson 3540d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3550d0321e0SJeremy L Thompson // Destroy tensor basis 3560d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3570d0321e0SJeremy L Thompson static int CeedBasisDestroy_Cuda(CeedBasis basis) { 3580d0321e0SJeremy L Thompson Ceed ceed; 3590d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 360ca735530SJeremy L Thompson 361ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 3622b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 3632b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(data->module)); 36434d14614SJeremy L Thompson if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 365097cc795SJames Wright if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 366111870feSJeremy L Thompson CeedCallBackend(CeedFree(&data->h_points_per_elem)); 367111870feSJeremy L Thompson if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 3682b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 3692b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 37034d14614SJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 3712b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&data)); 3729bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 3730d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3740d0321e0SJeremy L Thompson } 3750d0321e0SJeremy L Thompson 3760d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3770d0321e0SJeremy L Thompson // Destroy non-tensor basis 3780d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3790d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 3800d0321e0SJeremy L Thompson Ceed ceed; 3810d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 382ca735530SJeremy L Thompson 383ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 3842b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 3852b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(data->module)); 386097cc795SJames Wright if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 3872b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_interp)); 3882b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_grad)); 389d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaFree(data->d_div)); 390d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaFree(data->d_curl)); 3912b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&data)); 3929bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 3930d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3940d0321e0SJeremy L Thompson } 3950d0321e0SJeremy L Thompson 3960d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3970d0321e0SJeremy L Thompson // Create tensor 3980d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3992b730f8bSJeremy L Thompson int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 4002b730f8bSJeremy L Thompson const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 4010d0321e0SJeremy L Thompson Ceed ceed; 402ca735530SJeremy L Thompson CeedInt num_comp; 403ca735530SJeremy L Thompson const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 404ca735530SJeremy L Thompson const CeedInt interp_bytes = q_bytes * P_1d; 4050d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 406ca735530SJeremy L Thompson 407ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 4082b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 4090d0321e0SJeremy L Thompson 4100d0321e0SJeremy L Thompson // Copy data to GPU 411097cc795SJames Wright if (q_weight_1d) { 4122b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 4132b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 414097cc795SJames Wright } 4152b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 4162b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 4172b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 4182b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 4190d0321e0SJeremy L Thompson 420ecc88aebSJeremy L Thompson // Compile basis kernels 4219c25dd66SJeremy L Thompson const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor.h>\n"; 4229c25dd66SJeremy L Thompson 4232b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 424eb7e6cafSJeremy L Thompson CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 425f7c9815fSJeremy L Thompson Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 4262b730f8bSJeremy L Thompson "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 427eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 428eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 429eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 430437930d1SJeremy L Thompson 4312b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 4320d0321e0SJeremy L Thompson 433d075f50bSSebastian Grimberg // Register backend functions 4342b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 435db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); 43634d14614SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); 437db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); 4382b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 4399bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 4400d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 4410d0321e0SJeremy L Thompson } 4420d0321e0SJeremy L Thompson 4430d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 444d075f50bSSebastian Grimberg // Create non-tensor H^1 4450d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 4462b730f8bSJeremy L Thompson int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 44751475c7cSJeremy L Thompson const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 4480d0321e0SJeremy L Thompson Ceed ceed; 449d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_grad; 450ca735530SJeremy L Thompson const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 4510d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 452ca735530SJeremy L Thompson 453ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 4542b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 4550d0321e0SJeremy L Thompson 4560d0321e0SJeremy L Thompson // Copy basis data to GPU 457d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 458d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 459097cc795SJames Wright if (q_weight) { 4602b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 4612b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 462097cc795SJames Wright } 463d075f50bSSebastian Grimberg if (interp) { 464d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 465d075f50bSSebastian Grimberg 4662b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 4672b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 468d075f50bSSebastian Grimberg } 469d075f50bSSebastian Grimberg if (grad) { 470d075f50bSSebastian Grimberg const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 471d075f50bSSebastian Grimberg 4722b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 4732b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 474d075f50bSSebastian Grimberg } 4750d0321e0SJeremy L Thompson 4760d0321e0SJeremy L Thompson // Compile basis kernels 4779c25dd66SJeremy L Thompson const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 4789c25dd66SJeremy L Thompson 4792b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 480d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 481d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 482d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 483d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 484d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 485d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 486d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 487d075f50bSSebastian Grimberg 488d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, data)); 489d075f50bSSebastian Grimberg 490d075f50bSSebastian Grimberg // Register backend functions 491d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 492db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 493d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 4949bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 495d075f50bSSebastian Grimberg return CEED_ERROR_SUCCESS; 496d075f50bSSebastian Grimberg } 497d075f50bSSebastian Grimberg 498d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 499d075f50bSSebastian Grimberg // Create non-tensor H(div) 500d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 501d075f50bSSebastian Grimberg int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 502d075f50bSSebastian Grimberg const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 503d075f50bSSebastian Grimberg Ceed ceed; 504d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_div; 505d075f50bSSebastian Grimberg const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 506d075f50bSSebastian Grimberg CeedBasisNonTensor_Cuda *data; 507d075f50bSSebastian Grimberg 508d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 509d075f50bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &data)); 510d075f50bSSebastian Grimberg 511d075f50bSSebastian Grimberg // Copy basis data to GPU 512d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 513d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 514097cc795SJames Wright if (q_weight) { 515d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 516d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 517097cc795SJames Wright } 518d075f50bSSebastian Grimberg if (interp) { 519d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 520d075f50bSSebastian Grimberg 521d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 522d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 523d075f50bSSebastian Grimberg } 524d075f50bSSebastian Grimberg if (div) { 525d075f50bSSebastian Grimberg const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 526d075f50bSSebastian Grimberg 527d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes)); 528d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice)); 529d075f50bSSebastian Grimberg } 530d075f50bSSebastian Grimberg 531d075f50bSSebastian Grimberg // Compile basis kernels 5329c25dd66SJeremy L Thompson const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 5339c25dd66SJeremy L Thompson 534d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 535d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 536d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 537d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 538d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 539d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 540d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 541d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 542d075f50bSSebastian Grimberg 543d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, data)); 544d075f50bSSebastian Grimberg 545d075f50bSSebastian Grimberg // Register backend functions 546d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 547db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 548d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 5499bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 550d075f50bSSebastian Grimberg return CEED_ERROR_SUCCESS; 551d075f50bSSebastian Grimberg } 552d075f50bSSebastian Grimberg 553d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 554d075f50bSSebastian Grimberg // Create non-tensor H(curl) 555d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 556d075f50bSSebastian Grimberg int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 557d075f50bSSebastian Grimberg const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 558d075f50bSSebastian Grimberg Ceed ceed; 559d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_curl; 560d075f50bSSebastian Grimberg const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 561d075f50bSSebastian Grimberg CeedBasisNonTensor_Cuda *data; 562d075f50bSSebastian Grimberg 563d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 564d075f50bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &data)); 565d075f50bSSebastian Grimberg 566d075f50bSSebastian Grimberg // Copy basis data to GPU 567d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 568d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 569097cc795SJames Wright if (q_weight) { 570d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 571d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 572097cc795SJames Wright } 573d075f50bSSebastian Grimberg if (interp) { 574d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 575d075f50bSSebastian Grimberg 576d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 577d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 578d075f50bSSebastian Grimberg } 579d075f50bSSebastian Grimberg if (curl) { 580d075f50bSSebastian Grimberg const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 581d075f50bSSebastian Grimberg 582d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes)); 583d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice)); 584d075f50bSSebastian Grimberg } 585d075f50bSSebastian Grimberg 586d075f50bSSebastian Grimberg // Compile basis kernels 5879c25dd66SJeremy L Thompson const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n"; 5889c25dd66SJeremy L Thompson 589d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 590d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 591d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 592d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 593d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 594d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 595d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 596d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 5970d0321e0SJeremy L Thompson 5982b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 5990d0321e0SJeremy L Thompson 6000d0321e0SJeremy L Thompson // Register backend functions 6012b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 602db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 6032b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 6049bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 6050d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 6060d0321e0SJeremy L Thompson } 6072a86cc9dSSebastian Grimberg 6080d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 609