15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 90d0321e0SJeremy L Thompson #include <ceed/backend.h> 10437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 110d0321e0SJeremy L Thompson #include <cuda.h> 120d0321e0SJeremy L Thompson #include <cuda_runtime.h> 13111870feSJeremy L Thompson #include <string.h> 142b730f8bSJeremy L Thompson 1549aac155SJeremy L Thompson #include "../cuda/ceed-cuda-common.h" 160d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 172b730f8bSJeremy L Thompson #include "ceed-cuda-ref.h" 180d0321e0SJeremy L Thompson 190d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 200d0321e0SJeremy L Thompson // Basis apply - tensor 210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 22db2becc9SJeremy L Thompson static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 23db2becc9SJeremy L Thompson CeedVector u, CeedVector v) { 240d0321e0SJeremy L Thompson Ceed ceed; 25ca735530SJeremy L Thompson CeedInt Q_1d, dim; 267bbbfca3SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 27437930d1SJeremy L Thompson const int max_block_size = 32; 280d0321e0SJeremy L Thompson const CeedScalar *d_u; 290d0321e0SJeremy L Thompson CeedScalar *d_v; 30ca735530SJeremy L Thompson CeedBasis_Cuda *data; 31ca735530SJeremy L Thompson 32ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 33ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 34ca735530SJeremy L Thompson 359ea2cfd9SJeremy L Thompson // Get read/write access to u, v 366574a04fSJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 376574a04fSJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 38db2becc9SJeremy L Thompson if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 39db2becc9SJeremy L Thompson else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 400d0321e0SJeremy L Thompson 410d0321e0SJeremy L Thompson // Clear v for transpose operation 42db2becc9SJeremy L Thompson if (is_transpose && !apply_add) { 43*19a04db8SJeremy L Thompson CeedInt num_comp, q_comp, num_nodes, num_qpts; 441f9221feSJeremy L Thompson CeedSize length; 45ca735530SJeremy L Thompson 46*19a04db8SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 47*19a04db8SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 48*19a04db8SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 49*19a04db8SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 50*19a04db8SJeremy L Thompson length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 512b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 520d0321e0SJeremy L Thompson } 532b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 542b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 550d0321e0SJeremy L Thompson 560d0321e0SJeremy L Thompson // Basis action 57437930d1SJeremy L Thompson switch (eval_mode) { 580d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 597bbbfca3SJeremy L Thompson void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 60b2165e7aSSebastian Grimberg const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 610d0321e0SJeremy L Thompson 62eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args)); 630d0321e0SJeremy L Thompson } break; 640d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 657bbbfca3SJeremy L Thompson void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 66b2165e7aSSebastian Grimberg const CeedInt block_size = max_block_size; 670d0321e0SJeremy L Thompson 68eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); 690d0321e0SJeremy L Thompson } break; 700d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 71097cc795SJames Wright CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 72437930d1SJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 73b2165e7aSSebastian Grimberg const int block_size_x = Q_1d; 74b2165e7aSSebastian Grimberg const int block_size_y = dim >= 2 ? Q_1d : 1; 75b2165e7aSSebastian Grimberg 76b2165e7aSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 770d0321e0SJeremy L Thompson } break; 789ea2cfd9SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 799ea2cfd9SJeremy L Thompson break; 800d0321e0SJeremy L Thompson // LCOV_EXCL_START 810d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 820d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 83bcbe1c99SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 840d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 850d0321e0SJeremy L Thompson } 860d0321e0SJeremy L Thompson 879ea2cfd9SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 882b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 899ea2cfd9SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 909ea2cfd9SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 910d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 920d0321e0SJeremy L Thompson } 930d0321e0SJeremy L Thompson 94db2becc9SJeremy L Thompson static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 95db2becc9SJeremy L Thompson CeedVector v) { 96db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 97db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 98db2becc9SJeremy L Thompson } 99db2becc9SJeremy L Thompson 100db2becc9SJeremy L Thompson static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 101db2becc9SJeremy L Thompson CeedVector v) { 102db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 103db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 104db2becc9SJeremy L Thompson } 105db2becc9SJeremy L Thompson 1060d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 10734d14614SJeremy L Thompson // Basis apply - tensor AtPoints 10834d14614SJeremy L Thompson //------------------------------------------------------------------------------ 109db2becc9SJeremy L Thompson static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 110db2becc9SJeremy L Thompson CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 11134d14614SJeremy L Thompson Ceed ceed; 11234d14614SJeremy L Thompson CeedInt Q_1d, dim, max_num_points = num_points[0]; 11334d14614SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 11434d14614SJeremy L Thompson const int max_block_size = 32; 11534d14614SJeremy L Thompson const CeedScalar *d_x, *d_u; 11634d14614SJeremy L Thompson CeedScalar *d_v; 11734d14614SJeremy L Thompson CeedBasis_Cuda *data; 11834d14614SJeremy L Thompson 11934d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 12034d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 12134d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 12234d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 12334d14614SJeremy L Thompson 12434d14614SJeremy L Thompson // Weight handled separately 12534d14614SJeremy L Thompson if (eval_mode == CEED_EVAL_WEIGHT) { 1265a5594ffSJeremy L Thompson CeedCallBackend(CeedVectorSetValue(v, 1.0)); 12734d14614SJeremy L Thompson return CEED_ERROR_SUCCESS; 12834d14614SJeremy L Thompson } 12934d14614SJeremy L Thompson 130111870feSJeremy L Thompson // Check padded to uniform number of points per elem 131111870feSJeremy L Thompson for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 132111870feSJeremy L Thompson { 133111870feSJeremy L Thompson CeedInt num_comp, q_comp; 134111870feSJeremy L Thompson CeedSize len, len_required; 135111870feSJeremy L Thompson 136111870feSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 137111870feSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 138111870feSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 139111870feSJeremy L Thompson len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 140111870feSJeremy L Thompson CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 141111870feSJeremy L Thompson "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 142111870feSJeremy L Thompson " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 143111870feSJeremy L Thompson len, len_required); 144111870feSJeremy L Thompson } 145111870feSJeremy L Thompson 146111870feSJeremy L Thompson // Move num_points array to device 147111870feSJeremy L Thompson if (is_transpose) { 148111870feSJeremy L Thompson const CeedInt num_bytes = num_elem * sizeof(CeedInt); 149111870feSJeremy L Thompson 150111870feSJeremy L Thompson if (num_elem != data->num_elem_at_points) { 151111870feSJeremy L Thompson data->num_elem_at_points = num_elem; 152111870feSJeremy L Thompson 153111870feSJeremy L Thompson if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 154111870feSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); 155111870feSJeremy L Thompson CeedCallBackend(CeedFree(&data->h_points_per_elem)); 156111870feSJeremy L Thompson CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 157111870feSJeremy L Thompson } 1589e511c80SJeremy L Thompson if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 159111870feSJeremy L Thompson memcpy(data->h_points_per_elem, num_points, num_bytes); 160111870feSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); 161111870feSJeremy L Thompson } 162111870feSJeremy L Thompson } 163111870feSJeremy L Thompson 16434d14614SJeremy L Thompson // Build kernels if needed 16534d14614SJeremy L Thompson if (data->num_points != max_num_points) { 16634d14614SJeremy L Thompson CeedInt P_1d; 16734d14614SJeremy L Thompson 16834d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 16934d14614SJeremy L Thompson data->num_points = max_num_points; 17034d14614SJeremy L Thompson 17134d14614SJeremy L Thompson // -- Create interp matrix to Chebyshev coefficients 17234d14614SJeremy L Thompson if (!data->d_chebyshev_interp_1d) { 17334d14614SJeremy L Thompson CeedSize interp_bytes; 17434d14614SJeremy L Thompson CeedScalar *chebyshev_interp_1d; 17534d14614SJeremy L Thompson 17634d14614SJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 17734d14614SJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 1785a5594ffSJeremy L Thompson CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 17934d14614SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 18034d14614SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 18134d14614SJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 18234d14614SJeremy L Thompson } 18334d14614SJeremy L Thompson 18434d14614SJeremy L Thompson // -- Compile kernels 18534d14614SJeremy L Thompson char *basis_kernel_source; 18634d14614SJeremy L Thompson const char *basis_kernel_path; 18734d14614SJeremy L Thompson CeedInt num_comp; 18834d14614SJeremy L Thompson 18934d14614SJeremy L Thompson if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 19034d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 19134d14614SJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path)); 19234d14614SJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 19334d14614SJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 19434d14614SJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 19534d14614SJeremy L Thompson CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 196f7c9815fSJeremy L Thompson Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 19734d14614SJeremy L Thompson "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 198f7c9815fSJeremy L Thompson max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); 19934d14614SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 20034d14614SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 20134d14614SJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_path)); 20234d14614SJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 20334d14614SJeremy L Thompson } 20434d14614SJeremy L Thompson 20534d14614SJeremy L Thompson // Get read/write access to u, v 20634d14614SJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 20734d14614SJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 20834d14614SJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 209db2becc9SJeremy L Thompson if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 210db2becc9SJeremy L Thompson else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 21134d14614SJeremy L Thompson 21234d14614SJeremy L Thompson // Clear v for transpose operation 213db2becc9SJeremy L Thompson if (is_transpose && !apply_add) { 214*19a04db8SJeremy L Thompson CeedInt num_comp, q_comp, num_nodes; 21534d14614SJeremy L Thompson CeedSize length; 21634d14614SJeremy L Thompson 217*19a04db8SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 218*19a04db8SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 219*19a04db8SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 220*19a04db8SJeremy L Thompson length = 221*19a04db8SJeremy L Thompson (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp)); 22234d14614SJeremy L Thompson CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 22334d14614SJeremy L Thompson } 22434d14614SJeremy L Thompson 22534d14614SJeremy L Thompson // Basis action 22634d14614SJeremy L Thompson switch (eval_mode) { 22734d14614SJeremy L Thompson case CEED_EVAL_INTERP: { 228111870feSJeremy L Thompson void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 22934d14614SJeremy L Thompson const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 23034d14614SJeremy L Thompson 23134d14614SJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); 23234d14614SJeremy L Thompson } break; 23334d14614SJeremy L Thompson case CEED_EVAL_GRAD: { 234111870feSJeremy L Thompson void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 2352d10e82cSJeremy L Thompson const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 23634d14614SJeremy L Thompson 23734d14614SJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); 23834d14614SJeremy L Thompson } break; 23934d14614SJeremy L Thompson case CEED_EVAL_WEIGHT: 24034d14614SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 24134d14614SJeremy L Thompson break; 24234d14614SJeremy L Thompson // LCOV_EXCL_START 24334d14614SJeremy L Thompson case CEED_EVAL_DIV: 24434d14614SJeremy L Thompson case CEED_EVAL_CURL: 24534d14614SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 24634d14614SJeremy L Thompson // LCOV_EXCL_STOP 24734d14614SJeremy L Thompson } 24834d14614SJeremy L Thompson 24934d14614SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 25034d14614SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 25134d14614SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 25234d14614SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 25334d14614SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 25434d14614SJeremy L Thompson return CEED_ERROR_SUCCESS; 25534d14614SJeremy L Thompson } 25634d14614SJeremy L Thompson 257db2becc9SJeremy L Thompson static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 258db2becc9SJeremy L Thompson CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 259db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 260db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 261db2becc9SJeremy L Thompson } 262db2becc9SJeremy L Thompson 263db2becc9SJeremy L Thompson static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 264db2becc9SJeremy L Thompson CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 265db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 266db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 267db2becc9SJeremy L Thompson } 268db2becc9SJeremy L Thompson 26934d14614SJeremy L Thompson //------------------------------------------------------------------------------ 2700d0321e0SJeremy L Thompson // Basis apply - non-tensor 2710d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 272db2becc9SJeremy L Thompson static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 273db2becc9SJeremy L Thompson CeedVector u, CeedVector v) { 2740d0321e0SJeremy L Thompson Ceed ceed; 275437930d1SJeremy L Thompson CeedInt num_nodes, num_qpts; 2767bbbfca3SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 277d075f50bSSebastian Grimberg const int elems_per_block = 1; 278d075f50bSSebastian Grimberg const int grid = CeedDivUpInt(num_elem, elems_per_block); 2790d0321e0SJeremy L Thompson const CeedScalar *d_u; 2800d0321e0SJeremy L Thompson CeedScalar *d_v; 281ca735530SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 282ca735530SJeremy L Thompson 283ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 284ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 285ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 286ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 287ca735530SJeremy L Thompson 2889ea2cfd9SJeremy L Thompson // Get read/write access to u, v 2899ea2cfd9SJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 2909ea2cfd9SJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 291db2becc9SJeremy L Thompson if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 292db2becc9SJeremy L Thompson else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 2930d0321e0SJeremy L Thompson 2940d0321e0SJeremy L Thompson // Clear v for transpose operation 295db2becc9SJeremy L Thompson if (is_transpose && !apply_add) { 296*19a04db8SJeremy L Thompson CeedInt num_comp, q_comp; 2971f9221feSJeremy L Thompson CeedSize length; 298ca735530SJeremy L Thompson 299*19a04db8SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 300*19a04db8SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 301*19a04db8SJeremy L Thompson length = (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)num_qpts * (CeedSize)q_comp)); 3022b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 3030d0321e0SJeremy L Thompson } 3040d0321e0SJeremy L Thompson 3050d0321e0SJeremy L Thompson // Apply basis operation 306437930d1SJeremy L Thompson switch (eval_mode) { 3070d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 308d075f50bSSebastian Grimberg void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 3097bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 310b2165e7aSSebastian Grimberg 3117bbbfca3SJeremy L Thompson if (is_transpose) { 312d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 313d075f50bSSebastian Grimberg } else { 314b2165e7aSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 315d075f50bSSebastian Grimberg } 3160d0321e0SJeremy L Thompson } break; 3170d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 318d075f50bSSebastian Grimberg void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 3197bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 320b2165e7aSSebastian Grimberg 3217bbbfca3SJeremy L Thompson if (is_transpose) { 322d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 323d075f50bSSebastian Grimberg } else { 324d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 325d075f50bSSebastian Grimberg } 326d075f50bSSebastian Grimberg } break; 327d075f50bSSebastian Grimberg case CEED_EVAL_DIV: { 328d075f50bSSebastian Grimberg void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 3297bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 330d075f50bSSebastian Grimberg 3317bbbfca3SJeremy L Thompson if (is_transpose) { 332d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 333d075f50bSSebastian Grimberg } else { 334d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 335d075f50bSSebastian Grimberg } 336d075f50bSSebastian Grimberg } break; 337d075f50bSSebastian Grimberg case CEED_EVAL_CURL: { 338d075f50bSSebastian Grimberg void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 3397bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 340d075f50bSSebastian Grimberg 3417bbbfca3SJeremy L Thompson if (is_transpose) { 342d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 343d075f50bSSebastian Grimberg } else { 344d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 345d075f50bSSebastian Grimberg } 3460d0321e0SJeremy L Thompson } break; 3470d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 348097cc795SJames Wright CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 349437930d1SJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 350b2165e7aSSebastian Grimberg 351eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 3520d0321e0SJeremy L Thompson } break; 3539ea2cfd9SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 3549ea2cfd9SJeremy L Thompson break; 3550d0321e0SJeremy L Thompson } 3560d0321e0SJeremy L Thompson 3579ea2cfd9SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 3582b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 3599ea2cfd9SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 3609ea2cfd9SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 3610d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3620d0321e0SJeremy L Thompson } 3630d0321e0SJeremy L Thompson 364db2becc9SJeremy L Thompson static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 365db2becc9SJeremy L Thompson CeedVector v) { 366db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v)); 367db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 368db2becc9SJeremy L Thompson } 369db2becc9SJeremy L Thompson 370db2becc9SJeremy L Thompson static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 371db2becc9SJeremy L Thompson CeedVector v) { 372db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v)); 373db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 374db2becc9SJeremy L Thompson } 375db2becc9SJeremy L Thompson 3760d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3770d0321e0SJeremy L Thompson // Destroy tensor basis 3780d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3790d0321e0SJeremy L Thompson static int CeedBasisDestroy_Cuda(CeedBasis basis) { 3800d0321e0SJeremy L Thompson Ceed ceed; 3810d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 382ca735530SJeremy L Thompson 383ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 3842b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 3852b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(data->module)); 38634d14614SJeremy L Thompson if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 387097cc795SJames Wright if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 388111870feSJeremy L Thompson CeedCallBackend(CeedFree(&data->h_points_per_elem)); 389111870feSJeremy L Thompson if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); 3902b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 3912b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 39234d14614SJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 3932b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&data)); 3940d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3950d0321e0SJeremy L Thompson } 3960d0321e0SJeremy L Thompson 3970d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3980d0321e0SJeremy L Thompson // Destroy non-tensor basis 3990d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 4000d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 4010d0321e0SJeremy L Thompson Ceed ceed; 4020d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 403ca735530SJeremy L Thompson 404ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 4052b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 4062b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(data->module)); 407097cc795SJames Wright if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 4082b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_interp)); 4092b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_grad)); 410d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaFree(data->d_div)); 411d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaFree(data->d_curl)); 4122b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&data)); 4130d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 4140d0321e0SJeremy L Thompson } 4150d0321e0SJeremy L Thompson 4160d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 4170d0321e0SJeremy L Thompson // Create tensor 4180d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 4192b730f8bSJeremy L Thompson int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 4202b730f8bSJeremy L Thompson const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 4210d0321e0SJeremy L Thompson Ceed ceed; 42222070f95SJeremy L Thompson char *basis_kernel_source; 42322070f95SJeremy L Thompson const char *basis_kernel_path; 424ca735530SJeremy L Thompson CeedInt num_comp; 425ca735530SJeremy L Thompson const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 426ca735530SJeremy L Thompson const CeedInt interp_bytes = q_bytes * P_1d; 4270d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 428ca735530SJeremy L Thompson 429ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 4302b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 4310d0321e0SJeremy L Thompson 4320d0321e0SJeremy L Thompson // Copy data to GPU 433097cc795SJames Wright if (q_weight_1d) { 4342b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 4352b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 436097cc795SJames Wright } 4372b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 4382b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 4392b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 4402b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 4410d0321e0SJeremy L Thompson 442ecc88aebSJeremy L Thompson // Compile basis kernels 4432b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 4442b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path)); 44523d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 4462b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 44723d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 448eb7e6cafSJeremy L Thompson CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 449f7c9815fSJeremy L Thompson Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 4502b730f8bSJeremy L Thompson "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 451eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 452eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 453eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 4542b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_path)); 4552b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 456437930d1SJeremy L Thompson 4572b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 4580d0321e0SJeremy L Thompson 459d075f50bSSebastian Grimberg // Register backend functions 4602b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 461db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); 46234d14614SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); 463db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); 4642b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 4650d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 4660d0321e0SJeremy L Thompson } 4670d0321e0SJeremy L Thompson 4680d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 469d075f50bSSebastian Grimberg // Create non-tensor H^1 4700d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 4712b730f8bSJeremy L Thompson int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 47251475c7cSJeremy L Thompson const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 4730d0321e0SJeremy L Thompson Ceed ceed; 47422070f95SJeremy L Thompson char *basis_kernel_source; 47522070f95SJeremy L Thompson const char *basis_kernel_path; 476d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_grad; 477ca735530SJeremy L Thompson const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 4780d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 479ca735530SJeremy L Thompson 480ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 4812b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 4820d0321e0SJeremy L Thompson 4830d0321e0SJeremy L Thompson // Copy basis data to GPU 484d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 485d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 486097cc795SJames Wright if (q_weight) { 4872b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 4882b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 489097cc795SJames Wright } 490d075f50bSSebastian Grimberg if (interp) { 491d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 492d075f50bSSebastian Grimberg 4932b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 4942b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 495d075f50bSSebastian Grimberg } 496d075f50bSSebastian Grimberg if (grad) { 497d075f50bSSebastian Grimberg const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 498d075f50bSSebastian Grimberg 4992b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 5002b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 501d075f50bSSebastian Grimberg } 5020d0321e0SJeremy L Thompson 5030d0321e0SJeremy L Thompson // Compile basis kernels 5042b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 5052b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 50623d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 5072b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 50823d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 509d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 510d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 511d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 512d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 513d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 514d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 515d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 516d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_path)); 517d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 518d075f50bSSebastian Grimberg 519d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, data)); 520d075f50bSSebastian Grimberg 521d075f50bSSebastian Grimberg // Register backend functions 522d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 523db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 524d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 525d075f50bSSebastian Grimberg return CEED_ERROR_SUCCESS; 526d075f50bSSebastian Grimberg } 527d075f50bSSebastian Grimberg 528d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 529d075f50bSSebastian Grimberg // Create non-tensor H(div) 530d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 531d075f50bSSebastian Grimberg int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 532d075f50bSSebastian Grimberg const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 533d075f50bSSebastian Grimberg Ceed ceed; 53422070f95SJeremy L Thompson char *basis_kernel_source; 53522070f95SJeremy L Thompson const char *basis_kernel_path; 536d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_div; 537d075f50bSSebastian Grimberg const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 538d075f50bSSebastian Grimberg CeedBasisNonTensor_Cuda *data; 539d075f50bSSebastian Grimberg 540d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 541d075f50bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &data)); 542d075f50bSSebastian Grimberg 543d075f50bSSebastian Grimberg // Copy basis data to GPU 544d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 545d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 546097cc795SJames Wright if (q_weight) { 547d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 548d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 549097cc795SJames Wright } 550d075f50bSSebastian Grimberg if (interp) { 551d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 552d075f50bSSebastian Grimberg 553d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 554d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 555d075f50bSSebastian Grimberg } 556d075f50bSSebastian Grimberg if (div) { 557d075f50bSSebastian Grimberg const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 558d075f50bSSebastian Grimberg 559d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes)); 560d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice)); 561d075f50bSSebastian Grimberg } 562d075f50bSSebastian Grimberg 563d075f50bSSebastian Grimberg // Compile basis kernels 564d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 565d075f50bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 566d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 567d075f50bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 568d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 569d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 570d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 571d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 572d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 573d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 574d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 575d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 576d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_path)); 577d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 578d075f50bSSebastian Grimberg 579d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, data)); 580d075f50bSSebastian Grimberg 581d075f50bSSebastian Grimberg // Register backend functions 582d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 583db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 584d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 585d075f50bSSebastian Grimberg return CEED_ERROR_SUCCESS; 586d075f50bSSebastian Grimberg } 587d075f50bSSebastian Grimberg 588d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 589d075f50bSSebastian Grimberg // Create non-tensor H(curl) 590d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 591d075f50bSSebastian Grimberg int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 592d075f50bSSebastian Grimberg const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 593d075f50bSSebastian Grimberg Ceed ceed; 59422070f95SJeremy L Thompson char *basis_kernel_source; 59522070f95SJeremy L Thompson const char *basis_kernel_path; 596d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_curl; 597d075f50bSSebastian Grimberg const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 598d075f50bSSebastian Grimberg CeedBasisNonTensor_Cuda *data; 599d075f50bSSebastian Grimberg 600d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 601d075f50bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &data)); 602d075f50bSSebastian Grimberg 603d075f50bSSebastian Grimberg // Copy basis data to GPU 604d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 605d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 606097cc795SJames Wright if (q_weight) { 607d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 608d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 609097cc795SJames Wright } 610d075f50bSSebastian Grimberg if (interp) { 611d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 612d075f50bSSebastian Grimberg 613d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 614d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 615d075f50bSSebastian Grimberg } 616d075f50bSSebastian Grimberg if (curl) { 617d075f50bSSebastian Grimberg const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 618d075f50bSSebastian Grimberg 619d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes)); 620d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice)); 621d075f50bSSebastian Grimberg } 622d075f50bSSebastian Grimberg 623d075f50bSSebastian Grimberg // Compile basis kernels 624d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 625d075f50bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 626d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 627d075f50bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 628d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 629d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 630d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 631d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 632d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 633d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 634d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 635d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 6362b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_path)); 6372b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 6380d0321e0SJeremy L Thompson 6392b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 6400d0321e0SJeremy L Thompson 6410d0321e0SJeremy L Thompson // Register backend functions 6422b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 643db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda)); 6442b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 6450d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 6460d0321e0SJeremy L Thompson } 6472a86cc9dSSebastian Grimberg 6480d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 649