15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 90d0321e0SJeremy L Thompson #include <ceed/backend.h> 10437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 110d0321e0SJeremy L Thompson #include <cuda.h> 120d0321e0SJeremy L Thompson #include <cuda_runtime.h> 132b730f8bSJeremy L Thompson 1449aac155SJeremy L Thompson #include "../cuda/ceed-cuda-common.h" 150d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 162b730f8bSJeremy L Thompson #include "ceed-cuda-ref.h" 170d0321e0SJeremy L Thompson 180d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 190d0321e0SJeremy L Thompson // Basis apply - tensor 200d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 212b730f8bSJeremy L Thompson int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 220d0321e0SJeremy L Thompson Ceed ceed; 23ca735530SJeremy L Thompson CeedInt Q_1d, dim; 247bbbfca3SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 25437930d1SJeremy L Thompson const int max_block_size = 32; 260d0321e0SJeremy L Thompson const CeedScalar *d_u; 270d0321e0SJeremy L Thompson CeedScalar *d_v; 28ca735530SJeremy L Thompson CeedBasis_Cuda *data; 29ca735530SJeremy L Thompson 30ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 31ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 32ca735530SJeremy L Thompson 339ea2cfd9SJeremy L Thompson // Get read/write access to u, v 346574a04fSJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 356574a04fSJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 362b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 370d0321e0SJeremy L Thompson 380d0321e0SJeremy L Thompson // Clear v for transpose operation 397bbbfca3SJeremy L Thompson if (is_transpose) { 401f9221feSJeremy L Thompson CeedSize length; 41ca735530SJeremy L Thompson 422b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(v, &length)); 432b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 440d0321e0SJeremy L Thompson } 452b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 462b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 470d0321e0SJeremy L Thompson 480d0321e0SJeremy L Thompson // Basis action 49437930d1SJeremy L Thompson switch (eval_mode) { 500d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 517bbbfca3SJeremy L Thompson void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v}; 52b2165e7aSSebastian Grimberg const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 530d0321e0SJeremy L Thompson 54eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args)); 550d0321e0SJeremy L Thompson } break; 560d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 577bbbfca3SJeremy L Thompson void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 58b2165e7aSSebastian Grimberg const CeedInt block_size = max_block_size; 590d0321e0SJeremy L Thompson 60eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args)); 610d0321e0SJeremy L Thompson } break; 620d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 63097cc795SJames Wright CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 64437930d1SJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 65b2165e7aSSebastian Grimberg const int block_size_x = Q_1d; 66b2165e7aSSebastian Grimberg const int block_size_y = dim >= 2 ? Q_1d : 1; 67b2165e7aSSebastian Grimberg 68b2165e7aSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args)); 690d0321e0SJeremy L Thompson } break; 709ea2cfd9SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 719ea2cfd9SJeremy L Thompson break; 720d0321e0SJeremy L Thompson // LCOV_EXCL_START 730d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 740d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 75bcbe1c99SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 760d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 770d0321e0SJeremy L Thompson } 780d0321e0SJeremy L Thompson 799ea2cfd9SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 802b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 819ea2cfd9SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 829ea2cfd9SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 830d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 840d0321e0SJeremy L Thompson } 850d0321e0SJeremy L Thompson 860d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 87*34d14614SJeremy L Thompson // Basis apply - tensor AtPoints 88*34d14614SJeremy L Thompson //------------------------------------------------------------------------------ 89*34d14614SJeremy L Thompson int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, 90*34d14614SJeremy L Thompson CeedVector x_ref, CeedVector u, CeedVector v) { 91*34d14614SJeremy L Thompson Ceed ceed; 92*34d14614SJeremy L Thompson CeedInt Q_1d, dim, max_num_points = num_points[0]; 93*34d14614SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 94*34d14614SJeremy L Thompson const int max_block_size = 32; 95*34d14614SJeremy L Thompson const CeedScalar *d_x, *d_u; 96*34d14614SJeremy L Thompson CeedScalar *d_v; 97*34d14614SJeremy L Thompson CeedBasis_Cuda *data; 98*34d14614SJeremy L Thompson 99*34d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 100*34d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 101*34d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 102*34d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 103*34d14614SJeremy L Thompson 104*34d14614SJeremy L Thompson // Check uniform number of points per elem 105*34d14614SJeremy L Thompson for (CeedInt i = 1; i < num_elem; i++) { 106*34d14614SJeremy L Thompson CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND, 107*34d14614SJeremy L Thompson "BasisApplyAtPoints only supported for the same number of points in each element"); 108*34d14614SJeremy L Thompson } 109*34d14614SJeremy L Thompson 110*34d14614SJeremy L Thompson // Weight handled separately 111*34d14614SJeremy L Thompson if (eval_mode == CEED_EVAL_WEIGHT) { 112*34d14614SJeremy L Thompson CeedCall(CeedVectorSetValue(v, 1.0)); 113*34d14614SJeremy L Thompson return CEED_ERROR_SUCCESS; 114*34d14614SJeremy L Thompson } 115*34d14614SJeremy L Thompson 116*34d14614SJeremy L Thompson // Build kernels if needed 117*34d14614SJeremy L Thompson if (data->num_points != max_num_points) { 118*34d14614SJeremy L Thompson CeedInt P_1d; 119*34d14614SJeremy L Thompson 120*34d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 121*34d14614SJeremy L Thompson data->num_points = max_num_points; 122*34d14614SJeremy L Thompson 123*34d14614SJeremy L Thompson // -- Create interp matrix to Chebyshev coefficients 124*34d14614SJeremy L Thompson if (!data->d_chebyshev_interp_1d) { 125*34d14614SJeremy L Thompson CeedSize interp_bytes; 126*34d14614SJeremy L Thompson CeedScalar *chebyshev_interp_1d; 127*34d14614SJeremy L Thompson 128*34d14614SJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 129*34d14614SJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 130*34d14614SJeremy L Thompson CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 131*34d14614SJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 132*34d14614SJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 133*34d14614SJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 134*34d14614SJeremy L Thompson } 135*34d14614SJeremy L Thompson 136*34d14614SJeremy L Thompson // -- Compile kernels 137*34d14614SJeremy L Thompson char *basis_kernel_source; 138*34d14614SJeremy L Thompson const char *basis_kernel_path; 139*34d14614SJeremy L Thompson CeedInt num_comp; 140*34d14614SJeremy L Thompson 141*34d14614SJeremy L Thompson if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 142*34d14614SJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 143*34d14614SJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path)); 144*34d14614SJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 145*34d14614SJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 146*34d14614SJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 147*34d14614SJeremy L Thompson CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 148*34d14614SJeremy L Thompson num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 149*34d14614SJeremy L Thompson "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", 150*34d14614SJeremy L Thompson max_num_points, "POINTS_BUFF_LEN", 151*34d14614SJeremy L Thompson max_num_points * CeedIntPow(Q_1d > max_num_points ? Q_1d : max_num_points, dim - 1))); 152*34d14614SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 153*34d14614SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 154*34d14614SJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_path)); 155*34d14614SJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 156*34d14614SJeremy L Thompson } 157*34d14614SJeremy L Thompson 158*34d14614SJeremy L Thompson // Get read/write access to u, v 159*34d14614SJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 160*34d14614SJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 161*34d14614SJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 162*34d14614SJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 163*34d14614SJeremy L Thompson 164*34d14614SJeremy L Thompson // Clear v for transpose operation 165*34d14614SJeremy L Thompson if (is_transpose) { 166*34d14614SJeremy L Thompson CeedSize length; 167*34d14614SJeremy L Thompson 168*34d14614SJeremy L Thompson CeedCallBackend(CeedVectorGetLength(v, &length)); 169*34d14614SJeremy L Thompson CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 170*34d14614SJeremy L Thompson } 171*34d14614SJeremy L Thompson 172*34d14614SJeremy L Thompson // Basis action 173*34d14614SJeremy L Thompson switch (eval_mode) { 174*34d14614SJeremy L Thompson case CEED_EVAL_INTERP: { 175*34d14614SJeremy L Thompson void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v}; 176*34d14614SJeremy L Thompson const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 177*34d14614SJeremy L Thompson 178*34d14614SJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); 179*34d14614SJeremy L Thompson } break; 180*34d14614SJeremy L Thompson case CEED_EVAL_GRAD: { 181*34d14614SJeremy L Thompson void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v}; 182*34d14614SJeremy L Thompson const CeedInt block_size = max_block_size; 183*34d14614SJeremy L Thompson 184*34d14614SJeremy L Thompson CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); 185*34d14614SJeremy L Thompson } break; 186*34d14614SJeremy L Thompson case CEED_EVAL_WEIGHT: 187*34d14614SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 188*34d14614SJeremy L Thompson break; 189*34d14614SJeremy L Thompson // LCOV_EXCL_START 190*34d14614SJeremy L Thompson case CEED_EVAL_DIV: 191*34d14614SJeremy L Thompson case CEED_EVAL_CURL: 192*34d14614SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 193*34d14614SJeremy L Thompson // LCOV_EXCL_STOP 194*34d14614SJeremy L Thompson } 195*34d14614SJeremy L Thompson 196*34d14614SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 197*34d14614SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 198*34d14614SJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 199*34d14614SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 200*34d14614SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 201*34d14614SJeremy L Thompson return CEED_ERROR_SUCCESS; 202*34d14614SJeremy L Thompson } 203*34d14614SJeremy L Thompson 204*34d14614SJeremy L Thompson //------------------------------------------------------------------------------ 2050d0321e0SJeremy L Thompson // Basis apply - non-tensor 2060d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2072b730f8bSJeremy L Thompson int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 2082b730f8bSJeremy L Thompson CeedVector v) { 2090d0321e0SJeremy L Thompson Ceed ceed; 210437930d1SJeremy L Thompson CeedInt num_nodes, num_qpts; 2117bbbfca3SJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 212d075f50bSSebastian Grimberg const int elems_per_block = 1; 213d075f50bSSebastian Grimberg const int grid = CeedDivUpInt(num_elem, elems_per_block); 2140d0321e0SJeremy L Thompson const CeedScalar *d_u; 2150d0321e0SJeremy L Thompson CeedScalar *d_v; 216ca735530SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 217ca735530SJeremy L Thompson 218ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 219ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 220ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 221ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 222ca735530SJeremy L Thompson 2239ea2cfd9SJeremy L Thompson // Get read/write access to u, v 2249ea2cfd9SJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 2259ea2cfd9SJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 2262b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 2270d0321e0SJeremy L Thompson 2280d0321e0SJeremy L Thompson // Clear v for transpose operation 2297bbbfca3SJeremy L Thompson if (is_transpose) { 2301f9221feSJeremy L Thompson CeedSize length; 231ca735530SJeremy L Thompson 2322b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(v, &length)); 2332b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 2340d0321e0SJeremy L Thompson } 2350d0321e0SJeremy L Thompson 2360d0321e0SJeremy L Thompson // Apply basis operation 237437930d1SJeremy L Thompson switch (eval_mode) { 2380d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 239d075f50bSSebastian Grimberg void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; 2407bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 241b2165e7aSSebastian Grimberg 2427bbbfca3SJeremy L Thompson if (is_transpose) { 243d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); 244d075f50bSSebastian Grimberg } else { 245b2165e7aSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); 246d075f50bSSebastian Grimberg } 2470d0321e0SJeremy L Thompson } break; 2480d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 249d075f50bSSebastian Grimberg void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; 2507bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 251b2165e7aSSebastian Grimberg 2527bbbfca3SJeremy L Thompson if (is_transpose) { 253d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); 254d075f50bSSebastian Grimberg } else { 255d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); 256d075f50bSSebastian Grimberg } 257d075f50bSSebastian Grimberg } break; 258d075f50bSSebastian Grimberg case CEED_EVAL_DIV: { 259d075f50bSSebastian Grimberg void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; 2607bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 261d075f50bSSebastian Grimberg 2627bbbfca3SJeremy L Thompson if (is_transpose) { 263d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); 264d075f50bSSebastian Grimberg } else { 265d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); 266d075f50bSSebastian Grimberg } 267d075f50bSSebastian Grimberg } break; 268d075f50bSSebastian Grimberg case CEED_EVAL_CURL: { 269d075f50bSSebastian Grimberg void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; 2707bbbfca3SJeremy L Thompson const int block_size_x = is_transpose ? num_nodes : num_qpts; 271d075f50bSSebastian Grimberg 2727bbbfca3SJeremy L Thompson if (is_transpose) { 273d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); 274d075f50bSSebastian Grimberg } else { 275d075f50bSSebastian Grimberg CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); 276d075f50bSSebastian Grimberg } 2770d0321e0SJeremy L Thompson } break; 2780d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 279097cc795SJames Wright CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 280437930d1SJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 281b2165e7aSSebastian Grimberg 282eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 2830d0321e0SJeremy L Thompson } break; 2849ea2cfd9SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 2859ea2cfd9SJeremy L Thompson break; 2860d0321e0SJeremy L Thompson } 2870d0321e0SJeremy L Thompson 2889ea2cfd9SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 2892b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 2909ea2cfd9SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 2919ea2cfd9SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 2920d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2930d0321e0SJeremy L Thompson } 2940d0321e0SJeremy L Thompson 2950d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2960d0321e0SJeremy L Thompson // Destroy tensor basis 2970d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2980d0321e0SJeremy L Thompson static int CeedBasisDestroy_Cuda(CeedBasis basis) { 2990d0321e0SJeremy L Thompson Ceed ceed; 3000d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 301ca735530SJeremy L Thompson 302ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 3032b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 3042b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(data->module)); 305*34d14614SJeremy L Thompson if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); 306097cc795SJames Wright if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 3072b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 3082b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 309*34d14614SJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); 3102b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&data)); 3110d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3120d0321e0SJeremy L Thompson } 3130d0321e0SJeremy L Thompson 3140d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3150d0321e0SJeremy L Thompson // Destroy non-tensor basis 3160d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3170d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 3180d0321e0SJeremy L Thompson Ceed ceed; 3190d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 320ca735530SJeremy L Thompson 321ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 3222b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 3232b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(data->module)); 324097cc795SJames Wright if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 3252b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_interp)); 3262b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_grad)); 327d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaFree(data->d_div)); 328d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaFree(data->d_curl)); 3292b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&data)); 3300d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3310d0321e0SJeremy L Thompson } 3320d0321e0SJeremy L Thompson 3330d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3340d0321e0SJeremy L Thompson // Create tensor 3350d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3362b730f8bSJeremy L Thompson int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 3372b730f8bSJeremy L Thompson const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 3380d0321e0SJeremy L Thompson Ceed ceed; 33922070f95SJeremy L Thompson char *basis_kernel_source; 34022070f95SJeremy L Thompson const char *basis_kernel_path; 341ca735530SJeremy L Thompson CeedInt num_comp; 342ca735530SJeremy L Thompson const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 343ca735530SJeremy L Thompson const CeedInt interp_bytes = q_bytes * P_1d; 3440d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 345ca735530SJeremy L Thompson 346ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 3472b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 3480d0321e0SJeremy L Thompson 3490d0321e0SJeremy L Thompson // Copy data to GPU 350097cc795SJames Wright if (q_weight_1d) { 3512b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 3522b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 353097cc795SJames Wright } 3542b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 3552b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 3562b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 3572b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 3580d0321e0SJeremy L Thompson 359ecc88aebSJeremy L Thompson // Compile basis kernels 3602b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 3612b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path)); 36223d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 3632b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 36423d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 365eb7e6cafSJeremy L Thompson CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 3662b730f8bSJeremy L Thompson num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 3672b730f8bSJeremy L Thompson "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 368eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 369eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); 370eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 3712b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_path)); 3722b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 373437930d1SJeremy L Thompson 3742b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 3750d0321e0SJeremy L Thompson 376d075f50bSSebastian Grimberg // Register backend functions 3772b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 378*34d14614SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); 3792b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 3800d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3810d0321e0SJeremy L Thompson } 3820d0321e0SJeremy L Thompson 3830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 384d075f50bSSebastian Grimberg // Create non-tensor H^1 3850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3862b730f8bSJeremy L Thompson int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 38751475c7cSJeremy L Thompson const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 3880d0321e0SJeremy L Thompson Ceed ceed; 38922070f95SJeremy L Thompson char *basis_kernel_source; 39022070f95SJeremy L Thompson const char *basis_kernel_path; 391d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_grad; 392ca735530SJeremy L Thompson const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 3930d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 394ca735530SJeremy L Thompson 395ca735530SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 3962b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 3970d0321e0SJeremy L Thompson 3980d0321e0SJeremy L Thompson // Copy basis data to GPU 399d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 400d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 401097cc795SJames Wright if (q_weight) { 4022b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 4032b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 404097cc795SJames Wright } 405d075f50bSSebastian Grimberg if (interp) { 406d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 407d075f50bSSebastian Grimberg 4082b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 4092b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 410d075f50bSSebastian Grimberg } 411d075f50bSSebastian Grimberg if (grad) { 412d075f50bSSebastian Grimberg const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 413d075f50bSSebastian Grimberg 4142b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 4152b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 416d075f50bSSebastian Grimberg } 4170d0321e0SJeremy L Thompson 4180d0321e0SJeremy L Thompson // Compile basis kernels 4192b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 4202b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 42123d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 4222b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 42323d4529eSJeremy L Thompson CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 424d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 425d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); 426d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 427d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 428d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 429d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 430d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 431d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_path)); 432d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 433d075f50bSSebastian Grimberg 434d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, data)); 435d075f50bSSebastian Grimberg 436d075f50bSSebastian Grimberg // Register backend functions 437d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 438d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 439d075f50bSSebastian Grimberg return CEED_ERROR_SUCCESS; 440d075f50bSSebastian Grimberg } 441d075f50bSSebastian Grimberg 442d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 443d075f50bSSebastian Grimberg // Create non-tensor H(div) 444d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 445d075f50bSSebastian Grimberg int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, 446d075f50bSSebastian Grimberg const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 447d075f50bSSebastian Grimberg Ceed ceed; 44822070f95SJeremy L Thompson char *basis_kernel_source; 44922070f95SJeremy L Thompson const char *basis_kernel_path; 450d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_div; 451d075f50bSSebastian Grimberg const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 452d075f50bSSebastian Grimberg CeedBasisNonTensor_Cuda *data; 453d075f50bSSebastian Grimberg 454d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 455d075f50bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &data)); 456d075f50bSSebastian Grimberg 457d075f50bSSebastian Grimberg // Copy basis data to GPU 458d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 459d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); 460097cc795SJames Wright if (q_weight) { 461d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 462d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 463097cc795SJames Wright } 464d075f50bSSebastian Grimberg if (interp) { 465d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 466d075f50bSSebastian Grimberg 467d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 468d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 469d075f50bSSebastian Grimberg } 470d075f50bSSebastian Grimberg if (div) { 471d075f50bSSebastian Grimberg const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; 472d075f50bSSebastian Grimberg 473d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes)); 474d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice)); 475d075f50bSSebastian Grimberg } 476d075f50bSSebastian Grimberg 477d075f50bSSebastian Grimberg // Compile basis kernels 478d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 479d075f50bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 480d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 481d075f50bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 482d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 483d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 484d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); 485d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 486d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 487d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 488d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 489d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 490d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_path)); 491d075f50bSSebastian Grimberg CeedCallBackend(CeedFree(&basis_kernel_source)); 492d075f50bSSebastian Grimberg 493d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisSetData(basis, data)); 494d075f50bSSebastian Grimberg 495d075f50bSSebastian Grimberg // Register backend functions 496d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 497d075f50bSSebastian Grimberg CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 498d075f50bSSebastian Grimberg return CEED_ERROR_SUCCESS; 499d075f50bSSebastian Grimberg } 500d075f50bSSebastian Grimberg 501d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 502d075f50bSSebastian Grimberg // Create non-tensor H(curl) 503d075f50bSSebastian Grimberg //------------------------------------------------------------------------------ 504d075f50bSSebastian Grimberg int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 505d075f50bSSebastian Grimberg const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 506d075f50bSSebastian Grimberg Ceed ceed; 50722070f95SJeremy L Thompson char *basis_kernel_source; 50822070f95SJeremy L Thompson const char *basis_kernel_path; 509d075f50bSSebastian Grimberg CeedInt num_comp, q_comp_interp, q_comp_curl; 510d075f50bSSebastian Grimberg const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 511d075f50bSSebastian Grimberg CeedBasisNonTensor_Cuda *data; 512d075f50bSSebastian Grimberg 513d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 514d075f50bSSebastian Grimberg CeedCallBackend(CeedCalloc(1, &data)); 515d075f50bSSebastian Grimberg 516d075f50bSSebastian Grimberg // Copy basis data to GPU 517d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 518d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); 519097cc795SJames Wright if (q_weight) { 520d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 521d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 522097cc795SJames Wright } 523d075f50bSSebastian Grimberg if (interp) { 524d075f50bSSebastian Grimberg const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 525d075f50bSSebastian Grimberg 526d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 527d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 528d075f50bSSebastian Grimberg } 529d075f50bSSebastian Grimberg if (curl) { 530d075f50bSSebastian Grimberg const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; 531d075f50bSSebastian Grimberg 532d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes)); 533d075f50bSSebastian Grimberg CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice)); 534d075f50bSSebastian Grimberg } 535d075f50bSSebastian Grimberg 536d075f50bSSebastian Grimberg // Compile basis kernels 537d075f50bSSebastian Grimberg CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 538d075f50bSSebastian Grimberg CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 539d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); 540d075f50bSSebastian Grimberg CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 541d075f50bSSebastian Grimberg CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); 542d075f50bSSebastian Grimberg CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", 543d075f50bSSebastian Grimberg q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); 544d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); 545d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 546d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); 547d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); 548d075f50bSSebastian Grimberg CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); 5492b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_path)); 5502b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 5510d0321e0SJeremy L Thompson 5522b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 5530d0321e0SJeremy L Thompson 5540d0321e0SJeremy L Thompson // Register backend functions 5552b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 5562b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 5570d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 5580d0321e0SJeremy L Thompson } 5592a86cc9dSSebastian Grimberg 5600d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 561