13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 8*49aac155SJeremy L Thompson #include <ceed.h> 90d0321e0SJeremy L Thompson #include <ceed/backend.h> 10437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 110d0321e0SJeremy L Thompson #include <cuda.h> 120d0321e0SJeremy L Thompson #include <cuda_runtime.h> 132b730f8bSJeremy L Thompson 14*49aac155SJeremy L Thompson #include "../cuda/ceed-cuda-common.h" 150d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 162b730f8bSJeremy L Thompson #include "ceed-cuda-ref.h" 170d0321e0SJeremy L Thompson 180d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 190d0321e0SJeremy L Thompson // Basis apply - tensor 200d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 212b730f8bSJeremy L Thompson int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 220d0321e0SJeremy L Thompson Ceed ceed; 232b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 240d0321e0SJeremy L Thompson Ceed_Cuda *ceed_Cuda; 252b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 260d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 272b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 28437930d1SJeremy L Thompson const CeedInt transpose = t_mode == CEED_TRANSPOSE; 29437930d1SJeremy L Thompson const int max_block_size = 32; 300d0321e0SJeremy L Thompson 310d0321e0SJeremy L Thompson // Read vectors 320d0321e0SJeremy L Thompson const CeedScalar *d_u; 330d0321e0SJeremy L Thompson CeedScalar *d_v; 34437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 352b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 360d0321e0SJeremy L Thompson } 372b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 380d0321e0SJeremy L Thompson 390d0321e0SJeremy L Thompson // Clear v for transpose operation 40437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 411f9221feSJeremy L Thompson CeedSize length; 422b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(v, &length)); 432b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 440d0321e0SJeremy L Thompson } 45437930d1SJeremy L Thompson CeedInt Q_1d, dim; 462b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 472b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 480d0321e0SJeremy L Thompson 490d0321e0SJeremy L Thompson // Basis action 50437930d1SJeremy L Thompson switch (eval_mode) { 510d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 522b730f8bSJeremy L Thompson void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &d_u, &d_v}; 53437930d1SJeremy L Thompson CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 540d0321e0SJeremy L Thompson 552b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelCuda(ceed, data->Interp, num_elem, block_size, interp_args)); 560d0321e0SJeremy L Thompson } break; 570d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 582b730f8bSJeremy L Thompson void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; 59437930d1SJeremy L Thompson CeedInt block_size = max_block_size; 600d0321e0SJeremy L Thompson 612b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelCuda(ceed, data->Grad, num_elem, block_size, grad_args)); 620d0321e0SJeremy L Thompson } break; 630d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 64437930d1SJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 65437930d1SJeremy L Thompson const int grid_size = num_elem; 662b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Weight, grid_size, Q_1d, dim >= 2 ? Q_1d : 1, 1, weight_args)); 670d0321e0SJeremy L Thompson } break; 680d0321e0SJeremy L Thompson // LCOV_EXCL_START 690d0321e0SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 700d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 710d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 720d0321e0SJeremy L Thompson // Evaluate the curl to/from the quadrature points 730d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 740d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 750d0321e0SJeremy L Thompson // Take no action, BasisApply should not have been called 760d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 772b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 780d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 790d0321e0SJeremy L Thompson } 800d0321e0SJeremy L Thompson 810d0321e0SJeremy L Thompson // Restore vectors 82437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 832b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 840d0321e0SJeremy L Thompson } 852b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 860d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 870d0321e0SJeremy L Thompson } 880d0321e0SJeremy L Thompson 890d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 900d0321e0SJeremy L Thompson // Basis apply - non-tensor 910d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 922b730f8bSJeremy L Thompson int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 932b730f8bSJeremy L Thompson CeedVector v) { 940d0321e0SJeremy L Thompson Ceed ceed; 952b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 960d0321e0SJeremy L Thompson Ceed_Cuda *ceed_Cuda; 972b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); 980d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 992b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 100437930d1SJeremy L Thompson CeedInt num_nodes, num_qpts; 1012b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); 1022b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); 103437930d1SJeremy L Thompson const CeedInt transpose = t_mode == CEED_TRANSPOSE; 104437930d1SJeremy L Thompson int elems_per_block = 1; 1052b730f8bSJeremy L Thompson int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 1060d0321e0SJeremy L Thompson 1070d0321e0SJeremy L Thompson // Read vectors 1080d0321e0SJeremy L Thompson const CeedScalar *d_u; 1090d0321e0SJeremy L Thompson CeedScalar *d_v; 110437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 1112b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 1120d0321e0SJeremy L Thompson } 1132b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 1140d0321e0SJeremy L Thompson 1150d0321e0SJeremy L Thompson // Clear v for transpose operation 116437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 1171f9221feSJeremy L Thompson CeedSize length; 1182b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(v, &length)); 1192b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); 1200d0321e0SJeremy L Thompson } 1210d0321e0SJeremy L Thompson 1220d0321e0SJeremy L Thompson // Apply basis operation 123437930d1SJeremy L Thompson switch (eval_mode) { 1240d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 1252b730f8bSJeremy L Thompson void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp, &d_u, &d_v}; 126437930d1SJeremy L Thompson if (transpose) { 1272b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Interp, grid, num_nodes, 1, elems_per_block, interp_args)); 1280d0321e0SJeremy L Thompson } else { 1292b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Interp, grid, num_qpts, 1, elems_per_block, interp_args)); 1300d0321e0SJeremy L Thompson } 1310d0321e0SJeremy L Thompson } break; 1320d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 1332b730f8bSJeremy L Thompson void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_grad, &d_u, &d_v}; 134437930d1SJeremy L Thompson if (transpose) { 1352b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Grad, grid, num_nodes, 1, elems_per_block, grad_args)); 1360d0321e0SJeremy L Thompson } else { 1372b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Grad, grid, num_qpts, 1, elems_per_block, grad_args)); 1380d0321e0SJeremy L Thompson } 1390d0321e0SJeremy L Thompson } break; 1400d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 141437930d1SJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; 1422b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); 1430d0321e0SJeremy L Thompson } break; 1440d0321e0SJeremy L Thompson // LCOV_EXCL_START 1450d0321e0SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 1460d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 1470d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 1480d0321e0SJeremy L Thompson // Evaluate the curl to/from the quadrature points 1490d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 1500d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 1510d0321e0SJeremy L Thompson // Take no action, BasisApply should not have been called 1520d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 1532b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); 1540d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 1550d0321e0SJeremy L Thompson } 1560d0321e0SJeremy L Thompson 1570d0321e0SJeremy L Thompson // Restore vectors 158437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 1592b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 1600d0321e0SJeremy L Thompson } 1612b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 1620d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1630d0321e0SJeremy L Thompson } 1640d0321e0SJeremy L Thompson 1650d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1660d0321e0SJeremy L Thompson // Destroy tensor basis 1670d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1680d0321e0SJeremy L Thompson static int CeedBasisDestroy_Cuda(CeedBasis basis) { 1690d0321e0SJeremy L Thompson Ceed ceed; 1702b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 1710d0321e0SJeremy L Thompson 1720d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 1732b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 1740d0321e0SJeremy L Thompson 1752b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(data->module)); 1760d0321e0SJeremy L Thompson 1772b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); 1782b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); 1792b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); 1802b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&data)); 181437930d1SJeremy L Thompson 1820d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1830d0321e0SJeremy L Thompson } 1840d0321e0SJeremy L Thompson 1850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1860d0321e0SJeremy L Thompson // Destroy non-tensor basis 1870d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1880d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 1890d0321e0SJeremy L Thompson Ceed ceed; 1902b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 1910d0321e0SJeremy L Thompson 1920d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 1932b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 1940d0321e0SJeremy L Thompson 1952b730f8bSJeremy L Thompson CeedCallCuda(ceed, cuModuleUnload(data->module)); 1960d0321e0SJeremy L Thompson 1972b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_q_weight)); 1982b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_interp)); 1992b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaFree(data->d_grad)); 2002b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&data)); 201437930d1SJeremy L Thompson 2020d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2030d0321e0SJeremy L Thompson } 2040d0321e0SJeremy L Thompson 2050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2060d0321e0SJeremy L Thompson // Create tensor 2070d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2082b730f8bSJeremy L Thompson int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 2092b730f8bSJeremy L Thompson const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 2100d0321e0SJeremy L Thompson Ceed ceed; 2112b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 2120d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 2132b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 2140d0321e0SJeremy L Thompson 2150d0321e0SJeremy L Thompson // Copy data to GPU 216437930d1SJeremy L Thompson const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 2172b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); 2182b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); 2190d0321e0SJeremy L Thompson 220437930d1SJeremy L Thompson const CeedInt interp_bytes = q_bytes * P_1d; 2212b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); 2222b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 2230d0321e0SJeremy L Thompson 2242b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); 2252b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); 2260d0321e0SJeremy L Thompson 227ecc88aebSJeremy L Thompson // Compile basis kernels 228437930d1SJeremy L Thompson CeedInt num_comp; 2292b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 230437930d1SJeremy L Thompson char *basis_kernel_path, *basis_kernel_source; 2312b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path)); 23246dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 2332b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 23446dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 2352b730f8bSJeremy L Thompson CeedCallBackend(CeedCompileCuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", 2362b730f8bSJeremy L Thompson num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, 2372b730f8bSJeremy L Thompson "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); 2382b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp)); 2392b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad)); 2402b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight)); 2412b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_path)); 2422b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 243437930d1SJeremy L Thompson 2442b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 2450d0321e0SJeremy L Thompson 2462b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); 2472b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); 2480d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2490d0321e0SJeremy L Thompson } 2500d0321e0SJeremy L Thompson 2510d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2520d0321e0SJeremy L Thompson // Create non-tensor 2530d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2542b730f8bSJeremy L Thompson int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 2552b730f8bSJeremy L Thompson const CeedScalar *qref, const CeedScalar *q_weight, CeedBasis basis) { 2560d0321e0SJeremy L Thompson Ceed ceed; 2572b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 2580d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 2592b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 2600d0321e0SJeremy L Thompson 2610d0321e0SJeremy L Thompson // Copy basis data to GPU 262437930d1SJeremy L Thompson const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 2632b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); 2642b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); 2650d0321e0SJeremy L Thompson 266437930d1SJeremy L Thompson const CeedInt interp_bytes = q_bytes * num_nodes; 2672b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); 2682b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); 2690d0321e0SJeremy L Thompson 270437930d1SJeremy L Thompson const CeedInt grad_bytes = q_bytes * num_nodes * dim; 2712b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); 2722b730f8bSJeremy L Thompson CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); 2730d0321e0SJeremy L Thompson 2740d0321e0SJeremy L Thompson // Compile basis kernels 275437930d1SJeremy L Thompson CeedInt num_comp; 2762b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 277437930d1SJeremy L Thompson char *basis_kernel_path, *basis_kernel_source; 2782b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); 27946dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 2802b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); 28146dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 2822b730f8bSJeremy L Thompson CeedCallCuda(ceed, CeedCompileCuda(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim, 2832b730f8bSJeremy L Thompson "BASIS_NUM_COMP", num_comp)); 2842b730f8bSJeremy L Thompson CeedCallCuda(ceed, CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp)); 2852b730f8bSJeremy L Thompson CeedCallCuda(ceed, CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad)); 2862b730f8bSJeremy L Thompson CeedCallCuda(ceed, CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight)); 2872b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_path)); 2882b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&basis_kernel_source)); 2890d0321e0SJeremy L Thompson 2902b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 2910d0321e0SJeremy L Thompson 2920d0321e0SJeremy L Thompson // Register backend functions 2932b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); 2942b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); 2950d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2960d0321e0SJeremy L Thompson } 2970d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 298