1*3d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2*3d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 4*3d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 6*3d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 80d0321e0SJeremy L Thompson #include <ceed/ceed.h> 90d0321e0SJeremy L Thompson #include <ceed/backend.h> 10437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 110d0321e0SJeremy L Thompson #include <cuda.h> 120d0321e0SJeremy L Thompson #include <cuda_runtime.h> 130d0321e0SJeremy L Thompson #include "ceed-cuda-ref.h" 140d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 150d0321e0SJeremy L Thompson 160d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 170d0321e0SJeremy L Thompson // Basis apply - tensor 180d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 19437930d1SJeremy L Thompson int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, 20437930d1SJeremy L Thompson CeedTransposeMode t_mode, CeedEvalMode eval_mode, 21437930d1SJeremy L Thompson CeedVector u, CeedVector v) { 220d0321e0SJeremy L Thompson int ierr; 230d0321e0SJeremy L Thompson Ceed ceed; 240d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 250d0321e0SJeremy L Thompson Ceed_Cuda *ceed_Cuda; 260d0321e0SJeremy L Thompson ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 270d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 280d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 29437930d1SJeremy L Thompson const CeedInt transpose = t_mode == CEED_TRANSPOSE; 30437930d1SJeremy L Thompson const int max_block_size = 32; 310d0321e0SJeremy L Thompson 320d0321e0SJeremy L Thompson // Read vectors 330d0321e0SJeremy L Thompson const CeedScalar *d_u; 340d0321e0SJeremy L Thompson CeedScalar *d_v; 35437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 360d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 370d0321e0SJeremy L Thompson } 380d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 390d0321e0SJeremy L Thompson 400d0321e0SJeremy L Thompson // Clear v for transpose operation 41437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 421f9221feSJeremy L Thompson CeedSize length; 430d0321e0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 440d0321e0SJeremy L Thompson ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); 450d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 460d0321e0SJeremy L Thompson } 47437930d1SJeremy L Thompson CeedInt Q_1d, dim; 48437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 490d0321e0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 500d0321e0SJeremy L Thompson 510d0321e0SJeremy L Thompson // Basis action 52437930d1SJeremy L Thompson switch (eval_mode) { 530d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 54437930d1SJeremy L Thompson void *interp_args[] = {(void *) &num_elem, (void *) &transpose, 55437930d1SJeremy L Thompson &data->d_interp_1d, &d_u, &d_v 560d0321e0SJeremy L Thompson }; 57437930d1SJeremy L Thompson CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 580d0321e0SJeremy L Thompson 59437930d1SJeremy L Thompson ierr = CeedRunKernelCuda(ceed, data->Interp, num_elem, block_size, interp_args); 600d0321e0SJeremy L Thompson CeedChkBackend(ierr); 610d0321e0SJeremy L Thompson } break; 620d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 63437930d1SJeremy L Thompson void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 64437930d1SJeremy L Thompson &data->d_grad_1d, &d_u, &d_v 650d0321e0SJeremy L Thompson }; 66437930d1SJeremy L Thompson CeedInt block_size = max_block_size; 670d0321e0SJeremy L Thompson 68437930d1SJeremy L Thompson ierr = CeedRunKernelCuda(ceed, data->Grad, num_elem, block_size, grad_args); 690d0321e0SJeremy L Thompson CeedChkBackend(ierr); 700d0321e0SJeremy L Thompson } break; 710d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 72437930d1SJeremy L Thompson void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 73437930d1SJeremy L Thompson const int grid_size = num_elem; 74437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid_size, 75437930d1SJeremy L Thompson Q_1d, dim >= 2 ? Q_1d : 1, 1, 76437930d1SJeremy L Thompson weight_args); CeedChkBackend(ierr); 770d0321e0SJeremy L Thompson } break; 780d0321e0SJeremy L Thompson // LCOV_EXCL_START 790d0321e0SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 800d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 810d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 820d0321e0SJeremy L Thompson // Evaluate the curl to/from the quadrature points 830d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 840d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 850d0321e0SJeremy L Thompson // Take no action, BasisApply should not have been called 860d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 870d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 880d0321e0SJeremy L Thompson "CEED_EVAL_NONE does not make sense in this context"); 890d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 900d0321e0SJeremy L Thompson } 910d0321e0SJeremy L Thompson 920d0321e0SJeremy L Thompson // Restore vectors 93437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 940d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 950d0321e0SJeremy L Thompson } 960d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 970d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 980d0321e0SJeremy L Thompson } 990d0321e0SJeremy L Thompson 1000d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1010d0321e0SJeremy L Thompson // Basis apply - non-tensor 1020d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 103437930d1SJeremy L Thompson int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, 104437930d1SJeremy L Thompson CeedTransposeMode t_mode, CeedEvalMode eval_mode, 1050d0321e0SJeremy L Thompson CeedVector u, CeedVector v) { 1060d0321e0SJeremy L Thompson int ierr; 1070d0321e0SJeremy L Thompson Ceed ceed; 1080d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 1090d0321e0SJeremy L Thompson Ceed_Cuda *ceed_Cuda; 1100d0321e0SJeremy L Thompson ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 1110d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 1120d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 113437930d1SJeremy L Thompson CeedInt num_nodes, num_qpts; 114437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChkBackend(ierr); 115437930d1SJeremy L Thompson ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChkBackend(ierr); 116437930d1SJeremy L Thompson const CeedInt transpose = t_mode == CEED_TRANSPOSE; 117437930d1SJeremy L Thompson int elems_per_block = 1; 118437930d1SJeremy L Thompson int grid = num_elem / elems_per_block + 119437930d1SJeremy L Thompson ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 1200d0321e0SJeremy L Thompson 1210d0321e0SJeremy L Thompson // Read vectors 1220d0321e0SJeremy L Thompson const CeedScalar *d_u; 1230d0321e0SJeremy L Thompson CeedScalar *d_v; 124437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 1250d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 1260d0321e0SJeremy L Thompson } 1270d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 1280d0321e0SJeremy L Thompson 1290d0321e0SJeremy L Thompson // Clear v for transpose operation 130437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 1311f9221feSJeremy L Thompson CeedSize length; 1320d0321e0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 1330d0321e0SJeremy L Thompson ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); 1340d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 1350d0321e0SJeremy L Thompson } 1360d0321e0SJeremy L Thompson 1370d0321e0SJeremy L Thompson // Apply basis operation 138437930d1SJeremy L Thompson switch (eval_mode) { 1390d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 140437930d1SJeremy L Thompson void *interp_args[] = {(void *) &num_elem, (void *) &transpose, 1410d0321e0SJeremy L Thompson &data->d_interp, &d_u, &d_v 1420d0321e0SJeremy L Thompson }; 143437930d1SJeremy L Thompson if (transpose) { 144437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_nodes, 1, 145437930d1SJeremy L Thompson elems_per_block, interp_args); CeedChkBackend(ierr); 1460d0321e0SJeremy L Thompson } else { 147437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_qpts, 1, 148437930d1SJeremy L Thompson elems_per_block, interp_args); CeedChkBackend(ierr); 1490d0321e0SJeremy L Thompson } 1500d0321e0SJeremy L Thompson } break; 1510d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 152437930d1SJeremy L Thompson void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_grad, 1530d0321e0SJeremy L Thompson &d_u, &d_v 1540d0321e0SJeremy L Thompson }; 155437930d1SJeremy L Thompson if (transpose) { 156437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_nodes, 1, 157437930d1SJeremy L Thompson elems_per_block, grad_args); CeedChkBackend(ierr); 1580d0321e0SJeremy L Thompson } else { 159437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_qpts, 1, 160437930d1SJeremy L Thompson elems_per_block, grad_args); CeedChkBackend(ierr); 1610d0321e0SJeremy L Thompson } 1620d0321e0SJeremy L Thompson } break; 1630d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 164437930d1SJeremy L Thompson void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight, &d_v}; 165437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid, num_qpts, 1, 166437930d1SJeremy L Thompson elems_per_block, weight_args); CeedChkBackend(ierr); 1670d0321e0SJeremy L Thompson } break; 1680d0321e0SJeremy L Thompson // LCOV_EXCL_START 1690d0321e0SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 1700d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 1710d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 1720d0321e0SJeremy L Thompson // Evaluate the curl to/from the quadrature points 1730d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 1740d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 1750d0321e0SJeremy L Thompson // Take no action, BasisApply should not have been called 1760d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 1770d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1780d0321e0SJeremy L Thompson "CEED_EVAL_NONE does not make sense in this context"); 1790d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 1800d0321e0SJeremy L Thompson } 1810d0321e0SJeremy L Thompson 1820d0321e0SJeremy L Thompson // Restore vectors 183437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 1840d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 1850d0321e0SJeremy L Thompson } 1860d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 1870d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1880d0321e0SJeremy L Thompson } 1890d0321e0SJeremy L Thompson 1900d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1910d0321e0SJeremy L Thompson // Destroy tensor basis 1920d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1930d0321e0SJeremy L Thompson static int CeedBasisDestroy_Cuda(CeedBasis basis) { 1940d0321e0SJeremy L Thompson int ierr; 1950d0321e0SJeremy L Thompson Ceed ceed; 1960d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 1970d0321e0SJeremy L Thompson 1980d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 1990d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 2000d0321e0SJeremy L Thompson 2010d0321e0SJeremy L Thompson CeedChk_Cu(ceed, cuModuleUnload(data->module)); 2020d0321e0SJeremy L Thompson 203437930d1SJeremy L Thompson ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr); 204437930d1SJeremy L Thompson ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr); 205437930d1SJeremy L Thompson ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr); 2060d0321e0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 207437930d1SJeremy L Thompson 2080d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2090d0321e0SJeremy L Thompson } 2100d0321e0SJeremy L Thompson 2110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2120d0321e0SJeremy L Thompson // Destroy non-tensor basis 2130d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2140d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 2150d0321e0SJeremy L Thompson int ierr; 2160d0321e0SJeremy L Thompson Ceed ceed; 2170d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 2180d0321e0SJeremy L Thompson 2190d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 2200d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 2210d0321e0SJeremy L Thompson 2220d0321e0SJeremy L Thompson CeedChk_Cu(ceed, cuModuleUnload(data->module)); 2230d0321e0SJeremy L Thompson 224437930d1SJeremy L Thompson ierr = cudaFree(data->d_q_weight); CeedChk_Cu(ceed, ierr); 2250d0321e0SJeremy L Thompson ierr = cudaFree(data->d_interp); CeedChk_Cu(ceed, ierr); 2260d0321e0SJeremy L Thompson ierr = cudaFree(data->d_grad); CeedChk_Cu(ceed, ierr); 2270d0321e0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 228437930d1SJeremy L Thompson 2290d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2300d0321e0SJeremy L Thompson } 2310d0321e0SJeremy L Thompson 2320d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2330d0321e0SJeremy L Thompson // Create tensor 2340d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 235437930d1SJeremy L Thompson int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 236437930d1SJeremy L Thompson const CeedScalar *interp_1d, 237437930d1SJeremy L Thompson const CeedScalar *grad_1d, 238437930d1SJeremy L Thompson const CeedScalar *q_ref_1d, 239437930d1SJeremy L Thompson const CeedScalar *q_weight_1d, 2400d0321e0SJeremy L Thompson CeedBasis basis) { 2410d0321e0SJeremy L Thompson int ierr; 2420d0321e0SJeremy L Thompson Ceed ceed; 2430d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 2440d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 2450d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 2460d0321e0SJeremy L Thompson 2470d0321e0SJeremy L Thompson // Copy data to GPU 248437930d1SJeremy L Thompson const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 249437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes); 250437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 251437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, 2520d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 2530d0321e0SJeremy L Thompson 254437930d1SJeremy L Thompson const CeedInt interp_bytes = q_bytes * P_1d; 255437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes); 256437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 257437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, 2580d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 2590d0321e0SJeremy L Thompson 260437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes); 261437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 262437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, 2630d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 2640d0321e0SJeremy L Thompson 2650d0321e0SJeremy L Thompson // Complie basis kernels 266437930d1SJeremy L Thompson CeedInt num_comp; 267437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 268437930d1SJeremy L Thompson char *basis_kernel_path, *basis_kernel_source; 269437930d1SJeremy L Thompson ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-ref-basis-tensor.h", 270437930d1SJeremy L Thompson &basis_kernel_path); CeedChkBackend(ierr); 27146dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 272437930d1SJeremy L Thompson ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 273437930d1SJeremy L Thompson CeedChkBackend(ierr); 27446dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 275437930d1SJeremy L Thompson ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 7, 276d7d111ecSJeremy L Thompson "BASIS_Q_1D", Q_1d, 277d7d111ecSJeremy L Thompson "BASIS_P_1D", P_1d, 278437930d1SJeremy L Thompson "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 279437930d1SJeremy L Thompson Q_1d : P_1d, dim), 2800d0321e0SJeremy L Thompson "BASIS_DIM", dim, 281d7d111ecSJeremy L Thompson "BASIS_NUM_COMP", num_comp, 282d7d111ecSJeremy L Thompson "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 283d7d111ecSJeremy L Thompson "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim) 2840d0321e0SJeremy L Thompson ); CeedChkBackend(ierr); 285437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); 2860d0321e0SJeremy L Thompson CeedChkBackend(ierr); 287437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); 2880d0321e0SJeremy L Thompson CeedChkBackend(ierr); 289437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); 2900d0321e0SJeremy L Thompson CeedChkBackend(ierr); 291437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 292437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 293437930d1SJeremy L Thompson 2940d0321e0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 2950d0321e0SJeremy L Thompson 2960d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 2970d0321e0SJeremy L Thompson CeedBasisApply_Cuda); CeedChkBackend(ierr); 2980d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 2990d0321e0SJeremy L Thompson CeedBasisDestroy_Cuda); CeedChkBackend(ierr); 3000d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3010d0321e0SJeremy L Thompson } 3020d0321e0SJeremy L Thompson 3030d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3040d0321e0SJeremy L Thompson // Create non-tensor 3050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 306437930d1SJeremy L Thompson int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, 307437930d1SJeremy L Thompson CeedInt num_nodes, 308437930d1SJeremy L Thompson CeedInt num_qpts, const CeedScalar *interp, 3090d0321e0SJeremy L Thompson const CeedScalar *grad, const CeedScalar *qref, 310437930d1SJeremy L Thompson const CeedScalar *q_weight, CeedBasis basis) { 3110d0321e0SJeremy L Thompson int ierr; 3120d0321e0SJeremy L Thompson Ceed ceed; 3130d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 3140d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 3150d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 3160d0321e0SJeremy L Thompson 3170d0321e0SJeremy L Thompson // Copy basis data to GPU 318437930d1SJeremy L Thompson const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 319437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_q_weight, q_bytes); CeedChk_Cu(ceed, ierr); 320437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_q_weight, q_weight, q_bytes, 3210d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 3220d0321e0SJeremy L Thompson 323437930d1SJeremy L Thompson const CeedInt interp_bytes = q_bytes * num_nodes; 324437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_interp, interp_bytes); 325437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 326437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_interp, interp, interp_bytes, 3270d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 3280d0321e0SJeremy L Thompson 329437930d1SJeremy L Thompson const CeedInt grad_bytes = q_bytes * num_nodes * dim; 330437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_grad, grad_bytes); CeedChk_Cu(ceed, ierr); 331437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_grad, grad, grad_bytes, 3320d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 3330d0321e0SJeremy L Thompson 3340d0321e0SJeremy L Thompson // Compile basis kernels 335437930d1SJeremy L Thompson CeedInt num_comp; 336437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 337437930d1SJeremy L Thompson char *basis_kernel_path, *basis_kernel_source; 338437930d1SJeremy L Thompson ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-ref-basis-nontensor.h", 339437930d1SJeremy L Thompson &basis_kernel_path); CeedChkBackend(ierr); 34046dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 341437930d1SJeremy L Thompson ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 342437930d1SJeremy L Thompson CeedChkBackend(ierr); 34346dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 344437930d1SJeremy L Thompson ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 4, 345d7d111ecSJeremy L Thompson "BASIS_Q", num_qpts, 346d7d111ecSJeremy L Thompson "BASIS_P", num_nodes, 3470d0321e0SJeremy L Thompson "BASIS_DIM", dim, 348d7d111ecSJeremy L Thompson "BASIS_NUM_COMP", num_comp 3490d0321e0SJeremy L Thompson ); CeedChk_Cu(ceed, ierr); 350437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); 3510d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 352437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); 3530d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 354437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); 3550d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 356437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 357437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 3580d0321e0SJeremy L Thompson 3590d0321e0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 3600d0321e0SJeremy L Thompson 3610d0321e0SJeremy L Thompson // Register backend functions 3620d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 3630d0321e0SJeremy L Thompson CeedBasisApplyNonTensor_Cuda); CeedChkBackend(ierr); 3640d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 3650d0321e0SJeremy L Thompson CeedBasisDestroyNonTensor_Cuda); CeedChkBackend(ierr); 3660d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3670d0321e0SJeremy L Thompson } 3680d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 369