10d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 20d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 30d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details. 40d0321e0SJeremy L Thompson // 50d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software 60d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral 70d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and 80d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed. 90d0321e0SJeremy L Thompson // 100d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 110d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office 120d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for 130d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including 140d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early 150d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative. 160d0321e0SJeremy L Thompson 170d0321e0SJeremy L Thompson #include <ceed/ceed.h> 180d0321e0SJeremy L Thompson #include <ceed/backend.h> 19*437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 200d0321e0SJeremy L Thompson #include <cuda.h> 210d0321e0SJeremy L Thompson #include <cuda_runtime.h> 220d0321e0SJeremy L Thompson #include "ceed-cuda-ref.h" 230d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 240d0321e0SJeremy L Thompson 250d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 260d0321e0SJeremy L Thompson // Basis apply - tensor 270d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 28*437930d1SJeremy L Thompson int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, 29*437930d1SJeremy L Thompson CeedTransposeMode t_mode, CeedEvalMode eval_mode, 30*437930d1SJeremy L Thompson CeedVector u, CeedVector v) { 310d0321e0SJeremy L Thompson int ierr; 320d0321e0SJeremy L Thompson Ceed ceed; 330d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 340d0321e0SJeremy L Thompson Ceed_Cuda *ceed_Cuda; 350d0321e0SJeremy L Thompson ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 360d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 370d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 38*437930d1SJeremy L Thompson const CeedInt transpose = t_mode == CEED_TRANSPOSE; 39*437930d1SJeremy L Thompson const int max_block_size = 32; 400d0321e0SJeremy L Thompson 410d0321e0SJeremy L Thompson // Read vectors 420d0321e0SJeremy L Thompson const CeedScalar *d_u; 430d0321e0SJeremy L Thompson CeedScalar *d_v; 44*437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 450d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 460d0321e0SJeremy L Thompson } 470d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 480d0321e0SJeremy L Thompson 490d0321e0SJeremy L Thompson // Clear v for transpose operation 50*437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 510d0321e0SJeremy L Thompson CeedInt length; 520d0321e0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 530d0321e0SJeremy L Thompson ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); 540d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 550d0321e0SJeremy L Thompson } 56*437930d1SJeremy L Thompson CeedInt Q_1d, dim; 57*437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 580d0321e0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 590d0321e0SJeremy L Thompson 600d0321e0SJeremy L Thompson // Basis action 61*437930d1SJeremy L Thompson switch (eval_mode) { 620d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 63*437930d1SJeremy L Thompson void *interp_args[] = {(void *) &num_elem, (void *) &transpose, 64*437930d1SJeremy L Thompson &data->d_interp_1d, &d_u, &d_v 650d0321e0SJeremy L Thompson }; 66*437930d1SJeremy L Thompson CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); 670d0321e0SJeremy L Thompson 68*437930d1SJeremy L Thompson ierr = CeedRunKernelCuda(ceed, data->Interp, num_elem, block_size, interp_args); 690d0321e0SJeremy L Thompson CeedChkBackend(ierr); 700d0321e0SJeremy L Thompson } break; 710d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 72*437930d1SJeremy L Thompson void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 73*437930d1SJeremy L Thompson &data->d_grad_1d, &d_u, &d_v 740d0321e0SJeremy L Thompson }; 75*437930d1SJeremy L Thompson CeedInt block_size = max_block_size; 760d0321e0SJeremy L Thompson 77*437930d1SJeremy L Thompson ierr = CeedRunKernelCuda(ceed, data->Grad, num_elem, block_size, grad_args); 780d0321e0SJeremy L Thompson CeedChkBackend(ierr); 790d0321e0SJeremy L Thompson } break; 800d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 81*437930d1SJeremy L Thompson void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 82*437930d1SJeremy L Thompson const int grid_size = num_elem; 83*437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid_size, 84*437930d1SJeremy L Thompson Q_1d, dim >= 2 ? Q_1d : 1, 1, 85*437930d1SJeremy L Thompson weight_args); CeedChkBackend(ierr); 860d0321e0SJeremy L Thompson } break; 870d0321e0SJeremy L Thompson // LCOV_EXCL_START 880d0321e0SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 890d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 900d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 910d0321e0SJeremy L Thompson // Evaluate the curl to/from the quadrature points 920d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 930d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 940d0321e0SJeremy L Thompson // Take no action, BasisApply should not have been called 950d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 960d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 970d0321e0SJeremy L Thompson "CEED_EVAL_NONE does not make sense in this context"); 980d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 990d0321e0SJeremy L Thompson } 1000d0321e0SJeremy L Thompson 1010d0321e0SJeremy L Thompson // Restore vectors 102*437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 1030d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 1040d0321e0SJeremy L Thompson } 1050d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 1060d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1070d0321e0SJeremy L Thompson } 1080d0321e0SJeremy L Thompson 1090d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1100d0321e0SJeremy L Thompson // Basis apply - non-tensor 1110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 112*437930d1SJeremy L Thompson int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, 113*437930d1SJeremy L Thompson CeedTransposeMode t_mode, CeedEvalMode eval_mode, 1140d0321e0SJeremy L Thompson CeedVector u, CeedVector v) { 1150d0321e0SJeremy L Thompson int ierr; 1160d0321e0SJeremy L Thompson Ceed ceed; 1170d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 1180d0321e0SJeremy L Thompson Ceed_Cuda *ceed_Cuda; 1190d0321e0SJeremy L Thompson ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 1200d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 1210d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 122*437930d1SJeremy L Thompson CeedInt num_nodes, num_qpts; 123*437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChkBackend(ierr); 124*437930d1SJeremy L Thompson ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChkBackend(ierr); 125*437930d1SJeremy L Thompson const CeedInt transpose = t_mode == CEED_TRANSPOSE; 126*437930d1SJeremy L Thompson int elems_per_block = 1; 127*437930d1SJeremy L Thompson int grid = num_elem / elems_per_block + 128*437930d1SJeremy L Thompson ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); 1290d0321e0SJeremy L Thompson 1300d0321e0SJeremy L Thompson // Read vectors 1310d0321e0SJeremy L Thompson const CeedScalar *d_u; 1320d0321e0SJeremy L Thompson CeedScalar *d_v; 133*437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 1340d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 1350d0321e0SJeremy L Thompson } 1360d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 1370d0321e0SJeremy L Thompson 1380d0321e0SJeremy L Thompson // Clear v for transpose operation 139*437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 1400d0321e0SJeremy L Thompson CeedInt length; 1410d0321e0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 1420d0321e0SJeremy L Thompson ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); 1430d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 1440d0321e0SJeremy L Thompson } 1450d0321e0SJeremy L Thompson 1460d0321e0SJeremy L Thompson // Apply basis operation 147*437930d1SJeremy L Thompson switch (eval_mode) { 1480d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 149*437930d1SJeremy L Thompson void *interp_args[] = {(void *) &num_elem, (void *) &transpose, 1500d0321e0SJeremy L Thompson &data->d_interp, &d_u, &d_v 1510d0321e0SJeremy L Thompson }; 152*437930d1SJeremy L Thompson if (transpose) { 153*437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_nodes, 1, 154*437930d1SJeremy L Thompson elems_per_block, interp_args); CeedChkBackend(ierr); 1550d0321e0SJeremy L Thompson } else { 156*437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_qpts, 1, 157*437930d1SJeremy L Thompson elems_per_block, interp_args); CeedChkBackend(ierr); 1580d0321e0SJeremy L Thompson } 1590d0321e0SJeremy L Thompson } break; 1600d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 161*437930d1SJeremy L Thompson void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_grad, 1620d0321e0SJeremy L Thompson &d_u, &d_v 1630d0321e0SJeremy L Thompson }; 164*437930d1SJeremy L Thompson if (transpose) { 165*437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_nodes, 1, 166*437930d1SJeremy L Thompson elems_per_block, grad_args); CeedChkBackend(ierr); 1670d0321e0SJeremy L Thompson } else { 168*437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_qpts, 1, 169*437930d1SJeremy L Thompson elems_per_block, grad_args); CeedChkBackend(ierr); 1700d0321e0SJeremy L Thompson } 1710d0321e0SJeremy L Thompson } break; 1720d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 173*437930d1SJeremy L Thompson void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight, &d_v}; 174*437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid, num_qpts, 1, 175*437930d1SJeremy L Thompson elems_per_block, weight_args); CeedChkBackend(ierr); 1760d0321e0SJeremy L Thompson } break; 1770d0321e0SJeremy L Thompson // LCOV_EXCL_START 1780d0321e0SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 1790d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 1800d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 1810d0321e0SJeremy L Thompson // Evaluate the curl to/from the quadrature points 1820d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 1830d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 1840d0321e0SJeremy L Thompson // Take no action, BasisApply should not have been called 1850d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 1860d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1870d0321e0SJeremy L Thompson "CEED_EVAL_NONE does not make sense in this context"); 1880d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 1890d0321e0SJeremy L Thompson } 1900d0321e0SJeremy L Thompson 1910d0321e0SJeremy L Thompson // Restore vectors 192*437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 1930d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 1940d0321e0SJeremy L Thompson } 1950d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 1960d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1970d0321e0SJeremy L Thompson } 1980d0321e0SJeremy L Thompson 1990d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2000d0321e0SJeremy L Thompson // Destroy tensor basis 2010d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2020d0321e0SJeremy L Thompson static int CeedBasisDestroy_Cuda(CeedBasis basis) { 2030d0321e0SJeremy L Thompson int ierr; 2040d0321e0SJeremy L Thompson Ceed ceed; 2050d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 2060d0321e0SJeremy L Thompson 2070d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 2080d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 2090d0321e0SJeremy L Thompson 2100d0321e0SJeremy L Thompson CeedChk_Cu(ceed, cuModuleUnload(data->module)); 2110d0321e0SJeremy L Thompson 212*437930d1SJeremy L Thompson ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr); 213*437930d1SJeremy L Thompson ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr); 214*437930d1SJeremy L Thompson ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr); 2150d0321e0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 216*437930d1SJeremy L Thompson 2170d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2180d0321e0SJeremy L Thompson } 2190d0321e0SJeremy L Thompson 2200d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2210d0321e0SJeremy L Thompson // Destroy non-tensor basis 2220d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2230d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 2240d0321e0SJeremy L Thompson int ierr; 2250d0321e0SJeremy L Thompson Ceed ceed; 2260d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 2270d0321e0SJeremy L Thompson 2280d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 2290d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 2300d0321e0SJeremy L Thompson 2310d0321e0SJeremy L Thompson CeedChk_Cu(ceed, cuModuleUnload(data->module)); 2320d0321e0SJeremy L Thompson 233*437930d1SJeremy L Thompson ierr = cudaFree(data->d_q_weight); CeedChk_Cu(ceed, ierr); 2340d0321e0SJeremy L Thompson ierr = cudaFree(data->d_interp); CeedChk_Cu(ceed, ierr); 2350d0321e0SJeremy L Thompson ierr = cudaFree(data->d_grad); CeedChk_Cu(ceed, ierr); 2360d0321e0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 237*437930d1SJeremy L Thompson 2380d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2390d0321e0SJeremy L Thompson } 2400d0321e0SJeremy L Thompson 2410d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2420d0321e0SJeremy L Thompson // Create tensor 2430d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 244*437930d1SJeremy L Thompson int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 245*437930d1SJeremy L Thompson const CeedScalar *interp_1d, 246*437930d1SJeremy L Thompson const CeedScalar *grad_1d, 247*437930d1SJeremy L Thompson const CeedScalar *q_ref_1d, 248*437930d1SJeremy L Thompson const CeedScalar *q_weight_1d, 2490d0321e0SJeremy L Thompson CeedBasis basis) { 2500d0321e0SJeremy L Thompson int ierr; 2510d0321e0SJeremy L Thompson Ceed ceed; 2520d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 2530d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 2540d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 2550d0321e0SJeremy L Thompson 2560d0321e0SJeremy L Thompson // Copy data to GPU 257*437930d1SJeremy L Thompson const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 258*437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes); 259*437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 260*437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, 2610d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 2620d0321e0SJeremy L Thompson 263*437930d1SJeremy L Thompson const CeedInt interp_bytes = q_bytes * P_1d; 264*437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes); 265*437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 266*437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, 2670d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 2680d0321e0SJeremy L Thompson 269*437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes); 270*437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 271*437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, 2720d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 2730d0321e0SJeremy L Thompson 2740d0321e0SJeremy L Thompson // Complie basis kernels 275*437930d1SJeremy L Thompson CeedInt num_comp; 276*437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 277*437930d1SJeremy L Thompson char *basis_kernel_path, *basis_kernel_source; 278*437930d1SJeremy L Thompson ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-ref-basis-tensor.h", 279*437930d1SJeremy L Thompson &basis_kernel_path); CeedChkBackend(ierr); 280*437930d1SJeremy L Thompson ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 281*437930d1SJeremy L Thompson CeedChkBackend(ierr); 282*437930d1SJeremy L Thompson ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 7, 283*437930d1SJeremy L Thompson "BASIS_Q1D", Q_1d, 284*437930d1SJeremy L Thompson "BASIS_P1D", P_1d, 285*437930d1SJeremy L Thompson "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 286*437930d1SJeremy L Thompson Q_1d : P_1d, dim), 2870d0321e0SJeremy L Thompson "BASIS_DIM", dim, 288*437930d1SJeremy L Thompson "BASIS_NCOMP", num_comp, 289*437930d1SJeremy L Thompson "BASIS_ELEMSIZE", CeedIntPow(P_1d, dim), 290*437930d1SJeremy L Thompson "BASIS_NQPT", CeedIntPow(Q_1d, dim) 2910d0321e0SJeremy L Thompson ); CeedChkBackend(ierr); 292*437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); 2930d0321e0SJeremy L Thompson CeedChkBackend(ierr); 294*437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); 2950d0321e0SJeremy L Thompson CeedChkBackend(ierr); 296*437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); 2970d0321e0SJeremy L Thompson CeedChkBackend(ierr); 298*437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 299*437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 300*437930d1SJeremy L Thompson 3010d0321e0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 3020d0321e0SJeremy L Thompson 3030d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 3040d0321e0SJeremy L Thompson CeedBasisApply_Cuda); CeedChkBackend(ierr); 3050d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 3060d0321e0SJeremy L Thompson CeedBasisDestroy_Cuda); CeedChkBackend(ierr); 3070d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3080d0321e0SJeremy L Thompson } 3090d0321e0SJeremy L Thompson 3100d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3110d0321e0SJeremy L Thompson // Create non-tensor 3120d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 313*437930d1SJeremy L Thompson int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, 314*437930d1SJeremy L Thompson CeedInt num_nodes, 315*437930d1SJeremy L Thompson CeedInt num_qpts, const CeedScalar *interp, 3160d0321e0SJeremy L Thompson const CeedScalar *grad, const CeedScalar *qref, 317*437930d1SJeremy L Thompson const CeedScalar *q_weight, CeedBasis basis) { 3180d0321e0SJeremy L Thompson int ierr; 3190d0321e0SJeremy L Thompson Ceed ceed; 3200d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 3210d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 3220d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 3230d0321e0SJeremy L Thompson 3240d0321e0SJeremy L Thompson // Copy basis data to GPU 325*437930d1SJeremy L Thompson const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 326*437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_q_weight, q_bytes); CeedChk_Cu(ceed, ierr); 327*437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_q_weight, q_weight, q_bytes, 3280d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 3290d0321e0SJeremy L Thompson 330*437930d1SJeremy L Thompson const CeedInt interp_bytes = q_bytes * num_nodes; 331*437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_interp, interp_bytes); 332*437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 333*437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_interp, interp, interp_bytes, 3340d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 3350d0321e0SJeremy L Thompson 336*437930d1SJeremy L Thompson const CeedInt grad_bytes = q_bytes * num_nodes * dim; 337*437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_grad, grad_bytes); CeedChk_Cu(ceed, ierr); 338*437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_grad, grad, grad_bytes, 3390d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 3400d0321e0SJeremy L Thompson 3410d0321e0SJeremy L Thompson // Compile basis kernels 342*437930d1SJeremy L Thompson CeedInt num_comp; 343*437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 344*437930d1SJeremy L Thompson char *basis_kernel_path, *basis_kernel_source; 345*437930d1SJeremy L Thompson ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-ref-basis-nontensor.h", 346*437930d1SJeremy L Thompson &basis_kernel_path); CeedChkBackend(ierr); 347*437930d1SJeremy L Thompson ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 348*437930d1SJeremy L Thompson CeedChkBackend(ierr); 349*437930d1SJeremy L Thompson ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 4, 350*437930d1SJeremy L Thompson "Q", num_qpts, 351*437930d1SJeremy L Thompson "P", num_nodes, 3520d0321e0SJeremy L Thompson "BASIS_DIM", dim, 353*437930d1SJeremy L Thompson "BASIS_NCOMP", num_comp 3540d0321e0SJeremy L Thompson ); CeedChk_Cu(ceed, ierr); 355*437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); 3560d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 357*437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); 3580d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 359*437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); 3600d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 361*437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 362*437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 3630d0321e0SJeremy L Thompson 3640d0321e0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 3650d0321e0SJeremy L Thompson 3660d0321e0SJeremy L Thompson // Register backend functions 3670d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 3680d0321e0SJeremy L Thompson CeedBasisApplyNonTensor_Cuda); CeedChkBackend(ierr); 3690d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 3700d0321e0SJeremy L Thompson CeedBasisDestroyNonTensor_Cuda); CeedChkBackend(ierr); 3710d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3720d0321e0SJeremy L Thompson } 3730d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 374