1c532df63SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2c532df63SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3c532df63SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4c532df63SYohann // 5c532df63SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6c532df63SYohann // libraries and APIs for efficient high-order finite element and spectral 7c532df63SYohann // element discretizations for exascale applications. For more information and 8c532df63SYohann // source code availability see http://github.com/ceed. 9c532df63SYohann // 10c532df63SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11c532df63SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12c532df63SYohann // of Science and the National Nuclear Security Administration) responsible for 13c532df63SYohann // the planning and preparation of a capable exascale ecosystem, including 14c532df63SYohann // software, applications, hardware, advanced system engineering and early 15c532df63SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16c532df63SYohann 17ec3da8bcSJed Brown #include <ceed/ceed.h> 18ec3da8bcSJed Brown #include <ceed/backend.h> 19*437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 203d576824SJeremy L Thompson #include <cuda.h> 213d576824SJeremy L Thompson #include <cuda_runtime.h> 223d576824SJeremy L Thompson #include <stddef.h> 23c532df63SYohann #include "ceed-cuda-shared.h" 246d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 25c532df63SYohann 26c532df63SYohann 27ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 28ab213215SJeremy L Thompson // Device initalization 29ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 30*437930d1SJeremy L Thompson int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, 31c532df63SYohann CeedScalar **c_B); 32*437930d1SJeremy L Thompson int CeedCudaInitInterpGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, 33*437930d1SJeremy L Thompson CeedInt Q_1d, CeedScalar **c_B_ptr, 347f823360Sjeremylt CeedScalar **c_G_ptr); 35c532df63SYohann 36ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 37ab213215SJeremy L Thompson // Apply basis 38ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 39*437930d1SJeremy L Thompson int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, 40*437930d1SJeremy L Thompson CeedTransposeMode t_mode, 41*437930d1SJeremy L Thompson CeedEvalMode eval_mode, CeedVector u, 427f823360Sjeremylt CeedVector v) { 43c532df63SYohann int ierr; 44c532df63SYohann Ceed ceed; 45e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 466dbfb411Snbeams Ceed_Cuda *ceed_Cuda; 47e15f9bd0SJeremy L Thompson CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 48c532df63SYohann CeedBasis_Cuda_shared *data; 49e15f9bd0SJeremy L Thompson CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 50*437930d1SJeremy L Thompson const CeedInt transpose = t_mode == CEED_TRANSPOSE; 51*437930d1SJeremy L Thompson CeedInt dim, num_comp; 52e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 53*437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 54c532df63SYohann 55ab213215SJeremy L Thompson // Read vectors 56c532df63SYohann const CeedScalar *d_u; 57c532df63SYohann CeedScalar *d_v; 58*437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 59e15f9bd0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 60c532df63SYohann } 619c774eddSJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 62c532df63SYohann 63ab213215SJeremy L Thompson // Clear v for transpose mode 64*437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 65c532df63SYohann CeedInt length; 66e15f9bd0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 67e15f9bd0SJeremy L Thompson ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChkBackend(ierr); 68c532df63SYohann } 69ab213215SJeremy L Thompson 70ab213215SJeremy L Thompson // Apply basis operation 71*437930d1SJeremy L Thompson switch (eval_mode) { 72ab213215SJeremy L Thompson case CEED_EVAL_INTERP: { 73*437930d1SJeremy L Thompson CeedInt P_1d, Q_1d; 74*437930d1SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 75*437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 76*437930d1SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 77*437930d1SJeremy L Thompson ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B); 78e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 79*437930d1SJeremy L Thompson void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B, 80ccf0fe6fSjeremylt &d_u, &d_v 81ccf0fe6fSjeremylt }; 824d537eeaSYohann if (dim == 1) { 83*437930d1SJeremy L Thompson CeedInt elems_per_block = 32; 84*437930d1SJeremy L Thompson CeedInt grid = num_elem/elems_per_block + 85*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 86*437930d1SJeremy L Thompson CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 87*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1, 88*437930d1SJeremy L Thompson elems_per_block, shared_mem, 89*437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 90074be161SYohann Dudouit } else if (dim == 2) { 91*437930d1SJeremy L Thompson const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 92*437930d1SJeremy L Thompson // elems_per_block must be at least 1 93*437930d1SJeremy L Thompson CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / 94*437930d1SJeremy L Thompson num_comp : 1, 1); 95*437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 96*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 97*437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 98*437930d1SJeremy L Thompson CeedScalar); 99*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 100*437930d1SJeremy L Thompson thread_1d, 101*437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 102*437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 103074be161SYohann Dudouit } else if (dim == 3) { 104*437930d1SJeremy L Thompson CeedInt elems_per_block = 1; 105*437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 106*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 107*437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 108*437930d1SJeremy L Thompson CeedScalar); 109*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 110*437930d1SJeremy L Thompson thread_1d, 111*437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 112*437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 113074be161SYohann Dudouit } 114ab213215SJeremy L Thompson } break; 115ab213215SJeremy L Thompson case CEED_EVAL_GRAD: { 116*437930d1SJeremy L Thompson CeedInt P_1d, Q_1d; 117*437930d1SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 118*437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 119*437930d1SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 120*437930d1SJeremy L Thompson ierr = CeedCudaInitInterpGrad(data->d_interp_1d, data->d_grad_1d, P_1d, 121*437930d1SJeremy L Thompson Q_1d, &data->c_B, &data->c_G); 122e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 123*437930d1SJeremy L Thompson void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B, 124ccf0fe6fSjeremylt &data->c_G, &d_u, &d_v 125ccf0fe6fSjeremylt }; 1264d537eeaSYohann if (dim == 1) { 127*437930d1SJeremy L Thompson CeedInt elems_per_block = 32; 128*437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 129*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block<num_elem) ? 1 : 0 ); 130*437930d1SJeremy L Thompson CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 131*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1, 132*437930d1SJeremy L Thompson elems_per_block, shared_mem, grad_args); 133e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 134074be161SYohann Dudouit } else if (dim == 2) { 135*437930d1SJeremy L Thompson const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 136*437930d1SJeremy L Thompson // elems_per_block must be at least 1 137*437930d1SJeremy L Thompson CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / 138*437930d1SJeremy L Thompson num_comp : 1, 1); 139*437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 140*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 141*437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 142*437930d1SJeremy L Thompson CeedScalar); 143*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, 144*437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 145*437930d1SJeremy L Thompson grad_args); CeedChkBackend(ierr); 146074be161SYohann Dudouit } else if (dim == 3) { 147*437930d1SJeremy L Thompson CeedInt elems_per_block = 1; 148*437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 149*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 150*437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 151*437930d1SJeremy L Thompson CeedScalar); 152*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, 153*437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 154*437930d1SJeremy L Thompson grad_args); CeedChkBackend(ierr); 155074be161SYohann Dudouit } 156ab213215SJeremy L Thompson } break; 157ab213215SJeremy L Thompson case CEED_EVAL_WEIGHT: { 158*437930d1SJeremy L Thompson CeedInt Q_1d; 159*437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 160*437930d1SJeremy L Thompson void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 161074be161SYohann Dudouit if (dim == 1) { 162*437930d1SJeremy L Thompson const CeedInt elems_per_block = 32 / Q_1d; 163*437930d1SJeremy L Thompson const CeedInt gridsize = num_elem / elems_per_block + 164*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 165*437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, 166*437930d1SJeremy L Thompson elems_per_block, 1, weight_args); 167e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 168074be161SYohann Dudouit } else if (dim == 2) { 169*437930d1SJeremy L Thompson const CeedInt opt_elems = 32 / (Q_1d * Q_1d); 170*437930d1SJeremy L Thompson const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 171*437930d1SJeremy L Thompson const CeedInt gridsize = num_elem / elems_per_block + 172*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 173*437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, 174*437930d1SJeremy L Thompson elems_per_block, weight_args); 175e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 176074be161SYohann Dudouit } else if (dim == 3) { 177*437930d1SJeremy L Thompson const CeedInt gridsize = num_elem; 178*437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, Q_1d, 179*437930d1SJeremy L Thompson weight_args); 180e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 181074be161SYohann Dudouit } 182ab213215SJeremy L Thompson } break; 183ab213215SJeremy L Thompson // LCOV_EXCL_START 184ab213215SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 185ab213215SJeremy L Thompson case CEED_EVAL_DIV: 186e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 187ab213215SJeremy L Thompson // Evaluate the curl to/from the quadrature points 188ab213215SJeremy L Thompson case CEED_EVAL_CURL: 189e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 190ab213215SJeremy L Thompson // Take no action, BasisApply should not have been called 191ab213215SJeremy L Thompson case CEED_EVAL_NONE: 192e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 193ab213215SJeremy L Thompson "CEED_EVAL_NONE does not make sense in this context"); 194ab213215SJeremy L Thompson // LCOV_EXCL_STOP 195c532df63SYohann } 196c532df63SYohann 197ab213215SJeremy L Thompson // Restore vectors 198*437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 199e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 200c532df63SYohann } 201e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 202e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 203c532df63SYohann } 204c532df63SYohann 205ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 206ab213215SJeremy L Thompson // Destroy basis 207ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 208c532df63SYohann static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 209c532df63SYohann int ierr; 210c532df63SYohann Ceed ceed; 211e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 212c532df63SYohann 213c532df63SYohann CeedBasis_Cuda_shared *data; 214e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 215c532df63SYohann 216c532df63SYohann CeedChk_Cu(ceed, cuModuleUnload(data->module)); 217c532df63SYohann 218*437930d1SJeremy L Thompson ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr); 219*437930d1SJeremy L Thompson ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr); 220*437930d1SJeremy L Thompson ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr); 221*437930d1SJeremy L Thompson ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr); 222c532df63SYohann 223e15f9bd0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 224c532df63SYohann 225e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 226c532df63SYohann } 227c532df63SYohann 228ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 229ab213215SJeremy L Thompson // Create tensor basis 230ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 231*437930d1SJeremy L Thompson int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 232*437930d1SJeremy L Thompson const CeedScalar *interp_1d, 233*437930d1SJeremy L Thompson const CeedScalar *grad_1d, 234*437930d1SJeremy L Thompson const CeedScalar *q_ref_1d, 235*437930d1SJeremy L Thompson const CeedScalar *q_weight_1d, 236c532df63SYohann CeedBasis basis) { 237c532df63SYohann int ierr; 238c532df63SYohann Ceed ceed; 239e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 240c532df63SYohann CeedBasis_Cuda_shared *data; 241e15f9bd0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 242c532df63SYohann 243ab213215SJeremy L Thompson // Copy basis data to GPU 244*437930d1SJeremy L Thompson const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 245*437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes); 246*437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 247*437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, 248c532df63SYohann cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 249c532df63SYohann 250*437930d1SJeremy L Thompson const CeedInt interp_bytes = q_bytes * P_1d; 251*437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes); 252*437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 253*437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, 254c532df63SYohann cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 255c532df63SYohann 256*437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes); 257*437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 258*437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, 259c532df63SYohann cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 260c532df63SYohann 261ab213215SJeremy L Thompson // Compute collocated gradient and copy to GPU 262*437930d1SJeremy L Thompson data->d_collo_grad_1d = NULL; 263*437930d1SJeremy L Thompson if (dim == 3 && Q_1d >= P_1d) { 264*437930d1SJeremy L Thompson CeedScalar *collo_grad_1d; 265*437930d1SJeremy L Thompson ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr); 266*437930d1SJeremy L Thompson ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr); 267*437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d); 268ac421f39SYohann CeedChk_Cu(ceed, ierr); 269*437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, 270ac421f39SYohann cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 271*437930d1SJeremy L Thompson ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr); 272ac421f39SYohann } 273ac421f39SYohann 274ab213215SJeremy L Thompson // Compile basis kernels 275*437930d1SJeremy L Thompson CeedInt num_comp; 276*437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 277*437930d1SJeremy L Thompson char *basis_kernel_path, *basis_kernel_source; 278*437930d1SJeremy L Thompson ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-shared-basis.h", 279*437930d1SJeremy L Thompson &basis_kernel_path); CeedChkBackend(ierr); 280*437930d1SJeremy L Thompson ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 281*437930d1SJeremy L Thompson CeedChkBackend(ierr); 282*437930d1SJeremy L Thompson ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8, 283*437930d1SJeremy L Thompson "Q1D", Q_1d, 284*437930d1SJeremy L Thompson "P1D", P_1d, 285*437930d1SJeremy L Thompson "T1D", CeedIntMax(Q_1d, P_1d), 286*437930d1SJeremy L Thompson "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 287*437930d1SJeremy L Thompson Q_1d : P_1d, dim), 288c532df63SYohann "BASIS_DIM", dim, 289*437930d1SJeremy L Thompson "BASIS_NCOMP", num_comp, 290*437930d1SJeremy L Thompson "BASIS_ELEMSIZE", CeedIntPow(P_1d, dim), 291*437930d1SJeremy L Thompson "BASIS_NQPT", CeedIntPow(Q_1d, dim) 292e15f9bd0SJeremy L Thompson ); CeedChkBackend(ierr); 293*437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); 294e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 295*437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); 296e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 297*437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); 298e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 299*437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 300*437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 301c532df63SYohann 302e15f9bd0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 303ab213215SJeremy L Thompson 304ab213215SJeremy L Thompson // Register backend functions 305c532df63SYohann ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 306c532df63SYohann CeedBasisApplyTensor_Cuda_shared); 307e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 308c532df63SYohann ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 309e15f9bd0SJeremy L Thompson CeedBasisDestroy_Cuda_shared); CeedChkBackend(ierr); 310e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 311c532df63SYohann } 312ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 313