1*3d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2*3d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3c532df63SYohann // 4*3d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5c532df63SYohann // 6*3d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7c532df63SYohann 8ec3da8bcSJed Brown #include <ceed/ceed.h> 9ec3da8bcSJed Brown #include <ceed/backend.h> 10437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 113d576824SJeremy L Thompson #include <cuda.h> 123d576824SJeremy L Thompson #include <cuda_runtime.h> 133d576824SJeremy L Thompson #include <stddef.h> 14c532df63SYohann #include "ceed-cuda-shared.h" 156d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 16c532df63SYohann 17c532df63SYohann 18ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 19ab213215SJeremy L Thompson // Device initalization 20ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 21437930d1SJeremy L Thompson int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, 22c532df63SYohann CeedScalar **c_B); 23437930d1SJeremy L Thompson int CeedCudaInitInterpGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, 24437930d1SJeremy L Thompson CeedInt Q_1d, CeedScalar **c_B_ptr, 257f823360Sjeremylt CeedScalar **c_G_ptr); 26c532df63SYohann 27ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 28ab213215SJeremy L Thompson // Apply basis 29ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 30437930d1SJeremy L Thompson int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, 31437930d1SJeremy L Thompson CeedTransposeMode t_mode, 32437930d1SJeremy L Thompson CeedEvalMode eval_mode, CeedVector u, 337f823360Sjeremylt CeedVector v) { 34c532df63SYohann int ierr; 35c532df63SYohann Ceed ceed; 36e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 376dbfb411Snbeams Ceed_Cuda *ceed_Cuda; 38e15f9bd0SJeremy L Thompson CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 39c532df63SYohann CeedBasis_Cuda_shared *data; 40e15f9bd0SJeremy L Thompson CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 41437930d1SJeremy L Thompson const CeedInt transpose = t_mode == CEED_TRANSPOSE; 42437930d1SJeremy L Thompson CeedInt dim, num_comp; 43e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 44437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 45c532df63SYohann 46ab213215SJeremy L Thompson // Read vectors 47c532df63SYohann const CeedScalar *d_u; 48c532df63SYohann CeedScalar *d_v; 49437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 50e15f9bd0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 51c532df63SYohann } 529c774eddSJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 53c532df63SYohann 54ab213215SJeremy L Thompson // Clear v for transpose mode 55437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 561f9221feSJeremy L Thompson CeedSize length; 57e15f9bd0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 58e15f9bd0SJeremy L Thompson ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChkBackend(ierr); 59c532df63SYohann } 60ab213215SJeremy L Thompson 61ab213215SJeremy L Thompson // Apply basis operation 62437930d1SJeremy L Thompson switch (eval_mode) { 63ab213215SJeremy L Thompson case CEED_EVAL_INTERP: { 64437930d1SJeremy L Thompson CeedInt P_1d, Q_1d; 65437930d1SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 66437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 67437930d1SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 68437930d1SJeremy L Thompson ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B); 69e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 70437930d1SJeremy L Thompson void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B, 71ccf0fe6fSjeremylt &d_u, &d_v 72ccf0fe6fSjeremylt }; 734d537eeaSYohann if (dim == 1) { 74e6f67ff7SJed Brown CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], 75e6f67ff7SJed Brown CeedIntMax(512 / thread_1d, 76e6f67ff7SJed Brown 1)); // avoid >512 total threads 77437930d1SJeremy L Thompson CeedInt grid = num_elem/elems_per_block + 78437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 79437930d1SJeremy L Thompson CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 80437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1, 81437930d1SJeremy L Thompson elems_per_block, shared_mem, 82437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 83074be161SYohann Dudouit } else if (dim == 2) { 84437930d1SJeremy L Thompson const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 85437930d1SJeremy L Thompson // elems_per_block must be at least 1 86437930d1SJeremy L Thompson CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / 87437930d1SJeremy L Thompson num_comp : 1, 1); 88437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 89437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 90437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 91437930d1SJeremy L Thompson CeedScalar); 92437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 93437930d1SJeremy L Thompson thread_1d, 94437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 95437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 96074be161SYohann Dudouit } else if (dim == 3) { 97437930d1SJeremy L Thompson CeedInt elems_per_block = 1; 98437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 99437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 100437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 101437930d1SJeremy L Thompson CeedScalar); 102437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 103437930d1SJeremy L Thompson thread_1d, 104437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 105437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 106074be161SYohann Dudouit } 107ab213215SJeremy L Thompson } break; 108ab213215SJeremy L Thompson case CEED_EVAL_GRAD: { 109437930d1SJeremy L Thompson CeedInt P_1d, Q_1d; 110437930d1SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 111437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 112437930d1SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 113437930d1SJeremy L Thompson ierr = CeedCudaInitInterpGrad(data->d_interp_1d, data->d_grad_1d, P_1d, 114437930d1SJeremy L Thompson Q_1d, &data->c_B, &data->c_G); 115e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 116437930d1SJeremy L Thompson void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B, 117ccf0fe6fSjeremylt &data->c_G, &d_u, &d_v 118ccf0fe6fSjeremylt }; 1194d537eeaSYohann if (dim == 1) { 120e6f67ff7SJed Brown CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], 121e6f67ff7SJed Brown CeedIntMax(512 / thread_1d, 122e6f67ff7SJed Brown 1)); // avoid >512 total threads 123437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 124437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block<num_elem) ? 1 : 0 ); 125437930d1SJeremy L Thompson CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 126437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1, 127437930d1SJeremy L Thompson elems_per_block, shared_mem, grad_args); 128e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 129074be161SYohann Dudouit } else if (dim == 2) { 130437930d1SJeremy L Thompson const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; 131437930d1SJeremy L Thompson // elems_per_block must be at least 1 132437930d1SJeremy L Thompson CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / 133437930d1SJeremy L Thompson num_comp : 1, 1); 134437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 135437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 136437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 137437930d1SJeremy L Thompson CeedScalar); 138437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, 139437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 140437930d1SJeremy L Thompson grad_args); CeedChkBackend(ierr); 141074be161SYohann Dudouit } else if (dim == 3) { 142437930d1SJeremy L Thompson CeedInt elems_per_block = 1; 143437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 144437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 145437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 146437930d1SJeremy L Thompson CeedScalar); 147437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, 148437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 149437930d1SJeremy L Thompson grad_args); CeedChkBackend(ierr); 150074be161SYohann Dudouit } 151ab213215SJeremy L Thompson } break; 152ab213215SJeremy L Thompson case CEED_EVAL_WEIGHT: { 153437930d1SJeremy L Thompson CeedInt Q_1d; 154437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 155437930d1SJeremy L Thompson void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 156074be161SYohann Dudouit if (dim == 1) { 157437930d1SJeremy L Thompson const CeedInt elems_per_block = 32 / Q_1d; 158437930d1SJeremy L Thompson const CeedInt gridsize = num_elem / elems_per_block + 159437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 160437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, 161437930d1SJeremy L Thompson elems_per_block, 1, weight_args); 162e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 163074be161SYohann Dudouit } else if (dim == 2) { 164437930d1SJeremy L Thompson const CeedInt opt_elems = 32 / (Q_1d * Q_1d); 165437930d1SJeremy L Thompson const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 166437930d1SJeremy L Thompson const CeedInt gridsize = num_elem / elems_per_block + 167437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 168437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, 169437930d1SJeremy L Thompson elems_per_block, weight_args); 170e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 171074be161SYohann Dudouit } else if (dim == 3) { 172437930d1SJeremy L Thompson const CeedInt gridsize = num_elem; 173437930d1SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, Q_1d, 174437930d1SJeremy L Thompson weight_args); 175e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 176074be161SYohann Dudouit } 177ab213215SJeremy L Thompson } break; 178ab213215SJeremy L Thompson // LCOV_EXCL_START 179ab213215SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 180ab213215SJeremy L Thompson case CEED_EVAL_DIV: 181e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 182ab213215SJeremy L Thompson // Evaluate the curl to/from the quadrature points 183ab213215SJeremy L Thompson case CEED_EVAL_CURL: 184e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 185ab213215SJeremy L Thompson // Take no action, BasisApply should not have been called 186ab213215SJeremy L Thompson case CEED_EVAL_NONE: 187e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 188ab213215SJeremy L Thompson "CEED_EVAL_NONE does not make sense in this context"); 189ab213215SJeremy L Thompson // LCOV_EXCL_STOP 190c532df63SYohann } 191c532df63SYohann 192ab213215SJeremy L Thompson // Restore vectors 193437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 194e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 195c532df63SYohann } 196e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 197e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 198c532df63SYohann } 199c532df63SYohann 200ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 201ab213215SJeremy L Thompson // Destroy basis 202ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 203c532df63SYohann static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { 204c532df63SYohann int ierr; 205c532df63SYohann Ceed ceed; 206e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 207c532df63SYohann 208c532df63SYohann CeedBasis_Cuda_shared *data; 209e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 210c532df63SYohann 211c532df63SYohann CeedChk_Cu(ceed, cuModuleUnload(data->module)); 212c532df63SYohann 213437930d1SJeremy L Thompson ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr); 214437930d1SJeremy L Thompson ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr); 215437930d1SJeremy L Thompson ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr); 216437930d1SJeremy L Thompson ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr); 217c532df63SYohann 218e15f9bd0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 219c532df63SYohann 220e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 221c532df63SYohann } 222c532df63SYohann 223ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 224ab213215SJeremy L Thompson // Create tensor basis 225ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 226437930d1SJeremy L Thompson int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 227437930d1SJeremy L Thompson const CeedScalar *interp_1d, 228437930d1SJeremy L Thompson const CeedScalar *grad_1d, 229437930d1SJeremy L Thompson const CeedScalar *q_ref_1d, 230437930d1SJeremy L Thompson const CeedScalar *q_weight_1d, 231c532df63SYohann CeedBasis basis) { 232c532df63SYohann int ierr; 233c532df63SYohann Ceed ceed; 234e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 235c532df63SYohann CeedBasis_Cuda_shared *data; 236e15f9bd0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 237c532df63SYohann 238ab213215SJeremy L Thompson // Copy basis data to GPU 239437930d1SJeremy L Thompson const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 240437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes); 241437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 242437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, 243c532df63SYohann cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 244c532df63SYohann 245437930d1SJeremy L Thompson const CeedInt interp_bytes = q_bytes * P_1d; 246437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes); 247437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 248437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, 249c532df63SYohann cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 250c532df63SYohann 251437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes); 252437930d1SJeremy L Thompson CeedChk_Cu(ceed, ierr); 253437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, 254c532df63SYohann cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 255c532df63SYohann 256ab213215SJeremy L Thompson // Compute collocated gradient and copy to GPU 257437930d1SJeremy L Thompson data->d_collo_grad_1d = NULL; 258437930d1SJeremy L Thompson if (dim == 3 && Q_1d >= P_1d) { 259437930d1SJeremy L Thompson CeedScalar *collo_grad_1d; 260437930d1SJeremy L Thompson ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr); 261437930d1SJeremy L Thompson ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr); 262437930d1SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d); 263ac421f39SYohann CeedChk_Cu(ceed, ierr); 264437930d1SJeremy L Thompson ierr = cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, 265ac421f39SYohann cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 266437930d1SJeremy L Thompson ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr); 267ac421f39SYohann } 268ac421f39SYohann 269ab213215SJeremy L Thompson // Compile basis kernels 270437930d1SJeremy L Thompson CeedInt num_comp; 271437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 272437930d1SJeremy L Thompson char *basis_kernel_path, *basis_kernel_source; 273437930d1SJeremy L Thompson ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-shared-basis.h", 274437930d1SJeremy L Thompson &basis_kernel_path); CeedChkBackend(ierr); 27546dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 276437930d1SJeremy L Thompson ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 277437930d1SJeremy L Thompson CeedChkBackend(ierr); 27846dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n"); 279437930d1SJeremy L Thompson ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8, 280d7d111ecSJeremy L Thompson "BASIS_Q_1D", Q_1d, 281d7d111ecSJeremy L Thompson "BASIS_P_1D", P_1d, 282d7d111ecSJeremy L Thompson "BASIS_T_1D", CeedIntMax(Q_1d, P_1d), 283437930d1SJeremy L Thompson "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 284437930d1SJeremy L Thompson Q_1d : P_1d, dim), 285c532df63SYohann "BASIS_DIM", dim, 286d7d111ecSJeremy L Thompson "BASIS_NUM_COMP", num_comp, 287d7d111ecSJeremy L Thompson "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 288d7d111ecSJeremy L Thompson "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim) 289e15f9bd0SJeremy L Thompson ); CeedChkBackend(ierr); 290437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); 291e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 292437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); 293e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 294437930d1SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); 295e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 296437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 297437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 298c532df63SYohann 299e15f9bd0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 300ab213215SJeremy L Thompson 301ab213215SJeremy L Thompson // Register backend functions 302c532df63SYohann ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 303c532df63SYohann CeedBasisApplyTensor_Cuda_shared); 304e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 305c532df63SYohann ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 306e15f9bd0SJeremy L Thompson CeedBasisDestroy_Cuda_shared); CeedChkBackend(ierr); 307e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 308c532df63SYohann } 309ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 310