xref: /libCEED/backends/cuda-shared/ceed-cuda-shared-basis.c (revision e6f67ff732e27992a07b79bab2d86654e6d26ce3)
1c532df63SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2c532df63SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3c532df63SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4c532df63SYohann //
5c532df63SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6c532df63SYohann // libraries and APIs for efficient high-order finite element and spectral
7c532df63SYohann // element discretizations for exascale applications. For more information and
8c532df63SYohann // source code availability see http://github.com/ceed.
9c532df63SYohann //
10c532df63SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11c532df63SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12c532df63SYohann // of Science and the National Nuclear Security Administration) responsible for
13c532df63SYohann // the planning and preparation of a capable exascale ecosystem, including
14c532df63SYohann // software, applications, hardware, advanced system engineering and early
15c532df63SYohann // testbed platforms, in support of the nation's exascale computing imperative.
16c532df63SYohann 
17ec3da8bcSJed Brown #include <ceed/ceed.h>
18ec3da8bcSJed Brown #include <ceed/backend.h>
19437930d1SJeremy L Thompson #include <ceed/jit-tools.h>
203d576824SJeremy L Thompson #include <cuda.h>
213d576824SJeremy L Thompson #include <cuda_runtime.h>
223d576824SJeremy L Thompson #include <stddef.h>
23c532df63SYohann #include "ceed-cuda-shared.h"
246d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h"
25c532df63SYohann 
26c532df63SYohann 
27ab213215SJeremy L Thompson //------------------------------------------------------------------------------
28ab213215SJeremy L Thompson // Device initalization
29ab213215SJeremy L Thompson //------------------------------------------------------------------------------
30437930d1SJeremy L Thompson int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d,
31c532df63SYohann                        CeedScalar **c_B);
32437930d1SJeremy L Thompson int CeedCudaInitInterpGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d,
33437930d1SJeremy L Thompson                            CeedInt Q_1d, CeedScalar **c_B_ptr,
347f823360Sjeremylt                            CeedScalar **c_G_ptr);
35c532df63SYohann 
36ab213215SJeremy L Thompson //------------------------------------------------------------------------------
37ab213215SJeremy L Thompson // Apply basis
38ab213215SJeremy L Thompson //------------------------------------------------------------------------------
39437930d1SJeremy L Thompson int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem,
40437930d1SJeremy L Thompson                                      CeedTransposeMode t_mode,
41437930d1SJeremy L Thompson                                      CeedEvalMode eval_mode, CeedVector u,
427f823360Sjeremylt                                      CeedVector v) {
43c532df63SYohann   int ierr;
44c532df63SYohann   Ceed ceed;
45e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
466dbfb411Snbeams   Ceed_Cuda *ceed_Cuda;
47e15f9bd0SJeremy L Thompson   CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
48c532df63SYohann   CeedBasis_Cuda_shared *data;
49e15f9bd0SJeremy L Thompson   CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
50437930d1SJeremy L Thompson   const CeedInt transpose = t_mode == CEED_TRANSPOSE;
51437930d1SJeremy L Thompson   CeedInt dim, num_comp;
52e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
53437930d1SJeremy L Thompson   ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
54c532df63SYohann 
55ab213215SJeremy L Thompson   // Read vectors
56c532df63SYohann   const CeedScalar *d_u;
57c532df63SYohann   CeedScalar *d_v;
58437930d1SJeremy L Thompson   if (eval_mode != CEED_EVAL_WEIGHT) {
59e15f9bd0SJeremy L Thompson     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
60c532df63SYohann   }
619c774eddSJeremy L Thompson   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
62c532df63SYohann 
63ab213215SJeremy L Thompson   // Clear v for transpose mode
64437930d1SJeremy L Thompson   if (t_mode == CEED_TRANSPOSE) {
65c532df63SYohann     CeedInt length;
66e15f9bd0SJeremy L Thompson     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
67e15f9bd0SJeremy L Thompson     ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChkBackend(ierr);
68c532df63SYohann   }
69ab213215SJeremy L Thompson 
70ab213215SJeremy L Thompson   // Apply basis operation
71437930d1SJeremy L Thompson   switch (eval_mode) {
72ab213215SJeremy L Thompson   case CEED_EVAL_INTERP: {
73437930d1SJeremy L Thompson     CeedInt P_1d, Q_1d;
74437930d1SJeremy L Thompson     ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
75437930d1SJeremy L Thompson     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
76437930d1SJeremy L Thompson     CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
77437930d1SJeremy L Thompson     ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B);
78e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
79437930d1SJeremy L Thompson     void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B,
80ccf0fe6fSjeremylt                            &d_u, &d_v
81ccf0fe6fSjeremylt                           };
824d537eeaSYohann     if (dim == 1) {
83*e6f67ff7SJed Brown       CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
84*e6f67ff7SJed Brown                                            CeedIntMax(512 / thread_1d,
85*e6f67ff7SJed Brown                                                1)); // avoid >512 total threads
86437930d1SJeremy L Thompson       CeedInt grid = num_elem/elems_per_block +
87437930d1SJeremy L Thompson                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
88437930d1SJeremy L Thompson       CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
89437930d1SJeremy L Thompson       ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1,
90437930d1SJeremy L Thompson                                         elems_per_block, shared_mem,
91437930d1SJeremy L Thompson                                         interp_args); CeedChkBackend(ierr);
92074be161SYohann Dudouit     } else if (dim == 2) {
93437930d1SJeremy L Thompson       const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
94437930d1SJeremy L Thompson       // elems_per_block must be at least 1
95437930d1SJeremy L Thompson       CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] /
96437930d1SJeremy L Thompson                                            num_comp : 1, 1);
97437930d1SJeremy L Thompson       CeedInt grid = num_elem / elems_per_block +
98437930d1SJeremy L Thompson                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
99437930d1SJeremy L Thompson       CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
100437930d1SJeremy L Thompson                              CeedScalar);
101437930d1SJeremy L Thompson       ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
102437930d1SJeremy L Thompson                                         thread_1d,
103437930d1SJeremy L Thompson                                         num_comp*elems_per_block, shared_mem,
104437930d1SJeremy L Thompson                                         interp_args); CeedChkBackend(ierr);
105074be161SYohann Dudouit     } else if (dim == 3) {
106437930d1SJeremy L Thompson       CeedInt elems_per_block = 1;
107437930d1SJeremy L Thompson       CeedInt grid = num_elem / elems_per_block +
108437930d1SJeremy L Thompson                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
109437930d1SJeremy L Thompson       CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
110437930d1SJeremy L Thompson                              CeedScalar);
111437930d1SJeremy L Thompson       ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d,
112437930d1SJeremy L Thompson                                         thread_1d,
113437930d1SJeremy L Thompson                                         num_comp*elems_per_block, shared_mem,
114437930d1SJeremy L Thompson                                         interp_args); CeedChkBackend(ierr);
115074be161SYohann Dudouit     }
116ab213215SJeremy L Thompson   } break;
117ab213215SJeremy L Thompson   case CEED_EVAL_GRAD: {
118437930d1SJeremy L Thompson     CeedInt P_1d, Q_1d;
119437930d1SJeremy L Thompson     ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
120437930d1SJeremy L Thompson     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
121437930d1SJeremy L Thompson     CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
122437930d1SJeremy L Thompson     ierr = CeedCudaInitInterpGrad(data->d_interp_1d, data->d_grad_1d, P_1d,
123437930d1SJeremy L Thompson                                   Q_1d, &data->c_B, &data->c_G);
124e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
125437930d1SJeremy L Thompson     void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->c_B,
126ccf0fe6fSjeremylt                          &data->c_G, &d_u, &d_v
127ccf0fe6fSjeremylt                         };
1284d537eeaSYohann     if (dim == 1) {
129*e6f67ff7SJed Brown       CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2],
130*e6f67ff7SJed Brown                                            CeedIntMax(512 / thread_1d,
131*e6f67ff7SJed Brown                                                1)); // avoid >512 total threads
132437930d1SJeremy L Thompson       CeedInt grid = num_elem / elems_per_block +
133437930d1SJeremy L Thompson                      ((num_elem / elems_per_block*elems_per_block<num_elem) ? 1 : 0 );
134437930d1SJeremy L Thompson       CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
135437930d1SJeremy L Thompson       ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1,
136437930d1SJeremy L Thompson                                         elems_per_block, shared_mem, grad_args);
137e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
138074be161SYohann Dudouit     } else if (dim == 2) {
139437930d1SJeremy L Thompson       const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
140437930d1SJeremy L Thompson       // elems_per_block must be at least 1
141437930d1SJeremy L Thompson       CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] /
142437930d1SJeremy L Thompson                                            num_comp : 1, 1);
143437930d1SJeremy L Thompson       CeedInt grid = num_elem / elems_per_block +
144437930d1SJeremy L Thompson                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
145437930d1SJeremy L Thompson       CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
146437930d1SJeremy L Thompson                              CeedScalar);
147437930d1SJeremy L Thompson       ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d,
148437930d1SJeremy L Thompson                                         num_comp*elems_per_block, shared_mem,
149437930d1SJeremy L Thompson                                         grad_args); CeedChkBackend(ierr);
150074be161SYohann Dudouit     } else if (dim == 3) {
151437930d1SJeremy L Thompson       CeedInt elems_per_block = 1;
152437930d1SJeremy L Thompson       CeedInt grid = num_elem / elems_per_block +
153437930d1SJeremy L Thompson                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
154437930d1SJeremy L Thompson       CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof(
155437930d1SJeremy L Thompson                              CeedScalar);
156437930d1SJeremy L Thompson       ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d,
157437930d1SJeremy L Thompson                                         num_comp*elems_per_block, shared_mem,
158437930d1SJeremy L Thompson                                         grad_args); CeedChkBackend(ierr);
159074be161SYohann Dudouit     }
160ab213215SJeremy L Thompson   } break;
161ab213215SJeremy L Thompson   case CEED_EVAL_WEIGHT: {
162437930d1SJeremy L Thompson     CeedInt Q_1d;
163437930d1SJeremy L Thompson     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
164437930d1SJeremy L Thompson     void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v};
165074be161SYohann Dudouit     if (dim == 1) {
166437930d1SJeremy L Thompson       const CeedInt elems_per_block = 32 / Q_1d;
167437930d1SJeremy L Thompson       const CeedInt gridsize = num_elem / elems_per_block +
168437930d1SJeremy L Thompson                                ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
169437930d1SJeremy L Thompson       ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d,
170437930d1SJeremy L Thompson                                   elems_per_block, 1, weight_args);
171e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
172074be161SYohann Dudouit     } else if (dim == 2) {
173437930d1SJeremy L Thompson       const CeedInt opt_elems = 32 / (Q_1d * Q_1d);
174437930d1SJeremy L Thompson       const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
175437930d1SJeremy L Thompson       const CeedInt gridsize = num_elem / elems_per_block +
176437930d1SJeremy L Thompson                                ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
177437930d1SJeremy L Thompson       ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d,
178437930d1SJeremy L Thompson                                   elems_per_block, weight_args);
179e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
180074be161SYohann Dudouit     } else if (dim == 3) {
181437930d1SJeremy L Thompson       const CeedInt gridsize = num_elem;
182437930d1SJeremy L Thompson       ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, Q_1d,
183437930d1SJeremy L Thompson                                   weight_args);
184e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
185074be161SYohann Dudouit     }
186ab213215SJeremy L Thompson   } break;
187ab213215SJeremy L Thompson   // LCOV_EXCL_START
188ab213215SJeremy L Thompson   // Evaluate the divergence to/from the quadrature points
189ab213215SJeremy L Thompson   case CEED_EVAL_DIV:
190e15f9bd0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
191ab213215SJeremy L Thompson   // Evaluate the curl to/from the quadrature points
192ab213215SJeremy L Thompson   case CEED_EVAL_CURL:
193e15f9bd0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
194ab213215SJeremy L Thompson   // Take no action, BasisApply should not have been called
195ab213215SJeremy L Thompson   case CEED_EVAL_NONE:
196e15f9bd0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
197ab213215SJeremy L Thompson                      "CEED_EVAL_NONE does not make sense in this context");
198ab213215SJeremy L Thompson     // LCOV_EXCL_STOP
199c532df63SYohann   }
200c532df63SYohann 
201ab213215SJeremy L Thompson   // Restore vectors
202437930d1SJeremy L Thompson   if (eval_mode != CEED_EVAL_WEIGHT) {
203e15f9bd0SJeremy L Thompson     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
204c532df63SYohann   }
205e15f9bd0SJeremy L Thompson   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
206e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
207c532df63SYohann }
208c532df63SYohann 
209ab213215SJeremy L Thompson //------------------------------------------------------------------------------
210ab213215SJeremy L Thompson // Destroy basis
211ab213215SJeremy L Thompson //------------------------------------------------------------------------------
212c532df63SYohann static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
213c532df63SYohann   int ierr;
214c532df63SYohann   Ceed ceed;
215e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
216c532df63SYohann 
217c532df63SYohann   CeedBasis_Cuda_shared *data;
218e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
219c532df63SYohann 
220c532df63SYohann   CeedChk_Cu(ceed, cuModuleUnload(data->module));
221c532df63SYohann 
222437930d1SJeremy L Thompson   ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr);
223437930d1SJeremy L Thompson   ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr);
224437930d1SJeremy L Thompson   ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr);
225437930d1SJeremy L Thompson   ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr);
226c532df63SYohann 
227e15f9bd0SJeremy L Thompson   ierr = CeedFree(&data); CeedChkBackend(ierr);
228c532df63SYohann 
229e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
230c532df63SYohann }
231c532df63SYohann 
232ab213215SJeremy L Thompson //------------------------------------------------------------------------------
233ab213215SJeremy L Thompson // Create tensor basis
234ab213215SJeremy L Thompson //------------------------------------------------------------------------------
235437930d1SJeremy L Thompson int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
236437930d1SJeremy L Thompson                                         const CeedScalar *interp_1d,
237437930d1SJeremy L Thompson                                         const CeedScalar *grad_1d,
238437930d1SJeremy L Thompson                                         const CeedScalar *q_ref_1d,
239437930d1SJeremy L Thompson                                         const CeedScalar *q_weight_1d,
240c532df63SYohann                                         CeedBasis basis) {
241c532df63SYohann   int ierr;
242c532df63SYohann   Ceed ceed;
243e15f9bd0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
244c532df63SYohann   CeedBasis_Cuda_shared *data;
245e15f9bd0SJeremy L Thompson   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
246c532df63SYohann 
247ab213215SJeremy L Thompson   // Copy basis data to GPU
248437930d1SJeremy L Thompson   const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
249437930d1SJeremy L Thompson   ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes);
250437930d1SJeremy L Thompson   CeedChk_Cu(ceed, ierr);
251437930d1SJeremy L Thompson   ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes,
252c532df63SYohann                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
253c532df63SYohann 
254437930d1SJeremy L Thompson   const CeedInt interp_bytes = q_bytes * P_1d;
255437930d1SJeremy L Thompson   ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes);
256437930d1SJeremy L Thompson   CeedChk_Cu(ceed, ierr);
257437930d1SJeremy L Thompson   ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes,
258c532df63SYohann                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
259c532df63SYohann 
260437930d1SJeremy L Thompson   ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes);
261437930d1SJeremy L Thompson   CeedChk_Cu(ceed, ierr);
262437930d1SJeremy L Thompson   ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes,
263c532df63SYohann                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
264c532df63SYohann 
265ab213215SJeremy L Thompson   // Compute collocated gradient and copy to GPU
266437930d1SJeremy L Thompson   data->d_collo_grad_1d = NULL;
267437930d1SJeremy L Thompson   if (dim == 3 && Q_1d >= P_1d) {
268437930d1SJeremy L Thompson     CeedScalar *collo_grad_1d;
269437930d1SJeremy L Thompson     ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr);
270437930d1SJeremy L Thompson     ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr);
271437930d1SJeremy L Thompson     ierr = cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d);
272ac421f39SYohann     CeedChk_Cu(ceed, ierr);
273437930d1SJeremy L Thompson     ierr = cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d,
274ac421f39SYohann                       cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
275437930d1SJeremy L Thompson     ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr);
276ac421f39SYohann   }
277ac421f39SYohann 
278ab213215SJeremy L Thompson   // Compile basis kernels
279437930d1SJeremy L Thompson   CeedInt num_comp;
280437930d1SJeremy L Thompson   ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
281437930d1SJeremy L Thompson   char *basis_kernel_path, *basis_kernel_source;
282437930d1SJeremy L Thompson   ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/cuda-shared-basis.h",
283437930d1SJeremy L Thompson                              &basis_kernel_path); CeedChkBackend(ierr);
28446dc0734SJeremy L Thompson   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
285437930d1SJeremy L Thompson   ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
286437930d1SJeremy L Thompson   CeedChkBackend(ierr);
28746dc0734SJeremy L Thompson   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n");
288437930d1SJeremy L Thompson   ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8,
289d7d111ecSJeremy L Thompson                          "BASIS_Q_1D", Q_1d,
290d7d111ecSJeremy L Thompson                          "BASIS_P_1D", P_1d,
291d7d111ecSJeremy L Thompson                          "BASIS_T_1D", CeedIntMax(Q_1d, P_1d),
292437930d1SJeremy L Thompson                          "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ?
293437930d1SJeremy L Thompson                              Q_1d : P_1d, dim),
294c532df63SYohann                          "BASIS_DIM", dim,
295d7d111ecSJeremy L Thompson                          "BASIS_NUM_COMP", num_comp,
296d7d111ecSJeremy L Thompson                          "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
297d7d111ecSJeremy L Thompson                          "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)
298e15f9bd0SJeremy L Thompson                         ); CeedChkBackend(ierr);
299437930d1SJeremy L Thompson   ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp);
300e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
301437930d1SJeremy L Thompson   ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad);
302e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
303437930d1SJeremy L Thompson   ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight);
304e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
305437930d1SJeremy L Thompson   ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
306437930d1SJeremy L Thompson   ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);
307c532df63SYohann 
308e15f9bd0SJeremy L Thompson   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
309ab213215SJeremy L Thompson 
310ab213215SJeremy L Thompson   // Register backend functions
311c532df63SYohann   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
312c532df63SYohann                                 CeedBasisApplyTensor_Cuda_shared);
313e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
314c532df63SYohann   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
315e15f9bd0SJeremy L Thompson                                 CeedBasisDestroy_Cuda_shared); CeedChkBackend(ierr);
316e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
317c532df63SYohann }
318ab213215SJeremy L Thompson //------------------------------------------------------------------------------
319