xref: /libCEED/rust/libceed-sys/c-src/backends/cuda-ref/ceed-cuda-ref-basis.c (revision 34d146140c2fce42d8bc042f039d47d4ff020864)
15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
30d0321e0SJeremy L Thompson //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
50d0321e0SJeremy L Thompson //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
70d0321e0SJeremy L Thompson 
849aac155SJeremy L Thompson #include <ceed.h>
90d0321e0SJeremy L Thompson #include <ceed/backend.h>
10437930d1SJeremy L Thompson #include <ceed/jit-tools.h>
110d0321e0SJeremy L Thompson #include <cuda.h>
120d0321e0SJeremy L Thompson #include <cuda_runtime.h>
132b730f8bSJeremy L Thompson 
1449aac155SJeremy L Thompson #include "../cuda/ceed-cuda-common.h"
150d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h"
162b730f8bSJeremy L Thompson #include "ceed-cuda-ref.h"
170d0321e0SJeremy L Thompson 
180d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
190d0321e0SJeremy L Thompson // Basis apply - tensor
200d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
212b730f8bSJeremy L Thompson int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
220d0321e0SJeremy L Thompson   Ceed              ceed;
23ca735530SJeremy L Thompson   CeedInt           Q_1d, dim;
247bbbfca3SJeremy L Thompson   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
25437930d1SJeremy L Thompson   const int         max_block_size = 32;
260d0321e0SJeremy L Thompson   const CeedScalar *d_u;
270d0321e0SJeremy L Thompson   CeedScalar       *d_v;
28ca735530SJeremy L Thompson   CeedBasis_Cuda   *data;
29ca735530SJeremy L Thompson 
30ca735530SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
31ca735530SJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &data));
32ca735530SJeremy L Thompson 
339ea2cfd9SJeremy L Thompson   // Get read/write access to u, v
346574a04fSJeremy L Thompson   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
356574a04fSJeremy L Thompson   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
362b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
370d0321e0SJeremy L Thompson 
380d0321e0SJeremy L Thompson   // Clear v for transpose operation
397bbbfca3SJeremy L Thompson   if (is_transpose) {
401f9221feSJeremy L Thompson     CeedSize length;
41ca735530SJeremy L Thompson 
422b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(v, &length));
432b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
440d0321e0SJeremy L Thompson   }
452b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
462b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
470d0321e0SJeremy L Thompson 
480d0321e0SJeremy L Thompson   // Basis action
49437930d1SJeremy L Thompson   switch (eval_mode) {
500d0321e0SJeremy L Thompson     case CEED_EVAL_INTERP: {
517bbbfca3SJeremy L Thompson       void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v};
52b2165e7aSSebastian Grimberg       const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
530d0321e0SJeremy L Thompson 
54eb7e6cafSJeremy L Thompson       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args));
550d0321e0SJeremy L Thompson     } break;
560d0321e0SJeremy L Thompson     case CEED_EVAL_GRAD: {
577bbbfca3SJeremy L Thompson       void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
58b2165e7aSSebastian Grimberg       const CeedInt block_size  = max_block_size;
590d0321e0SJeremy L Thompson 
60eb7e6cafSJeremy L Thompson       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args));
610d0321e0SJeremy L Thompson     } break;
620d0321e0SJeremy L Thompson     case CEED_EVAL_WEIGHT: {
63097cc795SJames Wright       CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
64437930d1SJeremy L Thompson       void     *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
65b2165e7aSSebastian Grimberg       const int block_size_x  = Q_1d;
66b2165e7aSSebastian Grimberg       const int block_size_y  = dim >= 2 ? Q_1d : 1;
67b2165e7aSSebastian Grimberg 
68b2165e7aSSebastian Grimberg       CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args));
690d0321e0SJeremy L Thompson     } break;
709ea2cfd9SJeremy L Thompson     case CEED_EVAL_NONE: /* handled separately below */
719ea2cfd9SJeremy L Thompson       break;
720d0321e0SJeremy L Thompson     // LCOV_EXCL_START
730d0321e0SJeremy L Thompson     case CEED_EVAL_DIV:
740d0321e0SJeremy L Thompson     case CEED_EVAL_CURL:
75bcbe1c99SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
760d0321e0SJeremy L Thompson       // LCOV_EXCL_STOP
770d0321e0SJeremy L Thompson   }
780d0321e0SJeremy L Thompson 
799ea2cfd9SJeremy L Thompson   // Restore vectors, cover CEED_EVAL_NONE
802b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
819ea2cfd9SJeremy L Thompson   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
829ea2cfd9SJeremy L Thompson   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
830d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
840d0321e0SJeremy L Thompson }
850d0321e0SJeremy L Thompson 
860d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
87*34d14614SJeremy L Thompson // Basis apply - tensor AtPoints
88*34d14614SJeremy L Thompson //------------------------------------------------------------------------------
89*34d14614SJeremy L Thompson int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
90*34d14614SJeremy L Thompson                                 CeedVector x_ref, CeedVector u, CeedVector v) {
91*34d14614SJeremy L Thompson   Ceed              ceed;
92*34d14614SJeremy L Thompson   CeedInt           Q_1d, dim, max_num_points = num_points[0];
93*34d14614SJeremy L Thompson   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
94*34d14614SJeremy L Thompson   const int         max_block_size = 32;
95*34d14614SJeremy L Thompson   const CeedScalar *d_x, *d_u;
96*34d14614SJeremy L Thompson   CeedScalar       *d_v;
97*34d14614SJeremy L Thompson   CeedBasis_Cuda   *data;
98*34d14614SJeremy L Thompson 
99*34d14614SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
100*34d14614SJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &data));
101*34d14614SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
102*34d14614SJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
103*34d14614SJeremy L Thompson 
104*34d14614SJeremy L Thompson   // Check uniform number of points per elem
105*34d14614SJeremy L Thompson   for (CeedInt i = 1; i < num_elem; i++) {
106*34d14614SJeremy L Thompson     CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
107*34d14614SJeremy L Thompson               "BasisApplyAtPoints only supported for the same number of points in each element");
108*34d14614SJeremy L Thompson   }
109*34d14614SJeremy L Thompson 
110*34d14614SJeremy L Thompson   // Weight handled separately
111*34d14614SJeremy L Thompson   if (eval_mode == CEED_EVAL_WEIGHT) {
112*34d14614SJeremy L Thompson     CeedCall(CeedVectorSetValue(v, 1.0));
113*34d14614SJeremy L Thompson     return CEED_ERROR_SUCCESS;
114*34d14614SJeremy L Thompson   }
115*34d14614SJeremy L Thompson 
116*34d14614SJeremy L Thompson   // Build kernels if needed
117*34d14614SJeremy L Thompson   if (data->num_points != max_num_points) {
118*34d14614SJeremy L Thompson     CeedInt P_1d;
119*34d14614SJeremy L Thompson 
120*34d14614SJeremy L Thompson     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
121*34d14614SJeremy L Thompson     data->num_points = max_num_points;
122*34d14614SJeremy L Thompson 
123*34d14614SJeremy L Thompson     // -- Create interp matrix to Chebyshev coefficients
124*34d14614SJeremy L Thompson     if (!data->d_chebyshev_interp_1d) {
125*34d14614SJeremy L Thompson       CeedSize    interp_bytes;
126*34d14614SJeremy L Thompson       CeedScalar *chebyshev_interp_1d;
127*34d14614SJeremy L Thompson 
128*34d14614SJeremy L Thompson       interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
129*34d14614SJeremy L Thompson       CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
130*34d14614SJeremy L Thompson       CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
131*34d14614SJeremy L Thompson       CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
132*34d14614SJeremy L Thompson       CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
133*34d14614SJeremy L Thompson       CeedCallBackend(CeedFree(&chebyshev_interp_1d));
134*34d14614SJeremy L Thompson     }
135*34d14614SJeremy L Thompson 
136*34d14614SJeremy L Thompson     // -- Compile kernels
137*34d14614SJeremy L Thompson     char       *basis_kernel_source;
138*34d14614SJeremy L Thompson     const char *basis_kernel_path;
139*34d14614SJeremy L Thompson     CeedInt     num_comp;
140*34d14614SJeremy L Thompson 
141*34d14614SJeremy L Thompson     if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
142*34d14614SJeremy L Thompson     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
143*34d14614SJeremy L Thompson     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path));
144*34d14614SJeremy L Thompson     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
145*34d14614SJeremy L Thompson     CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
146*34d14614SJeremy L Thompson     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
147*34d14614SJeremy L Thompson     CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
148*34d14614SJeremy L Thompson                                      num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
149*34d14614SJeremy L Thompson                                      "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
150*34d14614SJeremy L Thompson                                      max_num_points, "POINTS_BUFF_LEN",
151*34d14614SJeremy L Thompson                                      max_num_points * CeedIntPow(Q_1d > max_num_points ? Q_1d : max_num_points, dim - 1)));
152*34d14614SJeremy L Thompson     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
153*34d14614SJeremy L Thompson     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
154*34d14614SJeremy L Thompson     CeedCallBackend(CeedFree(&basis_kernel_path));
155*34d14614SJeremy L Thompson     CeedCallBackend(CeedFree(&basis_kernel_source));
156*34d14614SJeremy L Thompson   }
157*34d14614SJeremy L Thompson 
158*34d14614SJeremy L Thompson   // Get read/write access to u, v
159*34d14614SJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
160*34d14614SJeremy L Thompson   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
161*34d14614SJeremy L Thompson   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
162*34d14614SJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
163*34d14614SJeremy L Thompson 
164*34d14614SJeremy L Thompson   // Clear v for transpose operation
165*34d14614SJeremy L Thompson   if (is_transpose) {
166*34d14614SJeremy L Thompson     CeedSize length;
167*34d14614SJeremy L Thompson 
168*34d14614SJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(v, &length));
169*34d14614SJeremy L Thompson     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
170*34d14614SJeremy L Thompson   }
171*34d14614SJeremy L Thompson 
172*34d14614SJeremy L Thompson   // Basis action
173*34d14614SJeremy L Thompson   switch (eval_mode) {
174*34d14614SJeremy L Thompson     case CEED_EVAL_INTERP: {
175*34d14614SJeremy L Thompson       void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
176*34d14614SJeremy L Thompson       const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
177*34d14614SJeremy L Thompson 
178*34d14614SJeremy L Thompson       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
179*34d14614SJeremy L Thompson     } break;
180*34d14614SJeremy L Thompson     case CEED_EVAL_GRAD: {
181*34d14614SJeremy L Thompson       void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
182*34d14614SJeremy L Thompson       const CeedInt block_size  = max_block_size;
183*34d14614SJeremy L Thompson 
184*34d14614SJeremy L Thompson       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
185*34d14614SJeremy L Thompson     } break;
186*34d14614SJeremy L Thompson     case CEED_EVAL_WEIGHT:
187*34d14614SJeremy L Thompson     case CEED_EVAL_NONE: /* handled separately below */
188*34d14614SJeremy L Thompson       break;
189*34d14614SJeremy L Thompson     // LCOV_EXCL_START
190*34d14614SJeremy L Thompson     case CEED_EVAL_DIV:
191*34d14614SJeremy L Thompson     case CEED_EVAL_CURL:
192*34d14614SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
193*34d14614SJeremy L Thompson       // LCOV_EXCL_STOP
194*34d14614SJeremy L Thompson   }
195*34d14614SJeremy L Thompson 
196*34d14614SJeremy L Thompson   // Restore vectors, cover CEED_EVAL_NONE
197*34d14614SJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
198*34d14614SJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
199*34d14614SJeremy L Thompson   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
200*34d14614SJeremy L Thompson   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
201*34d14614SJeremy L Thompson   return CEED_ERROR_SUCCESS;
202*34d14614SJeremy L Thompson }
203*34d14614SJeremy L Thompson 
204*34d14614SJeremy L Thompson //------------------------------------------------------------------------------
2050d0321e0SJeremy L Thompson // Basis apply - non-tensor
2060d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
2072b730f8bSJeremy L Thompson int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
2082b730f8bSJeremy L Thompson                                  CeedVector v) {
2090d0321e0SJeremy L Thompson   Ceed                     ceed;
210437930d1SJeremy L Thompson   CeedInt                  num_nodes, num_qpts;
2117bbbfca3SJeremy L Thompson   const CeedInt            is_transpose    = t_mode == CEED_TRANSPOSE;
212d075f50bSSebastian Grimberg   const int                elems_per_block = 1;
213d075f50bSSebastian Grimberg   const int                grid            = CeedDivUpInt(num_elem, elems_per_block);
2140d0321e0SJeremy L Thompson   const CeedScalar        *d_u;
2150d0321e0SJeremy L Thompson   CeedScalar              *d_v;
216ca735530SJeremy L Thompson   CeedBasisNonTensor_Cuda *data;
217ca735530SJeremy L Thompson 
218ca735530SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
219ca735530SJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &data));
220ca735530SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
221ca735530SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
222ca735530SJeremy L Thompson 
2239ea2cfd9SJeremy L Thompson   // Get read/write access to u, v
2249ea2cfd9SJeremy L Thompson   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
2259ea2cfd9SJeremy L Thompson   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
2262b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
2270d0321e0SJeremy L Thompson 
2280d0321e0SJeremy L Thompson   // Clear v for transpose operation
2297bbbfca3SJeremy L Thompson   if (is_transpose) {
2301f9221feSJeremy L Thompson     CeedSize length;
231ca735530SJeremy L Thompson 
2322b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(v, &length));
2332b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
2340d0321e0SJeremy L Thompson   }
2350d0321e0SJeremy L Thompson 
2360d0321e0SJeremy L Thompson   // Apply basis operation
237437930d1SJeremy L Thompson   switch (eval_mode) {
2380d0321e0SJeremy L Thompson     case CEED_EVAL_INTERP: {
239d075f50bSSebastian Grimberg       void     *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v};
2407bbbfca3SJeremy L Thompson       const int block_size_x  = is_transpose ? num_nodes : num_qpts;
241b2165e7aSSebastian Grimberg 
2427bbbfca3SJeremy L Thompson       if (is_transpose) {
243d075f50bSSebastian Grimberg         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args));
244d075f50bSSebastian Grimberg       } else {
245b2165e7aSSebastian Grimberg         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args));
246d075f50bSSebastian Grimberg       }
2470d0321e0SJeremy L Thompson     } break;
2480d0321e0SJeremy L Thompson     case CEED_EVAL_GRAD: {
249d075f50bSSebastian Grimberg       void     *grad_args[]  = {(void *)&num_elem, &data->d_grad, &d_u, &d_v};
2507bbbfca3SJeremy L Thompson       const int block_size_x = is_transpose ? num_nodes : num_qpts;
251b2165e7aSSebastian Grimberg 
2527bbbfca3SJeremy L Thompson       if (is_transpose) {
253d075f50bSSebastian Grimberg         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args));
254d075f50bSSebastian Grimberg       } else {
255d075f50bSSebastian Grimberg         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args));
256d075f50bSSebastian Grimberg       }
257d075f50bSSebastian Grimberg     } break;
258d075f50bSSebastian Grimberg     case CEED_EVAL_DIV: {
259d075f50bSSebastian Grimberg       void     *div_args[]   = {(void *)&num_elem, &data->d_div, &d_u, &d_v};
2607bbbfca3SJeremy L Thompson       const int block_size_x = is_transpose ? num_nodes : num_qpts;
261d075f50bSSebastian Grimberg 
2627bbbfca3SJeremy L Thompson       if (is_transpose) {
263d075f50bSSebastian Grimberg         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args));
264d075f50bSSebastian Grimberg       } else {
265d075f50bSSebastian Grimberg         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args));
266d075f50bSSebastian Grimberg       }
267d075f50bSSebastian Grimberg     } break;
268d075f50bSSebastian Grimberg     case CEED_EVAL_CURL: {
269d075f50bSSebastian Grimberg       void     *curl_args[]  = {(void *)&num_elem, &data->d_curl, &d_u, &d_v};
2707bbbfca3SJeremy L Thompson       const int block_size_x = is_transpose ? num_nodes : num_qpts;
271d075f50bSSebastian Grimberg 
2727bbbfca3SJeremy L Thompson       if (is_transpose) {
273d075f50bSSebastian Grimberg         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args));
274d075f50bSSebastian Grimberg       } else {
275d075f50bSSebastian Grimberg         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args));
276d075f50bSSebastian Grimberg       }
2770d0321e0SJeremy L Thompson     } break;
2780d0321e0SJeremy L Thompson     case CEED_EVAL_WEIGHT: {
279097cc795SJames Wright       CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
280437930d1SJeremy L Thompson       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v};
281b2165e7aSSebastian Grimberg 
282eb7e6cafSJeremy L Thompson       CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args));
2830d0321e0SJeremy L Thompson     } break;
2849ea2cfd9SJeremy L Thompson     case CEED_EVAL_NONE: /* handled separately below */
2859ea2cfd9SJeremy L Thompson       break;
2860d0321e0SJeremy L Thompson   }
2870d0321e0SJeremy L Thompson 
2889ea2cfd9SJeremy L Thompson   // Restore vectors, cover CEED_EVAL_NONE
2892b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
2909ea2cfd9SJeremy L Thompson   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
2919ea2cfd9SJeremy L Thompson   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
2920d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2930d0321e0SJeremy L Thompson }
2940d0321e0SJeremy L Thompson 
2950d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
2960d0321e0SJeremy L Thompson // Destroy tensor basis
2970d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
2980d0321e0SJeremy L Thompson static int CeedBasisDestroy_Cuda(CeedBasis basis) {
2990d0321e0SJeremy L Thompson   Ceed            ceed;
3000d0321e0SJeremy L Thompson   CeedBasis_Cuda *data;
301ca735530SJeremy L Thompson 
302ca735530SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
3032b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &data));
3042b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuModuleUnload(data->module));
305*34d14614SJeremy L Thompson   if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
306097cc795SJames Wright   if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
3072b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
3082b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
309*34d14614SJeremy L Thompson   CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
3102b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&data));
3110d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
3120d0321e0SJeremy L Thompson }
3130d0321e0SJeremy L Thompson 
3140d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
3150d0321e0SJeremy L Thompson // Destroy non-tensor basis
3160d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
3170d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
3180d0321e0SJeremy L Thompson   Ceed                     ceed;
3190d0321e0SJeremy L Thompson   CeedBasisNonTensor_Cuda *data;
320ca735530SJeremy L Thompson 
321ca735530SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
3222b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &data));
3232b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuModuleUnload(data->module));
324097cc795SJames Wright   if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight));
3252b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cudaFree(data->d_interp));
3262b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cudaFree(data->d_grad));
327d075f50bSSebastian Grimberg   CeedCallCuda(ceed, cudaFree(data->d_div));
328d075f50bSSebastian Grimberg   CeedCallCuda(ceed, cudaFree(data->d_curl));
3292b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&data));
3300d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
3310d0321e0SJeremy L Thompson }
3320d0321e0SJeremy L Thompson 
3330d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
3340d0321e0SJeremy L Thompson // Create tensor
3350d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
3362b730f8bSJeremy L Thompson int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
3372b730f8bSJeremy L Thompson                                  const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
3380d0321e0SJeremy L Thompson   Ceed            ceed;
33922070f95SJeremy L Thompson   char           *basis_kernel_source;
34022070f95SJeremy L Thompson   const char     *basis_kernel_path;
341ca735530SJeremy L Thompson   CeedInt         num_comp;
342ca735530SJeremy L Thompson   const CeedInt   q_bytes      = Q_1d * sizeof(CeedScalar);
343ca735530SJeremy L Thompson   const CeedInt   interp_bytes = q_bytes * P_1d;
3440d0321e0SJeremy L Thompson   CeedBasis_Cuda *data;
345ca735530SJeremy L Thompson 
346ca735530SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
3472b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &data));
3480d0321e0SJeremy L Thompson 
3490d0321e0SJeremy L Thompson   // Copy data to GPU
350097cc795SJames Wright   if (q_weight_1d) {
3512b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
3522b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
353097cc795SJames Wright   }
3542b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes));
3552b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice));
3562b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes));
3572b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice));
3580d0321e0SJeremy L Thompson 
359ecc88aebSJeremy L Thompson   // Compile basis kernels
3602b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
3612b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path));
36223d4529eSJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
3632b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
36423d4529eSJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
365eb7e6cafSJeremy L Thompson   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
3662b730f8bSJeremy L Thompson                                    num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
3672b730f8bSJeremy L Thompson                                    "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
368eb7e6cafSJeremy L Thompson   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
369eb7e6cafSJeremy L Thompson   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
370eb7e6cafSJeremy L Thompson   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
3712b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&basis_kernel_path));
3722b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&basis_kernel_source));
373437930d1SJeremy L Thompson 
3742b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisSetData(basis, data));
3750d0321e0SJeremy L Thompson 
376d075f50bSSebastian Grimberg   // Register backend functions
3772b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda));
378*34d14614SJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda));
3792b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda));
3800d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
3810d0321e0SJeremy L Thompson }
3820d0321e0SJeremy L Thompson 
3830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
384d075f50bSSebastian Grimberg // Create non-tensor H^1
3850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
3862b730f8bSJeremy L Thompson int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
38751475c7cSJeremy L Thompson                            const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
3880d0321e0SJeremy L Thompson   Ceed                     ceed;
38922070f95SJeremy L Thompson   char                    *basis_kernel_source;
39022070f95SJeremy L Thompson   const char              *basis_kernel_path;
391d075f50bSSebastian Grimberg   CeedInt                  num_comp, q_comp_interp, q_comp_grad;
392ca735530SJeremy L Thompson   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
3930d0321e0SJeremy L Thompson   CeedBasisNonTensor_Cuda *data;
394ca735530SJeremy L Thompson 
395ca735530SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
3962b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &data));
3970d0321e0SJeremy L Thompson 
3980d0321e0SJeremy L Thompson   // Copy basis data to GPU
399d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
400d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
401097cc795SJames Wright   if (q_weight) {
4022b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
4032b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
404097cc795SJames Wright   }
405d075f50bSSebastian Grimberg   if (interp) {
406d075f50bSSebastian Grimberg     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
407d075f50bSSebastian Grimberg 
4082b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
4092b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
410d075f50bSSebastian Grimberg   }
411d075f50bSSebastian Grimberg   if (grad) {
412d075f50bSSebastian Grimberg     const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;
413d075f50bSSebastian Grimberg 
4142b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes));
4152b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice));
416d075f50bSSebastian Grimberg   }
4170d0321e0SJeremy L Thompson 
4180d0321e0SJeremy L Thompson   // Compile basis kernels
4192b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
4202b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
42123d4529eSJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
4222b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
42323d4529eSJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
424d075f50bSSebastian Grimberg   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
425d075f50bSSebastian Grimberg                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
426d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
427d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
428d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
429d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
430d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
431d075f50bSSebastian Grimberg   CeedCallBackend(CeedFree(&basis_kernel_path));
432d075f50bSSebastian Grimberg   CeedCallBackend(CeedFree(&basis_kernel_source));
433d075f50bSSebastian Grimberg 
434d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisSetData(basis, data));
435d075f50bSSebastian Grimberg 
436d075f50bSSebastian Grimberg   // Register backend functions
437d075f50bSSebastian Grimberg   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
438d075f50bSSebastian Grimberg   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
439d075f50bSSebastian Grimberg   return CEED_ERROR_SUCCESS;
440d075f50bSSebastian Grimberg }
441d075f50bSSebastian Grimberg 
442d075f50bSSebastian Grimberg //------------------------------------------------------------------------------
443d075f50bSSebastian Grimberg // Create non-tensor H(div)
444d075f50bSSebastian Grimberg //------------------------------------------------------------------------------
445d075f50bSSebastian Grimberg int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
446d075f50bSSebastian Grimberg                              const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
447d075f50bSSebastian Grimberg   Ceed                     ceed;
44822070f95SJeremy L Thompson   char                    *basis_kernel_source;
44922070f95SJeremy L Thompson   const char              *basis_kernel_path;
450d075f50bSSebastian Grimberg   CeedInt                  num_comp, q_comp_interp, q_comp_div;
451d075f50bSSebastian Grimberg   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
452d075f50bSSebastian Grimberg   CeedBasisNonTensor_Cuda *data;
453d075f50bSSebastian Grimberg 
454d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
455d075f50bSSebastian Grimberg   CeedCallBackend(CeedCalloc(1, &data));
456d075f50bSSebastian Grimberg 
457d075f50bSSebastian Grimberg   // Copy basis data to GPU
458d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
459d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
460097cc795SJames Wright   if (q_weight) {
461d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
462d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
463097cc795SJames Wright   }
464d075f50bSSebastian Grimberg   if (interp) {
465d075f50bSSebastian Grimberg     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
466d075f50bSSebastian Grimberg 
467d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
468d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
469d075f50bSSebastian Grimberg   }
470d075f50bSSebastian Grimberg   if (div) {
471d075f50bSSebastian Grimberg     const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div;
472d075f50bSSebastian Grimberg 
473d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes));
474d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice));
475d075f50bSSebastian Grimberg   }
476d075f50bSSebastian Grimberg 
477d075f50bSSebastian Grimberg   // Compile basis kernels
478d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
479d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
480d075f50bSSebastian Grimberg   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
481d075f50bSSebastian Grimberg   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
482d075f50bSSebastian Grimberg   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
483d075f50bSSebastian Grimberg   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
484d075f50bSSebastian Grimberg                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
485d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
486d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
487d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
488d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
489d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
490d075f50bSSebastian Grimberg   CeedCallBackend(CeedFree(&basis_kernel_path));
491d075f50bSSebastian Grimberg   CeedCallBackend(CeedFree(&basis_kernel_source));
492d075f50bSSebastian Grimberg 
493d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisSetData(basis, data));
494d075f50bSSebastian Grimberg 
495d075f50bSSebastian Grimberg   // Register backend functions
496d075f50bSSebastian Grimberg   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
497d075f50bSSebastian Grimberg   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
498d075f50bSSebastian Grimberg   return CEED_ERROR_SUCCESS;
499d075f50bSSebastian Grimberg }
500d075f50bSSebastian Grimberg 
501d075f50bSSebastian Grimberg //------------------------------------------------------------------------------
502d075f50bSSebastian Grimberg // Create non-tensor H(curl)
503d075f50bSSebastian Grimberg //------------------------------------------------------------------------------
504d075f50bSSebastian Grimberg int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
505d075f50bSSebastian Grimberg                               const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
506d075f50bSSebastian Grimberg   Ceed                     ceed;
50722070f95SJeremy L Thompson   char                    *basis_kernel_source;
50822070f95SJeremy L Thompson   const char              *basis_kernel_path;
509d075f50bSSebastian Grimberg   CeedInt                  num_comp, q_comp_interp, q_comp_curl;
510d075f50bSSebastian Grimberg   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
511d075f50bSSebastian Grimberg   CeedBasisNonTensor_Cuda *data;
512d075f50bSSebastian Grimberg 
513d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
514d075f50bSSebastian Grimberg   CeedCallBackend(CeedCalloc(1, &data));
515d075f50bSSebastian Grimberg 
516d075f50bSSebastian Grimberg   // Copy basis data to GPU
517d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
518d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
519097cc795SJames Wright   if (q_weight) {
520d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
521d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
522097cc795SJames Wright   }
523d075f50bSSebastian Grimberg   if (interp) {
524d075f50bSSebastian Grimberg     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
525d075f50bSSebastian Grimberg 
526d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
527d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
528d075f50bSSebastian Grimberg   }
529d075f50bSSebastian Grimberg   if (curl) {
530d075f50bSSebastian Grimberg     const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl;
531d075f50bSSebastian Grimberg 
532d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes));
533d075f50bSSebastian Grimberg     CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice));
534d075f50bSSebastian Grimberg   }
535d075f50bSSebastian Grimberg 
536d075f50bSSebastian Grimberg   // Compile basis kernels
537d075f50bSSebastian Grimberg   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
538d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
539d075f50bSSebastian Grimberg   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
540d075f50bSSebastian Grimberg   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
541d075f50bSSebastian Grimberg   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
542d075f50bSSebastian Grimberg   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
543d075f50bSSebastian Grimberg                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
544d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
545d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
546d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
547d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
548d075f50bSSebastian Grimberg   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
5492b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&basis_kernel_path));
5502b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&basis_kernel_source));
5510d0321e0SJeremy L Thompson 
5522b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisSetData(basis, data));
5530d0321e0SJeremy L Thompson 
5540d0321e0SJeremy L Thompson   // Register backend functions
5552b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
5562b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
5570d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
5580d0321e0SJeremy L Thompson }
5592a86cc9dSSebastian Grimberg 
5600d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
561