xref: /libCEED/rust/libceed-sys/c-src/backends/cuda-ref/ceed-cuda-ref-basis.c (revision 34d146140c2fce42d8bc042f039d47d4ff020864)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <cuda.h>
12 #include <cuda_runtime.h>
13 
14 #include "../cuda/ceed-cuda-common.h"
15 #include "../cuda/ceed-cuda-compile.h"
16 #include "ceed-cuda-ref.h"
17 
18 //------------------------------------------------------------------------------
19 // Basis apply - tensor
20 //------------------------------------------------------------------------------
21 int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
22   Ceed              ceed;
23   CeedInt           Q_1d, dim;
24   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
25   const int         max_block_size = 32;
26   const CeedScalar *d_u;
27   CeedScalar       *d_v;
28   CeedBasis_Cuda   *data;
29 
30   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
31   CeedCallBackend(CeedBasisGetData(basis, &data));
32 
33   // Get read/write access to u, v
34   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
35   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
36   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
37 
38   // Clear v for transpose operation
39   if (is_transpose) {
40     CeedSize length;
41 
42     CeedCallBackend(CeedVectorGetLength(v, &length));
43     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
44   }
45   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
46   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
47 
48   // Basis action
49   switch (eval_mode) {
50     case CEED_EVAL_INTERP: {
51       void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v};
52       const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
53 
54       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Interp, num_elem, block_size, interp_args));
55     } break;
56     case CEED_EVAL_GRAD: {
57       void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
58       const CeedInt block_size  = max_block_size;
59 
60       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args));
61     } break;
62     case CEED_EVAL_WEIGHT: {
63       CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
64       void     *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
65       const int block_size_x  = Q_1d;
66       const int block_size_y  = dim >= 2 ? Q_1d : 1;
67 
68       CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args));
69     } break;
70     case CEED_EVAL_NONE: /* handled separately below */
71       break;
72     // LCOV_EXCL_START
73     case CEED_EVAL_DIV:
74     case CEED_EVAL_CURL:
75       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
76       // LCOV_EXCL_STOP
77   }
78 
79   // Restore vectors, cover CEED_EVAL_NONE
80   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
81   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
82   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
83   return CEED_ERROR_SUCCESS;
84 }
85 
86 //------------------------------------------------------------------------------
87 // Basis apply - tensor AtPoints
88 //------------------------------------------------------------------------------
89 int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
90                                 CeedVector x_ref, CeedVector u, CeedVector v) {
91   Ceed              ceed;
92   CeedInt           Q_1d, dim, max_num_points = num_points[0];
93   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
94   const int         max_block_size = 32;
95   const CeedScalar *d_x, *d_u;
96   CeedScalar       *d_v;
97   CeedBasis_Cuda   *data;
98 
99   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
100   CeedCallBackend(CeedBasisGetData(basis, &data));
101   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
102   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
103 
104   // Check uniform number of points per elem
105   for (CeedInt i = 1; i < num_elem; i++) {
106     CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
107               "BasisApplyAtPoints only supported for the same number of points in each element");
108   }
109 
110   // Weight handled separately
111   if (eval_mode == CEED_EVAL_WEIGHT) {
112     CeedCall(CeedVectorSetValue(v, 1.0));
113     return CEED_ERROR_SUCCESS;
114   }
115 
116   // Build kernels if needed
117   if (data->num_points != max_num_points) {
118     CeedInt P_1d;
119 
120     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
121     data->num_points = max_num_points;
122 
123     // -- Create interp matrix to Chebyshev coefficients
124     if (!data->d_chebyshev_interp_1d) {
125       CeedSize    interp_bytes;
126       CeedScalar *chebyshev_interp_1d;
127 
128       interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
129       CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
130       CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
131       CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
132       CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
133       CeedCallBackend(CeedFree(&chebyshev_interp_1d));
134     }
135 
136     // -- Compile kernels
137     char       *basis_kernel_source;
138     const char *basis_kernel_path;
139     CeedInt     num_comp;
140 
141     if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
142     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
143     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path));
144     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
145     CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
146     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
147     CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
148                                      num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
149                                      "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
150                                      max_num_points, "POINTS_BUFF_LEN",
151                                      max_num_points * CeedIntPow(Q_1d > max_num_points ? Q_1d : max_num_points, dim - 1)));
152     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
153     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
154     CeedCallBackend(CeedFree(&basis_kernel_path));
155     CeedCallBackend(CeedFree(&basis_kernel_source));
156   }
157 
158   // Get read/write access to u, v
159   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
160   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
161   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
162   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
163 
164   // Clear v for transpose operation
165   if (is_transpose) {
166     CeedSize length;
167 
168     CeedCallBackend(CeedVectorGetLength(v, &length));
169     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
170   }
171 
172   // Basis action
173   switch (eval_mode) {
174     case CEED_EVAL_INTERP: {
175       void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
176       const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
177 
178       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
179     } break;
180     case CEED_EVAL_GRAD: {
181       void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
182       const CeedInt block_size  = max_block_size;
183 
184       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
185     } break;
186     case CEED_EVAL_WEIGHT:
187     case CEED_EVAL_NONE: /* handled separately below */
188       break;
189     // LCOV_EXCL_START
190     case CEED_EVAL_DIV:
191     case CEED_EVAL_CURL:
192       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
193       // LCOV_EXCL_STOP
194   }
195 
196   // Restore vectors, cover CEED_EVAL_NONE
197   CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
198   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
199   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
200   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
201   return CEED_ERROR_SUCCESS;
202 }
203 
204 //------------------------------------------------------------------------------
205 // Basis apply - non-tensor
206 //------------------------------------------------------------------------------
207 int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
208                                  CeedVector v) {
209   Ceed                     ceed;
210   CeedInt                  num_nodes, num_qpts;
211   const CeedInt            is_transpose    = t_mode == CEED_TRANSPOSE;
212   const int                elems_per_block = 1;
213   const int                grid            = CeedDivUpInt(num_elem, elems_per_block);
214   const CeedScalar        *d_u;
215   CeedScalar              *d_v;
216   CeedBasisNonTensor_Cuda *data;
217 
218   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
219   CeedCallBackend(CeedBasisGetData(basis, &data));
220   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
221   CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
222 
223   // Get read/write access to u, v
224   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
225   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
226   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
227 
228   // Clear v for transpose operation
229   if (is_transpose) {
230     CeedSize length;
231 
232     CeedCallBackend(CeedVectorGetLength(v, &length));
233     CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
234   }
235 
236   // Apply basis operation
237   switch (eval_mode) {
238     case CEED_EVAL_INTERP: {
239       void     *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v};
240       const int block_size_x  = is_transpose ? num_nodes : num_qpts;
241 
242       if (is_transpose) {
243         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args));
244       } else {
245         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args));
246       }
247     } break;
248     case CEED_EVAL_GRAD: {
249       void     *grad_args[]  = {(void *)&num_elem, &data->d_grad, &d_u, &d_v};
250       const int block_size_x = is_transpose ? num_nodes : num_qpts;
251 
252       if (is_transpose) {
253         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args));
254       } else {
255         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args));
256       }
257     } break;
258     case CEED_EVAL_DIV: {
259       void     *div_args[]   = {(void *)&num_elem, &data->d_div, &d_u, &d_v};
260       const int block_size_x = is_transpose ? num_nodes : num_qpts;
261 
262       if (is_transpose) {
263         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args));
264       } else {
265         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args));
266       }
267     } break;
268     case CEED_EVAL_CURL: {
269       void     *curl_args[]  = {(void *)&num_elem, &data->d_curl, &d_u, &d_v};
270       const int block_size_x = is_transpose ? num_nodes : num_qpts;
271 
272       if (is_transpose) {
273         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args));
274       } else {
275         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args));
276       }
277     } break;
278     case CEED_EVAL_WEIGHT: {
279       CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
280       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v};
281 
282       CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args));
283     } break;
284     case CEED_EVAL_NONE: /* handled separately below */
285       break;
286   }
287 
288   // Restore vectors, cover CEED_EVAL_NONE
289   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
290   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
291   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
292   return CEED_ERROR_SUCCESS;
293 }
294 
295 //------------------------------------------------------------------------------
296 // Destroy tensor basis
297 //------------------------------------------------------------------------------
298 static int CeedBasisDestroy_Cuda(CeedBasis basis) {
299   Ceed            ceed;
300   CeedBasis_Cuda *data;
301 
302   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
303   CeedCallBackend(CeedBasisGetData(basis, &data));
304   CeedCallCuda(ceed, cuModuleUnload(data->module));
305   if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
306   if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
307   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
308   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
309   CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
310   CeedCallBackend(CeedFree(&data));
311   return CEED_ERROR_SUCCESS;
312 }
313 
314 //------------------------------------------------------------------------------
315 // Destroy non-tensor basis
316 //------------------------------------------------------------------------------
317 static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
318   Ceed                     ceed;
319   CeedBasisNonTensor_Cuda *data;
320 
321   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
322   CeedCallBackend(CeedBasisGetData(basis, &data));
323   CeedCallCuda(ceed, cuModuleUnload(data->module));
324   if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight));
325   CeedCallCuda(ceed, cudaFree(data->d_interp));
326   CeedCallCuda(ceed, cudaFree(data->d_grad));
327   CeedCallCuda(ceed, cudaFree(data->d_div));
328   CeedCallCuda(ceed, cudaFree(data->d_curl));
329   CeedCallBackend(CeedFree(&data));
330   return CEED_ERROR_SUCCESS;
331 }
332 
333 //------------------------------------------------------------------------------
334 // Create tensor
335 //------------------------------------------------------------------------------
336 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
337                                  const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
338   Ceed            ceed;
339   char           *basis_kernel_source;
340   const char     *basis_kernel_path;
341   CeedInt         num_comp;
342   const CeedInt   q_bytes      = Q_1d * sizeof(CeedScalar);
343   const CeedInt   interp_bytes = q_bytes * P_1d;
344   CeedBasis_Cuda *data;
345 
346   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
347   CeedCallBackend(CeedCalloc(1, &data));
348 
349   // Copy data to GPU
350   if (q_weight_1d) {
351     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
352     CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
353   }
354   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes));
355   CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice));
356   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes));
357   CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice));
358 
359   // Compile basis kernels
360   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
361   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path));
362   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
363   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
364   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
365   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
366                                    num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
367                                    "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
368   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
369   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
370   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
371   CeedCallBackend(CeedFree(&basis_kernel_path));
372   CeedCallBackend(CeedFree(&basis_kernel_source));
373 
374   CeedCallBackend(CeedBasisSetData(basis, data));
375 
376   // Register backend functions
377   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda));
378   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda));
379   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda));
380   return CEED_ERROR_SUCCESS;
381 }
382 
383 //------------------------------------------------------------------------------
384 // Create non-tensor H^1
385 //------------------------------------------------------------------------------
386 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
387                            const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
388   Ceed                     ceed;
389   char                    *basis_kernel_source;
390   const char              *basis_kernel_path;
391   CeedInt                  num_comp, q_comp_interp, q_comp_grad;
392   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
393   CeedBasisNonTensor_Cuda *data;
394 
395   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
396   CeedCallBackend(CeedCalloc(1, &data));
397 
398   // Copy basis data to GPU
399   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
400   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
401   if (q_weight) {
402     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
403     CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
404   }
405   if (interp) {
406     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
407 
408     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
409     CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
410   }
411   if (grad) {
412     const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;
413 
414     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes));
415     CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice));
416   }
417 
418   // Compile basis kernels
419   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
420   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
421   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
422   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
423   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
424   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
425                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
426   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
427   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
428   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
429   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
430   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
431   CeedCallBackend(CeedFree(&basis_kernel_path));
432   CeedCallBackend(CeedFree(&basis_kernel_source));
433 
434   CeedCallBackend(CeedBasisSetData(basis, data));
435 
436   // Register backend functions
437   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
438   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
439   return CEED_ERROR_SUCCESS;
440 }
441 
442 //------------------------------------------------------------------------------
443 // Create non-tensor H(div)
444 //------------------------------------------------------------------------------
445 int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
446                              const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
447   Ceed                     ceed;
448   char                    *basis_kernel_source;
449   const char              *basis_kernel_path;
450   CeedInt                  num_comp, q_comp_interp, q_comp_div;
451   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
452   CeedBasisNonTensor_Cuda *data;
453 
454   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
455   CeedCallBackend(CeedCalloc(1, &data));
456 
457   // Copy basis data to GPU
458   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
459   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
460   if (q_weight) {
461     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
462     CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
463   }
464   if (interp) {
465     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
466 
467     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
468     CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
469   }
470   if (div) {
471     const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div;
472 
473     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes));
474     CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice));
475   }
476 
477   // Compile basis kernels
478   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
479   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
480   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
481   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
482   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
483   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
484                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
485   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
486   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
487   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
488   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
489   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
490   CeedCallBackend(CeedFree(&basis_kernel_path));
491   CeedCallBackend(CeedFree(&basis_kernel_source));
492 
493   CeedCallBackend(CeedBasisSetData(basis, data));
494 
495   // Register backend functions
496   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
497   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
498   return CEED_ERROR_SUCCESS;
499 }
500 
501 //------------------------------------------------------------------------------
502 // Create non-tensor H(curl)
503 //------------------------------------------------------------------------------
504 int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
505                               const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
506   Ceed                     ceed;
507   char                    *basis_kernel_source;
508   const char              *basis_kernel_path;
509   CeedInt                  num_comp, q_comp_interp, q_comp_curl;
510   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
511   CeedBasisNonTensor_Cuda *data;
512 
513   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
514   CeedCallBackend(CeedCalloc(1, &data));
515 
516   // Copy basis data to GPU
517   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
518   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
519   if (q_weight) {
520     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
521     CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
522   }
523   if (interp) {
524     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
525 
526     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes));
527     CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice));
528   }
529   if (curl) {
530     const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl;
531 
532     CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes));
533     CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice));
534   }
535 
536   // Compile basis kernels
537   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
538   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
539   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
540   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
541   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
542   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
543                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
544   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
545   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
546   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
547   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
548   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
549   CeedCallBackend(CeedFree(&basis_kernel_path));
550   CeedCallBackend(CeedFree(&basis_kernel_source));
551 
552   CeedCallBackend(CeedBasisSetData(basis, data));
553 
554   // Register backend functions
555   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
556   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
557   return CEED_ERROR_SUCCESS;
558 }
559 
560 //------------------------------------------------------------------------------
561