xref: /libCEED/backends/magma/ceed-magma-basis.c (revision 67d9480a68a5a9232b5fd6d362b2ef6ee659593b)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <string.h>
12 
13 #ifdef CEED_MAGMA_USE_HIP
14 #include "../hip/ceed-hip-common.h"
15 #include "../hip/ceed-hip-compile.h"
16 #else
17 #include "../cuda/ceed-cuda-common.h"
18 #include "../cuda/ceed-cuda-compile.h"
19 #endif
20 #include "ceed-magma-common.h"
21 #include "ceed-magma.h"
22 
23 #include "ceed-magma-gemm-nontensor.h"
24 #include "ceed-magma-gemm-selector.h"
25 
26 //------------------------------------------------------------------------------
27 // Basis apply - tensor
28 //------------------------------------------------------------------------------
// Apply a tensor-product basis operation on the GPU via MAGMA kernels.
//
// basis    - tensor-product basis to apply
// num_elem - number of elements in this batch
// t_mode   - CEED_NOTRANSPOSE (E-vector -> Q-vector) or CEED_TRANSPOSE
// e_mode   - CEED_EVAL_INTERP, CEED_EVAL_GRAD, or CEED_EVAL_WEIGHT
//            (CEED_EVAL_DIV/CURL/NONE are rejected with a backend error)
// u        - input vector; required unless e_mode == CEED_EVAL_WEIGHT
// v        - output vector (acquired with write access)
//
// Kernels process num_t_col elements per thread block; the per-dimension
// switches below size the shared memory needed for the basis matrices and the
// per-element intermediate values.
static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) {
  Ceed              ceed;
  Ceed_Magma       *data;
  CeedInt           dim, num_comp, num_nodes, P_1d, Q_1d, P, Q;
  const CeedScalar *d_u;
  CeedScalar       *d_v;
  CeedBasis_Magma  *impl;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedGetData(ceed, &data));
  CeedCallBackend(CeedBasisGetData(basis, &impl));
  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
  CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
  CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
  // P/Q are the 1D input/output sizes of this apply; they swap under transpose
  P = P_1d;
  Q = Q_1d;
  if (t_mode == CEED_TRANSPOSE) {
    P = Q_1d;
    Q = P_1d;
  }

  // Read vectors (u may legitimately be CEED_VECTOR_NONE only for WEIGHT)
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Apply basis operation
  switch (e_mode) {
    case CEED_EVAL_INTERP: {
      // Define element sizes for dofs/quad
      CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim);
      CeedInt elem_dofs_size = CeedIntPow(P_1d, dim);

      // E-vector ordering -------------- Q-vector ordering
      //  component                        component
      //    elem                             elem
      //       node                            node

      // ---  Define strides for NOTRANSPOSE mode: ---
      // Input (d_u) is E-vector, output (d_v) is Q-vector

      // Element strides
      CeedInt u_elem_stride = elem_dofs_size;
      CeedInt v_elem_stride = elem_qpts_size;
      // Component strides
      CeedInt u_comp_stride = num_elem * elem_dofs_size;
      CeedInt v_comp_stride = num_elem * elem_qpts_size;
      if (t_mode == CEED_TRANSPOSE) {
        // Input (d_u) is Q-vector, output (d_v) is E-vector
        // Element strides
        v_elem_stride = elem_dofs_size;
        u_elem_stride = elem_qpts_size;
        // Component strides
        v_comp_stride = num_elem * elem_dofs_size;
        u_comp_stride = num_elem * elem_qpts_size;
      }
      CeedInt num_threads = 1;
      CeedInt num_t_col   = 1;
      CeedInt shared_mem  = 0;
      CeedInt max_P_Q     = CeedIntMax(P, Q);

      // Thread/shared-memory sizing depends on dimension
      switch (dim) {
        case 1:
          num_threads = max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D);
          shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q));
          shared_mem += sizeof(CeedScalar) * (P * Q);
          break;
        case 2:
          num_threads = max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
          shared_mem += P * Q * sizeof(CeedScalar);  // for sT
          // for reforming rU we need P x P, and for the intermediate output we need P x Q
          shared_mem += num_t_col * (P * max_P_Q * sizeof(CeedScalar));
          break;
        case 3:
          num_threads = max_P_Q * max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
          shared_mem += sizeof(CeedScalar) * (P * Q);  // for sT
          // rU needs P^2 x P, the intermediate output needs max(P^2 x Q, P x Q^2)
          shared_mem += sizeof(CeedScalar) * num_t_col * (CeedIntMax(P * P * max_P_Q, P * Q * Q));
          break;
      }
      CeedInt grid   = CeedDivUpInt(num_elem, num_t_col);
      void   *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, &num_elem};

      if (t_mode == CEED_TRANSPOSE) {
        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose, grid, num_threads, num_t_col, 1, shared_mem, args));
      } else {
        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, grid, num_threads, num_t_col, 1, shared_mem, args));
      }
    } break;
    case CEED_EVAL_GRAD: {
      // Define element sizes for dofs/quad
      CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim);
      CeedInt elem_dofs_size = CeedIntPow(P_1d, dim);

      // In CEED_NOTRANSPOSE mode:
      // d_u is (P^dim x nc), column-major layout (nc = num_comp)
      // d_v is (Q^dim x nc x dim), column-major layout (nc = num_comp)
      // In CEED_TRANSPOSE mode, the sizes of d_u and d_v are switched.

      // E-vector ordering -------------- Q-vector ordering
      //                                  dim
      //  component                        component
      //    elem                              elem
      //       node                            node

      // ---  Define strides for NOTRANSPOSE mode: ---
      // Input (d_u) is E-vector, output (d_v) is Q-vector

      // Element strides
      CeedInt u_elem_stride = elem_dofs_size;
      CeedInt v_elem_stride = elem_qpts_size;
      // Component strides
      CeedInt u_comp_stride = num_elem * elem_dofs_size;
      CeedInt v_comp_stride = num_elem * elem_qpts_size;
      // Dimension strides (E-vector side has no dim axis, hence stride 0)
      CeedInt u_dim_stride = 0;
      CeedInt v_dim_stride = num_elem * elem_qpts_size * num_comp;
      if (t_mode == CEED_TRANSPOSE) {
        // Input (d_u) is Q-vector, output (d_v) is E-vector
        // Element strides
        v_elem_stride = elem_dofs_size;
        u_elem_stride = elem_qpts_size;
        // Component strides
        v_comp_stride = num_elem * elem_dofs_size;
        u_comp_stride = num_elem * elem_qpts_size;
        // Dimension strides
        v_dim_stride = 0;
        u_dim_stride = num_elem * elem_qpts_size * num_comp;
      }
      CeedInt num_threads = 1;
      CeedInt num_t_col   = 1;
      CeedInt shared_mem  = 0;
      CeedInt max_P_Q     = CeedIntMax(P, Q);

      // Thread/shared-memory sizing depends on dimension
      switch (dim) {
        case 1:
          num_threads = max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D);
          shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q));
          shared_mem += sizeof(CeedScalar) * (P * Q);
          break;
        case 2:
          num_threads = max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
          shared_mem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
          // for reforming rU we need P x P, and for the intermediate output we need P x Q
          shared_mem += sizeof(CeedScalar) * num_t_col * (P * max_P_Q);
          break;
        case 3:
          num_threads = max_P_Q * max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
          shared_mem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
          // rU needs P^2 x P, the intermediate outputs need (P^2 x Q + P x Q^2)
          shared_mem += sizeof(CeedScalar) * num_t_col * CeedIntMax(P * P * P, (P * P * Q) + (P * Q * Q));
          break;
      }
      CeedInt grid   = CeedDivUpInt(num_elem, num_t_col);
      void   *args[] = {&impl->d_interp_1d, &impl->d_grad_1d, &d_u,          &u_elem_stride, &u_comp_stride, &u_dim_stride, &d_v,
                        &v_elem_stride,     &v_comp_stride,   &v_dim_stride, &num_elem};

      if (t_mode == CEED_TRANSPOSE) {
        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose, grid, num_threads, num_t_col, 1, shared_mem, args));
      } else {
        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, grid, num_threads, num_t_col, 1, shared_mem, args));
      }
    } break;
    case CEED_EVAL_WEIGHT: {
      CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
      CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[e_mode]);
      // NOTE(review): despite the name, this holds Q^dim — the per-element
      // quadrature point count passed to the weight kernel
      CeedInt elem_dofs_size = CeedIntPow(Q, dim);
      CeedInt num_threads    = 1;
      CeedInt num_t_col      = 1;
      CeedInt shared_mem     = 0;

      switch (dim) {
        case 1:
          num_threads = Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D);
          shared_mem += sizeof(CeedScalar) * Q;              // for d_q_weight_1d
          shared_mem += sizeof(CeedScalar) * num_t_col * Q;  // for output
          break;
        case 2:
          num_threads = Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
          shared_mem += sizeof(CeedScalar) * Q;  // for d_q_weight_1d
          break;
        case 3:
          num_threads = Q * Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
          shared_mem += sizeof(CeedScalar) * Q;  // for d_q_weight_1d
          break;
      }
      CeedInt grid   = CeedDivUpInt(num_elem, num_t_col);
      void   *args[] = {&impl->d_q_weight_1d, &d_v, &elem_dofs_size, &num_elem};

      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, num_threads, num_t_col, 1, shared_mem, args));
    } break;
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[e_mode]);
    case CEED_EVAL_NONE:
      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
      // LCOV_EXCL_STOP
  }

  // Must sync to ensure completeness
  ceed_magma_queue_sync(data->queue);

  // Restore vectors; for e_mode != CEED_EVAL_WEIGHT the earlier CeedCheck
  // guarantees u was acquired, so d_u is valid here
  if (e_mode != CEED_EVAL_WEIGHT) {
    CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  }
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  return CEED_ERROR_SUCCESS;
}
250 
251 //------------------------------------------------------------------------------
252 // Basis apply - tensor AtPoints
253 //------------------------------------------------------------------------------
254 int CeedBasisApplyAtPoints_Magma(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
255                                  CeedVector x_ref, CeedVector u, CeedVector v) {
256   return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "Backend does not implement CeedBasisApplyAtPoints");
257 }
258 
259 //------------------------------------------------------------------------------
260 // Basis apply - non-tensor
261 //------------------------------------------------------------------------------
262 static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
263                                          CeedVector v) {
264   Ceed                      ceed;
265   Ceed_Magma               *data;
266   CeedInt                   num_comp, num_nodes, num_qpts, P, Q, N;
267   const CeedScalar         *d_u;
268   CeedScalar               *d_v;
269   CeedBasisNonTensor_Magma *impl;
270 
271   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
272   CeedCallBackend(CeedGetData(ceed, &data));
273   CeedCallBackend(CeedBasisGetData(basis, &impl));
274   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
275   CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
276   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
277   P = num_nodes;
278   Q = num_qpts;
279   N = num_elem * num_comp;
280 
281   // Read vectors
282   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
283   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
284   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
285 
286   // Compile kernels for N as needed
287   CeedInt iN = 0;
288   if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q && (e_mode != CEED_EVAL_WEIGHT || !impl->Weight)) {
289     CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES};
290     CeedInt diff                                      = abs(n_array[iN] - N), idiff;
291 
292     for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
293       idiff = abs(n_array[in] - N);
294       if (idiff < diff) {
295         iN   = in;
296         diff = idiff;
297       }
298     }
299 
300     if (!impl->NB_interp[iN]) {
301       CeedFESpace fe_space;
302       CeedInt     q_comp_interp, q_comp_deriv;
303       Ceed        ceed_delegate;
304       char       *basis_kernel_source;
305       const char *basis_kernel_path, *weight_kernel_path;
306       char      **file_paths     = NULL;
307       CeedInt     num_file_paths = 0;
308       magma_int_t arch           = magma_getdevice_arch();
309 
310       // Tuning parameters for NB
311       CeedCallBackend(CeedBasisGetFESpace(basis, &fe_space));
312       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
313       switch (fe_space) {
314         case CEED_FE_SPACE_H1:
315           CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_deriv));
316           break;
317         case CEED_FE_SPACE_HDIV:
318           CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_deriv));
319           break;
320         case CEED_FE_SPACE_HCURL:
321           CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_deriv));
322           break;
323       }
324       impl->NB_interp[iN]   = nontensor_rtc_get_nb(arch, 'n', q_comp_interp, P, Q, n_array[iN]);
325       impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_interp, P, Q, n_array[iN]);
326       impl->NB_deriv[iN]    = nontensor_rtc_get_nb(arch, 'n', q_comp_deriv, P, Q, n_array[iN]);
327       impl->NB_deriv_t[iN]  = nontensor_rtc_get_nb(arch, 't', q_comp_deriv, P, Q, n_array[iN]);
328 
329       // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
330       CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
331 
332       // Compile kernels
333       CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h", &basis_kernel_path));
334       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
335       CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, basis_kernel_path, &num_file_paths, &file_paths, &basis_kernel_source));
336       if (!impl->Weight) {
337         CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
338         CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_kernel_path, &num_file_paths, &file_paths, &basis_kernel_source));
339       }
340       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
341       CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[iN], 8, "BASIS_Q_COMP_INTERP", q_comp_interp,
342                                        "BASIS_Q_COMP_DERIV", q_comp_deriv, "BASIS_P", P, "BASIS_Q", Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN],
343                                        "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_DERIV_N", impl->NB_deriv[iN], "BASIS_NB_DERIV_T",
344                                        impl->NB_deriv_t[iN]));
345       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_n", &impl->Interp[iN]));
346       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN]));
347       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN]));
348       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN]));
349       if (!impl->Weight) {
350         CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_weight_nontensor", &impl->Weight));
351         CeedCallBackend(CeedFree(&weight_kernel_path));
352       }
353       CeedCallBackend(CeedFree(&basis_kernel_path));
354       CeedCallBackend(CeedFree(&basis_kernel_source));
355       for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
356       CeedCall(CeedFree(&file_paths));
357     }
358   }
359 
360   // Apply basis operation
361   if (e_mode != CEED_EVAL_WEIGHT) {
362     const CeedScalar *d_b = NULL;
363     CeedInt           q_comp, NB, M, K;
364     CeedMagmaFunction Kernel;
365 
366     switch (e_mode) {
367       case CEED_EVAL_INTERP:
368         d_b = impl->d_interp;
369         break;
370       case CEED_EVAL_GRAD:
371         d_b = impl->d_grad;
372         break;
373       case CEED_EVAL_DIV:
374         d_b = impl->d_div;
375         break;
376       case CEED_EVAL_CURL:
377         d_b = impl->d_curl;
378         break;
379       // LCOV_EXCL_START
380       case CEED_EVAL_WEIGHT:
381       case CEED_EVAL_NONE:
382         return CeedError(ceed, CEED_ERROR_BACKEND, "%s does not make sense in this context", CeedEvalModes[e_mode]);
383         // LCOV_EXCL_STOP
384     }
385     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, e_mode, &q_comp));
386     M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? Q : P;
387 
388     if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
389       if (e_mode == CEED_EVAL_INTERP) {
390         if (t_mode == CEED_TRANSPOSE) {
391           Kernel = impl->InterpTranspose[iN];
392           NB     = impl->NB_interp_t[iN];
393         } else {
394           Kernel = impl->Interp[iN];
395           NB     = impl->NB_interp[iN];
396         }
397       } else {
398         if (t_mode == CEED_TRANSPOSE) {
399           Kernel = impl->DerivTranspose[iN];
400           NB     = impl->NB_deriv_t[iN];
401         } else {
402           Kernel = impl->Deriv[iN];
403           NB     = impl->NB_deriv[iN];
404         }
405       }
406       CeedInt num_t_col    = MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D);
407       CeedInt grid         = CeedDivUpInt(N, num_t_col * NB);
408       CeedInt shared_mem_A = P * Q * sizeof(CeedScalar);
409       CeedInt shared_mem_B = num_t_col * K * NB * sizeof(CeedScalar);
410       CeedInt shared_mem   = (t_mode != CEED_TRANSPOSE && q_comp > 1) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
411       void   *args[]       = {&N, &d_b, &d_u, &d_v};
412 
413       CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, grid, M, num_t_col, 1, shared_mem, args));
414     } else {
415       for (CeedInt d = 0; d < q_comp; d++) {
416         if (t_mode == CEED_TRANSPOSE) {
417           const CeedScalar beta = (d > 0) ? 1.0 : 0.0;
418           magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, d_b + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P, data->queue);
419         } else {
420           magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, d_b + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue);
421         }
422       }
423     }
424   } else {
425     CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
426     CeedCheck(impl->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight not set", CeedEvalModes[e_mode]);
427     CeedInt num_t_col  = MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D);
428     CeedInt grid       = CeedDivUpInt(num_elem, num_t_col);
429     CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar);
430     void   *args[]     = {&num_elem, &impl->d_q_weight, &d_v};
431 
432     CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, Q, num_t_col, 1, shared_mem, args));
433   }
434 
435   // Must sync to ensure completeness
436   ceed_magma_queue_sync(data->queue);
437 
438   // Restore vectors
439   if (e_mode != CEED_EVAL_WEIGHT) {
440     CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
441   }
442   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
443   return CEED_ERROR_SUCCESS;
444 }
445 
446 //------------------------------------------------------------------------------
447 // Destroy tensor basis
448 //------------------------------------------------------------------------------
449 static int CeedBasisDestroy_Magma(CeedBasis basis) {
450   Ceed             ceed;
451   CeedBasis_Magma *impl;
452 
453   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
454   CeedCallBackend(CeedBasisGetData(basis, &impl));
455 #ifdef CEED_MAGMA_USE_HIP
456   CeedCallHip(ceed, hipModuleUnload(impl->module));
457 #else
458   CeedCallCuda(ceed, cuModuleUnload(impl->module));
459 #endif
460   CeedCallBackend(magma_free(impl->d_interp_1d));
461   CeedCallBackend(magma_free(impl->d_grad_1d));
462   if (impl->d_q_weight_1d) CeedCallBackend(magma_free(impl->d_q_weight_1d));
463   CeedCallBackend(CeedFree(&impl));
464   return CEED_ERROR_SUCCESS;
465 }
466 
467 //------------------------------------------------------------------------------
468 // Destroy non-tensor basis
469 //------------------------------------------------------------------------------
470 static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
471   Ceed                      ceed;
472   CeedBasisNonTensor_Magma *impl;
473 
474   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
475   CeedCallBackend(CeedBasisGetData(basis, &impl));
476   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
477     if (impl->module[in]) {
478 #ifdef CEED_MAGMA_USE_HIP
479       CeedCallHip(ceed, hipModuleUnload(impl->module[in]));
480 #else
481       CeedCallCuda(ceed, cuModuleUnload(impl->module[in]));
482 #endif
483     }
484   }
485   CeedCallBackend(magma_free(impl->d_interp));
486   CeedCallBackend(magma_free(impl->d_grad));
487   CeedCallBackend(magma_free(impl->d_div));
488   CeedCallBackend(magma_free(impl->d_curl));
489   if (impl->d_q_weight) CeedCallBackend(magma_free(impl->d_q_weight));
490   CeedCallBackend(CeedFree(&impl));
491   return CEED_ERROR_SUCCESS;
492 }
493 
494 //------------------------------------------------------------------------------
495 // Create tensor
496 //------------------------------------------------------------------------------
497 int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
498                                   const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
499   Ceed             ceed, ceed_delegate;
500   Ceed_Magma      *data;
501   char            *basis_kernel_source;
502   const char      *interp_kernel_path, *grad_kernel_path, *weight_kernel_path;
503   char           **file_paths     = NULL;
504   CeedInt          num_file_paths = 0;
505   CeedInt          num_comp;
506   CeedBasis_Magma *impl;
507 
508   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
509   CeedCallBackend(CeedGetData(ceed, &data));
510   CeedCallBackend(CeedCalloc(1, &impl));
511 
512   // Copy basis data to GPU
513   if (q_weight_1d) {
514     CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0])));
515     magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue);
516   }
517   CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0])));
518   magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue);
519   CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0])));
520   magma_setvector(Q_1d * P_1d, sizeof(grad_1d[0]), grad_1d, 1, impl->d_grad_1d, 1, data->queue);
521 
522   // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
523   CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
524 
525   // Compile kernels
526   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
527   {
528     char   *interp_kernel_name_base = "ceed/jit-source/magma/magma-basis-interp";
529     CeedInt interp_kernel_name_len  = strlen(interp_kernel_name_base) + 6;
530     char    interp_kernel_name[interp_kernel_name_len];
531 
532     snprintf(interp_kernel_name, interp_kernel_name_len, "%s-%" CeedInt_FMT "d.h", interp_kernel_name_base, dim);
533     CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_kernel_name, &interp_kernel_path));
534   }
535   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
536   CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, interp_kernel_path, &num_file_paths, &file_paths, &basis_kernel_source));
537   {
538     char   *grad_kernel_name_base = "ceed/jit-source/magma/magma-basis-grad";
539     CeedInt grad_kernel_name_len  = strlen(grad_kernel_name_base) + 6;
540     char    grad_kernel_name[grad_kernel_name_len];
541 
542     snprintf(grad_kernel_name, grad_kernel_name_len, "%s-%" CeedInt_FMT "d.h", grad_kernel_name_base, dim);
543     CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_kernel_name, &grad_kernel_path));
544   }
545   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_kernel_path, &num_file_paths, &file_paths, &basis_kernel_source));
546   {
547     char   *weight_kernel_name_base = "ceed/jit-source/magma/magma-basis-weight";
548     CeedInt weight_kernel_name_len  = strlen(weight_kernel_name_base) + 6;
549     char    weight_kernel_name[weight_kernel_name_len];
550 
551     snprintf(weight_kernel_name, weight_kernel_name_len, "%s-%" CeedInt_FMT "d.h", weight_kernel_name_base, dim);
552     CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_kernel_name, &weight_kernel_path));
553   }
554   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_kernel_path, &num_file_paths, &file_paths, &basis_kernel_source));
555   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
556   CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module, 5, "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_P",
557                                    P_1d, "BASIS_Q", Q_1d, "BASIS_MAX_P_Q", CeedIntMax(P_1d, Q_1d)));
558   switch (dim) {
559     case 1:
560       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->Interp));
561       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->InterpTranspose));
562       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->Grad));
563       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->GradTranspose));
564       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->Weight));
565       break;
566     case 2:
567       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->Interp));
568       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->InterpTranspose));
569       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->Grad));
570       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->GradTranspose));
571       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->Weight));
572       break;
573     case 3:
574       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->Interp));
575       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->InterpTranspose));
576       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->Grad));
577       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->GradTranspose));
578       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->Weight));
579       break;
580   }
581   CeedCallBackend(CeedFree(&interp_kernel_path));
582   CeedCallBackend(CeedFree(&grad_kernel_path));
583   CeedCallBackend(CeedFree(&weight_kernel_path));
584   CeedCallBackend(CeedFree(&basis_kernel_source));
585   for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
586   CeedCall(CeedFree(&file_paths));
587 
588   CeedCallBackend(CeedBasisSetData(basis, impl));
589 
590   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
591   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Magma));
592   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
593   return CEED_ERROR_SUCCESS;
594 }
595 
596 //------------------------------------------------------------------------------
597 // Create non-tensor H^1
598 //------------------------------------------------------------------------------
599 int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
600                             const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
601   Ceed                      ceed;
602   Ceed_Magma               *data;
603   CeedBasisNonTensor_Magma *impl;
604 
605   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
606   CeedCallBackend(CeedGetData(ceed, &data));
607   CeedCallBackend(CeedCalloc(1, &impl));
608 
609   // Copy basis data to GPU
610   if (q_weight) {
611     CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
612     magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
613   }
614   if (interp) {
615     CeedInt q_comp_interp;
616 
617     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
618     CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0])));
619     magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
620   }
621   if (grad) {
622     CeedInt q_comp_grad;
623 
624     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
625     CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_nodes * q_comp_grad * sizeof(grad[0])));
626     magma_setvector(num_qpts * num_nodes * q_comp_grad, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue);
627   }
628 
629   // Compile the weight kernel if it won't be compiled later on
630   if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
631     Ceed        ceed_delegate;
632     char       *basis_kernel_source;
633     const char *weight_kernel_path;
634 
635     // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
636     CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
637 
638     // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply)
639     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
640     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
641     CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source));
642     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
643     CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts));
644     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));
645     CeedCallBackend(CeedFree(&weight_kernel_path));
646     CeedCallBackend(CeedFree(&basis_kernel_source));
647   }
648 
649   CeedCallBackend(CeedBasisSetData(basis, impl));
650 
651   // Register backend functions
652   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
653   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
654   return CEED_ERROR_SUCCESS;
655 }
656 
657 //------------------------------------------------------------------------------
658 // Create non-tensor H(div)
659 //------------------------------------------------------------------------------
660 int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
661                               const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
662   Ceed                      ceed;
663   Ceed_Magma               *data;
664   CeedBasisNonTensor_Magma *impl;
665 
666   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
667   CeedCallBackend(CeedGetData(ceed, &data));
668   CeedCallBackend(CeedCalloc(1, &impl));
669 
670   // Copy basis data to GPU
671   if (q_weight) {
672     CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
673     magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
674   }
675   if (interp) {
676     CeedInt q_comp_interp;
677 
678     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
679     CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0])));
680     magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
681   }
682   if (div) {
683     CeedInt q_comp_div;
684 
685     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
686     CeedCallBackend(magma_malloc((void **)&impl->d_div, num_qpts * num_nodes * q_comp_div * sizeof(div[0])));
687     magma_setvector(num_qpts * num_nodes * q_comp_div, sizeof(div[0]), div, 1, impl->d_div, 1, data->queue);
688   }
689 
690   // Compile the weight kernel if it won't be compiled later on
691   if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
692     Ceed        ceed_delegate;
693     char       *basis_kernel_source;
694     const char *weight_kernel_path;
695 
696     // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
697     CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
698 
699     // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply)
700     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
701     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
702     CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source));
703     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
704     CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts));
705     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));
706     CeedCallBackend(CeedFree(&weight_kernel_path));
707     CeedCallBackend(CeedFree(&basis_kernel_source));
708   }
709 
710   CeedCallBackend(CeedBasisSetData(basis, impl));
711 
712   // Register backend functions
713   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
714   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
715   return CEED_ERROR_SUCCESS;
716 }
717 
718 //------------------------------------------------------------------------------
719 // Create non-tensor H(curl)
720 //------------------------------------------------------------------------------
721 int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
722                                const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
723   Ceed                      ceed;
724   Ceed_Magma               *data;
725   CeedBasisNonTensor_Magma *impl;
726 
727   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
728   CeedCallBackend(CeedGetData(ceed, &data));
729   CeedCallBackend(CeedCalloc(1, &impl));
730 
731   // Copy basis data to GPU
732   if (q_weight) {
733     CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
734     magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
735   }
736   if (interp) {
737     CeedInt q_comp_interp;
738 
739     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
740     CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0])));
741     magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
742   }
743   if (curl) {
744     CeedInt q_comp_curl;
745 
746     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
747     CeedCallBackend(magma_malloc((void **)&impl->d_curl, num_qpts * num_nodes * q_comp_curl * sizeof(curl[0])));
748     magma_setvector(num_qpts * num_nodes * q_comp_curl, sizeof(curl[0]), curl, 1, impl->d_curl, 1, data->queue);
749   }
750 
751   // Compile the weight kernel if it won't be compiled later on
752   if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
753     Ceed        ceed_delegate;
754     char       *basis_kernel_source;
755     const char *weight_kernel_path;
756 
757     // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
758     CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
759 
760     // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply)
761     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
762     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
763     CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source));
764     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
765     CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts));
766     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));
767     CeedCallBackend(CeedFree(&weight_kernel_path));
768     CeedCallBackend(CeedFree(&basis_kernel_source));
769   }
770 
771   CeedCallBackend(CeedBasisSetData(basis, impl));
772 
773   // Register backend functions
774   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
775   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
776   return CEED_ERROR_SUCCESS;
777 }
778 
779 //------------------------------------------------------------------------------
780