xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma-basis.c (revision d4cc18453651bd0f94c1a2e078b2646a92dafdcc)
1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
37f5b9731SStan Tomov //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
57f5b9731SStan Tomov //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
77f5b9731SStan Tomov 
849aac155SJeremy L Thompson #include <ceed.h>
9ec3da8bcSJed Brown #include <ceed/backend.h>
10f6af633fSnbeams #include <ceed/jit-tools.h>
11f6af633fSnbeams #include <string.h>
122b730f8bSJeremy L Thompson 
13e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
14f6af633fSnbeams #include "../hip/ceed-hip-common.h"
15f6af633fSnbeams #include "../hip/ceed-hip-compile.h"
16f6af633fSnbeams #else
17f6af633fSnbeams #include "../cuda/ceed-cuda-common.h"
18f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h"
19f6af633fSnbeams #endif
2000fb7a04SSebastian Grimberg #include "ceed-magma-common.h"
2100fb7a04SSebastian Grimberg #include "ceed-magma.h"
227f5b9731SStan Tomov 
23940a72f1SSebastian Grimberg #include "ceed-magma-gemm-nontensor.h"
24940a72f1SSebastian Grimberg #include "ceed-magma-gemm-selector.h"
25940a72f1SSebastian Grimberg 
26940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
27940a72f1SSebastian Grimberg // Basis apply - tensor
28940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
// Shared implementation behind CeedBasisApply_Magma and CeedBasisApplyAdd_Magma for tensor-product bases.
//
//   basis      Basis to apply; its backend data (CeedBasis_Magma) supplies the device arrays and kernels
//   apply_add  When true, v is acquired read-write and the transpose INTERP/GRAD launches use the *TransposeAdd kernels;
//              when false, v is acquired write-only. The non-transpose and WEIGHT launches use the same kernel either way.
//   num_elem   Number of elements; used for the component/dimension strides and the launch grid size
//   t_mode     CEED_NOTRANSPOSE or CEED_TRANSPOSE (CEED_TRANSPOSE is rejected for CEED_EVAL_WEIGHT)
//   e_mode     CEED_EVAL_INTERP, CEED_EVAL_GRAD or CEED_EVAL_WEIGHT; DIV, CURL and NONE return a backend error
//   u          Input vector; may be CEED_VECTOR_NONE only for CEED_EVAL_WEIGHT
//   v          Output vector
//
// Returns CEED_ERROR_SUCCESS, or an error code from a failed check or backend call.
static int CeedBasisApplyCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
                                    CeedVector v) {
  Ceed              ceed;
  Ceed_Magma       *data;
  CeedInt           dim, num_comp, P_1d, Q_1d, P, Q;
  const CeedScalar *d_u = NULL;
  CeedScalar       *d_v = NULL;
  CeedBasis_Magma  *impl;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedGetData(ceed, &data));
  CeedCallBackend(CeedBasisGetData(basis, &impl));
  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
  CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
  // P is the 1D input size and Q the 1D output size, so they swap roles in transpose mode
  P = P_1d;
  Q = Q_1d;
  if (t_mode == CEED_TRANSPOSE) {
    P = Q_1d;
    Q = P_1d;
  }

  // Read vectors
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Apply basis operation
  switch (e_mode) {
    case CEED_EVAL_INTERP: {
      // Define element sizes for dofs/quad
      CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim);
      CeedInt elem_dofs_size = CeedIntPow(P_1d, dim);

      // E-vector ordering -------------- Q-vector ordering
      //  component                        component
      //    elem                             elem
      //       node                            node

      // ---  Define strides for NOTRANSPOSE mode: ---
      // Input (d_u) is E-vector, output (d_v) is Q-vector

      // Element strides
      CeedInt u_elem_stride = elem_dofs_size;
      CeedInt v_elem_stride = elem_qpts_size;
      // Component strides
      CeedInt u_comp_stride = num_elem * elem_dofs_size;
      CeedInt v_comp_stride = num_elem * elem_qpts_size;
      if (t_mode == CEED_TRANSPOSE) {
        // Input (d_u) is Q-vector, output (d_v) is E-vector
        // Element strides
        v_elem_stride = elem_dofs_size;
        u_elem_stride = elem_qpts_size;
        // Component strides
        v_comp_stride = num_elem * elem_dofs_size;
        u_comp_stride = num_elem * elem_qpts_size;
      }
      CeedInt num_threads = 1;
      CeedInt num_t_col   = 1;
      CeedInt shared_mem  = 0;
      CeedInt max_P_Q     = CeedIntMax(P, Q);

      switch (dim) {
        case 1:
          num_threads = max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D);
          shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q));
          shared_mem += sizeof(CeedScalar) * (P * Q);
          break;
        case 2:
          num_threads = max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
          shared_mem += P * Q * sizeof(CeedScalar);  // for sT
          // for reforming rU we need P x P, and for the intermediate output we need P x Q
          shared_mem += num_t_col * (P * max_P_Q * sizeof(CeedScalar));
          break;
        case 3:
          num_threads = max_P_Q * max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
          shared_mem += sizeof(CeedScalar) * (P * Q);  // for sT
          // rU needs P^2 x P, the intermediate output needs max(P^2 x Q, P x Q^2)
          shared_mem += sizeof(CeedScalar) * num_t_col * (CeedIntMax(P * P * max_P_Q, P * Q * Q));
          break;
      }
      CeedInt grid   = CeedDivUpInt(num_elem, num_t_col);
      void   *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, &num_elem};

      if (t_mode == CEED_TRANSPOSE) {
        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->InterpTransposeAdd : impl->InterpTranspose, NULL, grid, num_threads,
                                                    num_t_col, 1, shared_mem, args));
      } else {
        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, NULL, grid, num_threads, num_t_col, 1, shared_mem, args));
      }
    } break;
    case CEED_EVAL_GRAD: {
      // Define element sizes for dofs/quad
      CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim);
      CeedInt elem_dofs_size = CeedIntPow(P_1d, dim);

      // In CEED_NOTRANSPOSE mode:
      // d_u is (P^dim x nc), column-major layout (nc = num_comp)
      // d_v is (Q^dim x nc x dim), column-major layout (nc = num_comp)
      // In CEED_TRANSPOSE mode, the sizes of d_u and d_v are switched.

      // E-vector ordering -------------- Q-vector ordering
      //                                  dim
      //  component                        component
      //    elem                              elem
      //       node                            node

      // ---  Define strides for NOTRANSPOSE mode: ---
      // Input (d_u) is E-vector, output (d_v) is Q-vector

      // Element strides
      CeedInt u_elem_stride = elem_dofs_size;
      CeedInt v_elem_stride = elem_qpts_size;
      // Component strides
      CeedInt u_comp_stride = num_elem * elem_dofs_size;
      CeedInt v_comp_stride = num_elem * elem_qpts_size;
      // Dimension strides
      CeedInt u_dim_stride = 0;
      CeedInt v_dim_stride = num_elem * elem_qpts_size * num_comp;
      if (t_mode == CEED_TRANSPOSE) {
        // Input (d_u) is Q-vector, output (d_v) is E-vector
        // Element strides
        v_elem_stride = elem_dofs_size;
        u_elem_stride = elem_qpts_size;
        // Component strides
        v_comp_stride = num_elem * elem_dofs_size;
        u_comp_stride = num_elem * elem_qpts_size;
        // Dimension strides
        v_dim_stride = 0;
        u_dim_stride = num_elem * elem_qpts_size * num_comp;
      }
      CeedInt num_threads = 1;
      CeedInt num_t_col   = 1;
      CeedInt shared_mem  = 0;
      CeedInt max_P_Q     = CeedIntMax(P, Q);

      switch (dim) {
        case 1:
          num_threads = max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D);
          shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q));
          shared_mem += sizeof(CeedScalar) * (P * Q);
          break;
        case 2:
          num_threads = max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
          shared_mem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
          // for reforming rU we need P x P, and for the intermediate output we need P x Q
          shared_mem += sizeof(CeedScalar) * num_t_col * (P * max_P_Q);
          break;
        case 3:
          num_threads = max_P_Q * max_P_Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
          shared_mem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
          // rU needs P^2 x P, the intermediate outputs need (P^2 x Q + P x Q^2)
          shared_mem += sizeof(CeedScalar) * num_t_col * CeedIntMax(P * P * P, (P * P * Q) + (P * Q * Q));
          break;
      }
      CeedInt grid   = CeedDivUpInt(num_elem, num_t_col);
      void   *args[] = {&impl->d_interp_1d, &impl->d_grad_1d, &d_u,          &u_elem_stride, &u_comp_stride, &u_dim_stride, &d_v,
                        &v_elem_stride,     &v_comp_stride,   &v_dim_stride, &num_elem};

      if (t_mode == CEED_TRANSPOSE) {
        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->GradTransposeAdd : impl->GradTranspose, NULL, grid, num_threads,
                                                    num_t_col, 1, shared_mem, args));
      } else {
        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, NULL, grid, num_threads, num_t_col, 1, shared_mem, args));
      }
    } break;
    case CEED_EVAL_WEIGHT: {
      CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
      CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[e_mode]);
      // Transpose was rejected above, so Q == Q_1d here and this is the per-element quadrature point count
      CeedInt elem_dofs_size = CeedIntPow(Q, dim);
      CeedInt num_threads    = 1;
      CeedInt num_t_col      = 1;
      CeedInt shared_mem     = 0;

      switch (dim) {
        case 1:
          num_threads = Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D);
          shared_mem += sizeof(CeedScalar) * Q;              // for d_q_weight_1d
          shared_mem += sizeof(CeedScalar) * num_t_col * Q;  // for output
          break;
        case 2:
          num_threads = Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
          shared_mem += sizeof(CeedScalar) * Q;  // for d_q_weight_1d
          break;
        case 3:
          num_threads = Q * Q;
          num_t_col   = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
          shared_mem += sizeof(CeedScalar) * Q;  // for d_q_weight_1d
          break;
      }
      CeedInt grid   = CeedDivUpInt(num_elem, num_t_col);
      void   *args[] = {&impl->d_q_weight_1d, &d_v, &elem_dofs_size, &num_elem};

      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, NULL, grid, num_threads, num_t_col, 1, shared_mem, args));
    } break;
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[e_mode]);
    case CEED_EVAL_NONE:
      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
      // LCOV_EXCL_STOP
  }

  // Must sync to ensure completeness
  ceed_magma_queue_sync(data->queue);

  // Restore vectors
  // Release read access under the same condition it was acquired, so a WEIGHT call that was handed a real input vector
  // does not leave that vector with an outstanding read
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
2557f5b9731SStan Tomov 
// Tensor basis apply entry point: forwards every argument to CeedBasisApplyCore_Magma with apply_add = false.
// Returns CEED_ERROR_SUCCESS, or the error produced by CeedCallBackend if the core routine fails.
static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) {
  const bool accumulate = false;

  CeedCallBackend(CeedBasisApplyCore_Magma(basis, accumulate, num_elem, t_mode, e_mode, u, v));
  return CEED_ERROR_SUCCESS;
}
260db2becc9SJeremy L Thompson 
// Tensor basis apply-add entry point: forwards every argument to CeedBasisApplyCore_Magma with apply_add = true.
// Returns CEED_ERROR_SUCCESS, or the error produced by CeedCallBackend if the core routine fails.
static int CeedBasisApplyAdd_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) {
  const bool accumulate = true;

  CeedCallBackend(CeedBasisApplyCore_Magma(basis, accumulate, num_elem, t_mode, e_mode, u, v));
  return CEED_ERROR_SUCCESS;
}
265db2becc9SJeremy L Thompson 
266940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
26714950a8eSJeremy L Thompson // Basis apply - tensor AtPoints
26814950a8eSJeremy L Thompson //------------------------------------------------------------------------------
// Placeholder for applying a tensor basis at arbitrary points: no work is done and every call reports a
// CEED_ERROR_BACKEND error on the basis's Ceed. All arguments other than basis are ignored.
int CeedBasisApplyAtPoints_Magma(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
                                 CeedVector x_ref, CeedVector u, CeedVector v) {
  Ceed ceed = CeedBasisReturnCeed(basis);

  return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement CeedBasisApplyAtPoints");
}
27314950a8eSJeremy L Thompson 
27414950a8eSJeremy L Thompson //------------------------------------------------------------------------------
275940a72f1SSebastian Grimberg // Basis apply - non-tensor
276940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
CeedBasisApplyNonTensorCore_Magma(CeedBasis basis,bool apply_add,CeedInt num_elem,CeedTransposeMode t_mode,CeedEvalMode e_mode,CeedVector u,CeedVector v)277db2becc9SJeremy L Thompson static int CeedBasisApplyNonTensorCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode,
278db2becc9SJeremy L Thompson                                              CeedVector u, CeedVector v) {
279868539c2SNatalie Beams   Ceed                      ceed;
280e0582403Sabdelfattah83   Ceed_Magma               *data;
2817251047cSSebastian Grimberg   CeedInt                   num_comp, num_nodes, num_qpts, P, Q, N;
2827251047cSSebastian Grimberg   const CeedScalar         *d_u;
283940a72f1SSebastian Grimberg   CeedScalar               *d_v;
28438293ee6SJeremy L Thompson   CeedBasisNonTensor_Magma *impl;
28538293ee6SJeremy L Thompson 
28638293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
28738293ee6SJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
288940a72f1SSebastian Grimberg   CeedCallBackend(CeedBasisGetData(basis, &impl));
28938293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
290940a72f1SSebastian Grimberg   CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
29138293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
292940a72f1SSebastian Grimberg   P = num_nodes;
293940a72f1SSebastian Grimberg   Q = num_qpts;
294940a72f1SSebastian Grimberg   N = num_elem * num_comp;
29538293ee6SJeremy L Thompson 
296940a72f1SSebastian Grimberg   // Read vectors
297940a72f1SSebastian Grimberg   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
29838293ee6SJeremy L Thompson   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
299db2becc9SJeremy L Thompson   if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
300db2becc9SJeremy L Thompson   else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
301868539c2SNatalie Beams 
3027251047cSSebastian Grimberg   // Compile kernels for N as needed
3037251047cSSebastian Grimberg   CeedInt iN = 0;
3047251047cSSebastian Grimberg   if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q && (e_mode != CEED_EVAL_WEIGHT || !impl->Weight)) {
305940a72f1SSebastian Grimberg     CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES};
3067251047cSSebastian Grimberg     CeedInt diff                                      = abs(n_array[iN] - N), idiff;
30738293ee6SJeremy L Thompson 
308023b8a51Sabdelfattah83     for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
309940a72f1SSebastian Grimberg       idiff = abs(n_array[in] - N);
310023b8a51Sabdelfattah83       if (idiff < diff) {
311023b8a51Sabdelfattah83         iN   = in;
312023b8a51Sabdelfattah83         diff = idiff;
313868539c2SNatalie Beams       }
31480a9ef05SNatalie Beams     }
31580a9ef05SNatalie Beams 
316940a72f1SSebastian Grimberg     if (!impl->NB_interp[iN]) {
3179d15e85bSSebastian Grimberg       CeedFESpace fe_space;
3189d15e85bSSebastian Grimberg       CeedInt     q_comp_interp, q_comp_deriv;
319940a72f1SSebastian Grimberg       Ceed        ceed_delegate;
32022070f95SJeremy L Thompson       char       *basis_kernel_source;
32122070f95SJeremy L Thompson       const char *basis_kernel_path, *weight_kernel_path;
322509d4af6SJeremy L Thompson       char      **file_paths     = NULL;
323509d4af6SJeremy L Thompson       CeedInt     num_file_paths = 0;
324940a72f1SSebastian Grimberg       magma_int_t arch           = magma_getdevice_arch();
32580a9ef05SNatalie Beams 
326940a72f1SSebastian Grimberg       // Tuning parameters for NB
3279d15e85bSSebastian Grimberg       CeedCallBackend(CeedBasisGetFESpace(basis, &fe_space));
3289d15e85bSSebastian Grimberg       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
3299d15e85bSSebastian Grimberg       switch (fe_space) {
3309d15e85bSSebastian Grimberg         case CEED_FE_SPACE_H1:
3319d15e85bSSebastian Grimberg           CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_deriv));
3329d15e85bSSebastian Grimberg           break;
3339d15e85bSSebastian Grimberg         case CEED_FE_SPACE_HDIV:
3349d15e85bSSebastian Grimberg           CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_deriv));
3359d15e85bSSebastian Grimberg           break;
3369d15e85bSSebastian Grimberg         case CEED_FE_SPACE_HCURL:
3379d15e85bSSebastian Grimberg           CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_deriv));
3389d15e85bSSebastian Grimberg           break;
3399d15e85bSSebastian Grimberg       }
3409d15e85bSSebastian Grimberg       impl->NB_interp[iN]   = nontensor_rtc_get_nb(arch, 'n', q_comp_interp, P, Q, n_array[iN]);
3419d15e85bSSebastian Grimberg       impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_interp, P, Q, n_array[iN]);
3429d15e85bSSebastian Grimberg       impl->NB_deriv[iN]    = nontensor_rtc_get_nb(arch, 'n', q_comp_deriv, P, Q, n_array[iN]);
3439d15e85bSSebastian Grimberg       impl->NB_deriv_t[iN]  = nontensor_rtc_get_nb(arch, 't', q_comp_deriv, P, Q, n_array[iN]);
344023b8a51Sabdelfattah83 
345940a72f1SSebastian Grimberg       // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
346940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
347023b8a51Sabdelfattah83 
348940a72f1SSebastian Grimberg       // Compile kernels
3499d15e85bSSebastian Grimberg       CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h", &basis_kernel_path));
350940a72f1SSebastian Grimberg       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
351509d4af6SJeremy L Thompson       CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, basis_kernel_path, &num_file_paths, &file_paths, &basis_kernel_source));
3527251047cSSebastian Grimberg       if (!impl->Weight) {
3537251047cSSebastian Grimberg         CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
354509d4af6SJeremy L Thompson         CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_kernel_path, &num_file_paths, &file_paths, &basis_kernel_source));
3557251047cSSebastian Grimberg       }
356940a72f1SSebastian Grimberg       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
3577251047cSSebastian Grimberg       CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[iN], 8, "BASIS_Q_COMP_INTERP", q_comp_interp,
3589d15e85bSSebastian Grimberg                                        "BASIS_Q_COMP_DERIV", q_comp_deriv, "BASIS_P", P, "BASIS_Q", Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN],
3599d15e85bSSebastian Grimberg                                        "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_DERIV_N", impl->NB_deriv[iN], "BASIS_NB_DERIV_T",
3609d15e85bSSebastian Grimberg                                        impl->NB_deriv_t[iN]));
3617251047cSSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_n", &impl->Interp[iN]));
3627251047cSSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN]));
363db2becc9SJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_ta", &impl->InterpTransposeAdd[iN]));
3647251047cSSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN]));
3657251047cSSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN]));
366db2becc9SJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_ta", &impl->DerivTransposeAdd[iN]));
3677251047cSSebastian Grimberg       if (!impl->Weight) {
3687251047cSSebastian Grimberg         CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_weight_nontensor", &impl->Weight));
3697251047cSSebastian Grimberg         CeedCallBackend(CeedFree(&weight_kernel_path));
3707251047cSSebastian Grimberg       }
3719d15e85bSSebastian Grimberg       CeedCallBackend(CeedFree(&basis_kernel_path));
372940a72f1SSebastian Grimberg       CeedCallBackend(CeedFree(&basis_kernel_source));
3735a5594ffSJeremy L Thompson       for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
3745a5594ffSJeremy L Thompson       CeedCallBackend(CeedFree(&file_paths));
3759bc66399SJeremy L Thompson       CeedCallBackend(CeedDestroy(&ceed_delegate));
376940a72f1SSebastian Grimberg     }
3777251047cSSebastian Grimberg   }
3787251047cSSebastian Grimberg 
3797251047cSSebastian Grimberg   // Apply basis operation
3807251047cSSebastian Grimberg   if (e_mode != CEED_EVAL_WEIGHT) {
3817251047cSSebastian Grimberg     const CeedScalar *d_b = NULL;
3827251047cSSebastian Grimberg     CeedInt           q_comp, NB, M, K;
3837251047cSSebastian Grimberg     CeedMagmaFunction Kernel;
3847251047cSSebastian Grimberg 
3857251047cSSebastian Grimberg     switch (e_mode) {
3867251047cSSebastian Grimberg       case CEED_EVAL_INTERP:
3877251047cSSebastian Grimberg         d_b = impl->d_interp;
3887251047cSSebastian Grimberg         break;
3897251047cSSebastian Grimberg       case CEED_EVAL_GRAD:
3907251047cSSebastian Grimberg         d_b = impl->d_grad;
3917251047cSSebastian Grimberg         break;
3927251047cSSebastian Grimberg       case CEED_EVAL_DIV:
3937251047cSSebastian Grimberg         d_b = impl->d_div;
3947251047cSSebastian Grimberg         break;
3957251047cSSebastian Grimberg       case CEED_EVAL_CURL:
3967251047cSSebastian Grimberg         d_b = impl->d_curl;
3977251047cSSebastian Grimberg         break;
3987251047cSSebastian Grimberg       // LCOV_EXCL_START
3997251047cSSebastian Grimberg       case CEED_EVAL_WEIGHT:
4007251047cSSebastian Grimberg       case CEED_EVAL_NONE:
401bcbe1c99SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "%s does not make sense in this context", CeedEvalModes[e_mode]);
4027251047cSSebastian Grimberg         // LCOV_EXCL_STOP
4037251047cSSebastian Grimberg     }
4047251047cSSebastian Grimberg     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, e_mode, &q_comp));
4057251047cSSebastian Grimberg     M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? Q : P;
4067251047cSSebastian Grimberg 
4077251047cSSebastian Grimberg     if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
4089d15e85bSSebastian Grimberg       if (e_mode == CEED_EVAL_INTERP) {
4099d15e85bSSebastian Grimberg         if (t_mode == CEED_TRANSPOSE) {
410db2becc9SJeremy L Thompson           Kernel = apply_add ? impl->InterpTransposeAdd[iN] : impl->InterpTranspose[iN];
4119d15e85bSSebastian Grimberg           NB     = impl->NB_interp_t[iN];
4129d15e85bSSebastian Grimberg         } else {
4139d15e85bSSebastian Grimberg           Kernel = impl->Interp[iN];
4149d15e85bSSebastian Grimberg           NB     = impl->NB_interp[iN];
4159d15e85bSSebastian Grimberg         }
4169d15e85bSSebastian Grimberg       } else {
4179d15e85bSSebastian Grimberg         if (t_mode == CEED_TRANSPOSE) {
418db2becc9SJeremy L Thompson           Kernel = apply_add ? impl->DerivTransposeAdd[iN] : impl->DerivTranspose[iN];
4199d15e85bSSebastian Grimberg           NB     = impl->NB_deriv_t[iN];
4209d15e85bSSebastian Grimberg         } else {
4219d15e85bSSebastian Grimberg           Kernel = impl->Deriv[iN];
4229d15e85bSSebastian Grimberg           NB     = impl->NB_deriv[iN];
4239d15e85bSSebastian Grimberg         }
4249d15e85bSSebastian Grimberg       }
425940a72f1SSebastian Grimberg       CeedInt num_t_col    = MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D);
4269d15e85bSSebastian Grimberg       CeedInt grid         = CeedDivUpInt(N, num_t_col * NB);
427833aa127SSebastian Grimberg       CeedInt shared_mem_A = P * Q * sizeof(CeedScalar);
428940a72f1SSebastian Grimberg       CeedInt shared_mem_B = num_t_col * K * NB * sizeof(CeedScalar);
429833aa127SSebastian Grimberg       CeedInt shared_mem   = (t_mode != CEED_TRANSPOSE && q_comp > 1) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
4309d15e85bSSebastian Grimberg       void   *args[]       = {&N, &d_b, &d_u, &d_v};
431940a72f1SSebastian Grimberg 
432e9c76bddSJeremy L Thompson       CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, NULL, grid, M, num_t_col, 1, shared_mem, args));
4339d15e85bSSebastian Grimberg     } else {
4349d15e85bSSebastian Grimberg       for (CeedInt d = 0; d < q_comp; d++) {
43538293ee6SJeremy L Thompson         if (t_mode == CEED_TRANSPOSE) {
436db2becc9SJeremy L Thompson           const CeedScalar beta = (apply_add || (d > 0)) ? 1.0 : 0.0;
4379d15e85bSSebastian Grimberg           magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, d_b + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P, data->queue);
438940a72f1SSebastian Grimberg         } else {
4399d15e85bSSebastian Grimberg           magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, d_b + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue);
440940a72f1SSebastian Grimberg         }
441940a72f1SSebastian Grimberg       }
442940a72f1SSebastian Grimberg     }
443940a72f1SSebastian Grimberg   } else {
444940a72f1SSebastian Grimberg     CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
445097cc795SJames Wright     CeedCheck(impl->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight not set", CeedEvalModes[e_mode]);
446940a72f1SSebastian Grimberg     CeedInt num_t_col  = MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D);
447940a72f1SSebastian Grimberg     CeedInt grid       = CeedDivUpInt(num_elem, num_t_col);
448940a72f1SSebastian Grimberg     CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar);
4499d15e85bSSebastian Grimberg     void   *args[]     = {&num_elem, &impl->d_q_weight, &d_v};
450868539c2SNatalie Beams 
451e9c76bddSJeremy L Thompson     CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, NULL, grid, Q, num_t_col, 1, shared_mem, args));
452940a72f1SSebastian Grimberg   }
453940a72f1SSebastian Grimberg 
454940a72f1SSebastian Grimberg   // Must sync to ensure completeness
455e0582403Sabdelfattah83   ceed_magma_queue_sync(data->queue);
456e0582403Sabdelfattah83 
457940a72f1SSebastian Grimberg   // Restore vectors
45838293ee6SJeremy L Thompson   if (e_mode != CEED_EVAL_WEIGHT) {
459940a72f1SSebastian Grimberg     CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
460868539c2SNatalie Beams   }
461940a72f1SSebastian Grimberg   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
4629bc66399SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
463e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
464868539c2SNatalie Beams }
465868539c2SNatalie Beams 
// Apply a non-tensor basis, forwarding every argument to CeedBasisApplyNonTensorCore_Magma
// with the add flag cleared. A failure reported by the core is returned through CeedCallBackend.
static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
                                         CeedVector v) {
  const bool accumulate = false;

  CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, accumulate, num_elem, t_mode, e_mode, u, v));
  return CEED_ERROR_SUCCESS;
}
471db2becc9SJeremy L Thompson 
// Apply a non-tensor basis, forwarding every argument to CeedBasisApplyNonTensorCore_Magma
// with the add flag set. A failure reported by the core is returned through CeedCallBackend.
static int CeedBasisApplyAddNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
                                            CeedVector v) {
  const bool accumulate = true;

  CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, accumulate, num_elem, t_mode, e_mode, u, v));
  return CEED_ERROR_SUCCESS;
}
477db2becc9SJeremy L Thompson 
478940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
479940a72f1SSebastian Grimberg // Destroy tensor basis
480940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
CeedBasisDestroy_Magma(CeedBasis basis)481940a72f1SSebastian Grimberg static int CeedBasisDestroy_Magma(CeedBasis basis) {
482f6af633fSnbeams   Ceed             ceed;
48338293ee6SJeremy L Thompson   CeedBasis_Magma *impl;
48438293ee6SJeremy L Thompson 
4852b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
486940a72f1SSebastian Grimberg   CeedCallBackend(CeedBasisGetData(basis, &impl));
487e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
4882b730f8bSJeremy L Thompson   CeedCallHip(ceed, hipModuleUnload(impl->module));
489f6af633fSnbeams #else
4902b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuModuleUnload(impl->module));
491f6af633fSnbeams #endif
492940a72f1SSebastian Grimberg   CeedCallBackend(magma_free(impl->d_interp_1d));
493940a72f1SSebastian Grimberg   CeedCallBackend(magma_free(impl->d_grad_1d));
494097cc795SJames Wright   if (impl->d_q_weight_1d) CeedCallBackend(magma_free(impl->d_q_weight_1d));
4952b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
4969bc66399SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
497e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
4987f5b9731SStan Tomov }
4997f5b9731SStan Tomov 
500940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
501940a72f1SSebastian Grimberg // Destroy non-tensor basis
502940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
CeedBasisDestroyNonTensor_Magma(CeedBasis basis)503940a72f1SSebastian Grimberg static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
504023b8a51Sabdelfattah83   Ceed                      ceed;
50538293ee6SJeremy L Thompson   CeedBasisNonTensor_Magma *impl;
50638293ee6SJeremy L Thompson 
507940a72f1SSebastian Grimberg   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
50838293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
509940a72f1SSebastian Grimberg   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
5107251047cSSebastian Grimberg     if (impl->module[in]) {
511940a72f1SSebastian Grimberg #ifdef CEED_MAGMA_USE_HIP
5127251047cSSebastian Grimberg       CeedCallHip(ceed, hipModuleUnload(impl->module[in]));
513940a72f1SSebastian Grimberg #else
5147251047cSSebastian Grimberg       CeedCallCuda(ceed, cuModuleUnload(impl->module[in]));
515940a72f1SSebastian Grimberg #endif
516940a72f1SSebastian Grimberg     }
517940a72f1SSebastian Grimberg   }
51838293ee6SJeremy L Thompson   CeedCallBackend(magma_free(impl->d_interp));
51938293ee6SJeremy L Thompson   CeedCallBackend(magma_free(impl->d_grad));
5209d15e85bSSebastian Grimberg   CeedCallBackend(magma_free(impl->d_div));
5219d15e85bSSebastian Grimberg   CeedCallBackend(magma_free(impl->d_curl));
522097cc795SJames Wright   if (impl->d_q_weight) CeedCallBackend(magma_free(impl->d_q_weight));
5232b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
5249bc66399SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
525e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
526868539c2SNatalie Beams }
527868539c2SNatalie Beams 
528940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
529940a72f1SSebastian Grimberg // Create tensor
530940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
CeedBasisCreateTensorH1_Magma(CeedInt dim,CeedInt P_1d,CeedInt Q_1d,const CeedScalar * interp_1d,const CeedScalar * grad_1d,const CeedScalar * q_ref_1d,const CeedScalar * q_weight_1d,CeedBasis basis)531940a72f1SSebastian Grimberg int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
53238293ee6SJeremy L Thompson                                   const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
53338293ee6SJeremy L Thompson   Ceed             ceed, ceed_delegate;
53438293ee6SJeremy L Thompson   Ceed_Magma      *data;
53522070f95SJeremy L Thompson   char            *basis_kernel_source;
53622070f95SJeremy L Thompson   const char      *interp_kernel_path, *grad_kernel_path, *weight_kernel_path;
537509d4af6SJeremy L Thompson   char           **file_paths     = NULL;
538509d4af6SJeremy L Thompson   CeedInt          num_file_paths = 0;
539940a72f1SSebastian Grimberg   CeedInt          num_comp;
5407f5b9731SStan Tomov   CeedBasis_Magma *impl;
54138293ee6SJeremy L Thompson 
5422b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
5432b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
544940a72f1SSebastian Grimberg   CeedCallBackend(CeedCalloc(1, &impl));
545e0582403Sabdelfattah83 
546940a72f1SSebastian Grimberg   // Copy basis data to GPU
547097cc795SJames Wright   if (q_weight_1d) {
548940a72f1SSebastian Grimberg     CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0])));
549940a72f1SSebastian Grimberg     magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue);
550097cc795SJames Wright   }
55138293ee6SJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0])));
55238293ee6SJeremy L Thompson   magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue);
55338293ee6SJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0])));
55438293ee6SJeremy L Thompson   magma_setvector(Q_1d * P_1d, sizeof(grad_1d[0]), grad_1d, 1, impl->d_grad_1d, 1, data->queue);
5557f5b9731SStan Tomov 
556940a72f1SSebastian Grimberg   // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
557940a72f1SSebastian Grimberg   CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
558940a72f1SSebastian Grimberg 
559940a72f1SSebastian Grimberg   // Compile kernels
560940a72f1SSebastian Grimberg   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
561940a72f1SSebastian Grimberg   {
562940a72f1SSebastian Grimberg     char   *interp_kernel_name_base = "ceed/jit-source/magma/magma-basis-interp";
563940a72f1SSebastian Grimberg     CeedInt interp_kernel_name_len  = strlen(interp_kernel_name_base) + 6;
564940a72f1SSebastian Grimberg     char    interp_kernel_name[interp_kernel_name_len];
565940a72f1SSebastian Grimberg 
566940a72f1SSebastian Grimberg     snprintf(interp_kernel_name, interp_kernel_name_len, "%s-%" CeedInt_FMT "d.h", interp_kernel_name_base, dim);
567940a72f1SSebastian Grimberg     CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_kernel_name, &interp_kernel_path));
568940a72f1SSebastian Grimberg   }
569940a72f1SSebastian Grimberg   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
570509d4af6SJeremy L Thompson   CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, interp_kernel_path, &num_file_paths, &file_paths, &basis_kernel_source));
571940a72f1SSebastian Grimberg   {
572940a72f1SSebastian Grimberg     char   *grad_kernel_name_base = "ceed/jit-source/magma/magma-basis-grad";
573940a72f1SSebastian Grimberg     CeedInt grad_kernel_name_len  = strlen(grad_kernel_name_base) + 6;
574940a72f1SSebastian Grimberg     char    grad_kernel_name[grad_kernel_name_len];
575940a72f1SSebastian Grimberg 
576940a72f1SSebastian Grimberg     snprintf(grad_kernel_name, grad_kernel_name_len, "%s-%" CeedInt_FMT "d.h", grad_kernel_name_base, dim);
577940a72f1SSebastian Grimberg     CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_kernel_name, &grad_kernel_path));
578940a72f1SSebastian Grimberg   }
579509d4af6SJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_kernel_path, &num_file_paths, &file_paths, &basis_kernel_source));
580940a72f1SSebastian Grimberg   {
581940a72f1SSebastian Grimberg     char   *weight_kernel_name_base = "ceed/jit-source/magma/magma-basis-weight";
582940a72f1SSebastian Grimberg     CeedInt weight_kernel_name_len  = strlen(weight_kernel_name_base) + 6;
583940a72f1SSebastian Grimberg     char    weight_kernel_name[weight_kernel_name_len];
584940a72f1SSebastian Grimberg 
585940a72f1SSebastian Grimberg     snprintf(weight_kernel_name, weight_kernel_name_len, "%s-%" CeedInt_FMT "d.h", weight_kernel_name_base, dim);
586940a72f1SSebastian Grimberg     CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_kernel_name, &weight_kernel_path));
587940a72f1SSebastian Grimberg   }
588509d4af6SJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_kernel_path, &num_file_paths, &file_paths, &basis_kernel_source));
589940a72f1SSebastian Grimberg   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
590940a72f1SSebastian Grimberg   CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module, 5, "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_P",
591940a72f1SSebastian Grimberg                                    P_1d, "BASIS_Q", Q_1d, "BASIS_MAX_P_Q", CeedIntMax(P_1d, Q_1d)));
592940a72f1SSebastian Grimberg   switch (dim) {
593940a72f1SSebastian Grimberg     case 1:
594940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->Interp));
595940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->InterpTranspose));
596db2becc9SJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_1d_kernel", &impl->InterpTransposeAdd));
597940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->Grad));
598940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->GradTranspose));
599db2becc9SJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_1d_kernel", &impl->GradTransposeAdd));
600940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->Weight));
601940a72f1SSebastian Grimberg       break;
602940a72f1SSebastian Grimberg     case 2:
603940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->Interp));
604940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->InterpTranspose));
605db2becc9SJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_2d_kernel", &impl->InterpTransposeAdd));
606940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->Grad));
607940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->GradTranspose));
608db2becc9SJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_2d_kernel", &impl->GradTransposeAdd));
609940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->Weight));
610940a72f1SSebastian Grimberg       break;
611940a72f1SSebastian Grimberg     case 3:
612940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->Interp));
613940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->InterpTranspose));
614db2becc9SJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_3d_kernel", &impl->InterpTransposeAdd));
615940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->Grad));
616940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->GradTranspose));
617db2becc9SJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_3d_kernel", &impl->GradTransposeAdd));
618940a72f1SSebastian Grimberg       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->Weight));
619940a72f1SSebastian Grimberg       break;
620940a72f1SSebastian Grimberg   }
621940a72f1SSebastian Grimberg   CeedCallBackend(CeedFree(&interp_kernel_path));
622940a72f1SSebastian Grimberg   CeedCallBackend(CeedFree(&grad_kernel_path));
623940a72f1SSebastian Grimberg   CeedCallBackend(CeedFree(&weight_kernel_path));
624940a72f1SSebastian Grimberg   CeedCallBackend(CeedFree(&basis_kernel_source));
6255a5594ffSJeremy L Thompson   for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
6265a5594ffSJeremy L Thompson   CeedCallBackend(CeedFree(&file_paths));
6277f5b9731SStan Tomov 
6282b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisSetData(basis, impl));
629940a72f1SSebastian Grimberg 
630940a72f1SSebastian Grimberg   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
631db2becc9SJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Magma));
63214950a8eSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Magma));
633940a72f1SSebastian Grimberg   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
6349bc66399SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
6359bc66399SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed_delegate));
636e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
6377f5b9731SStan Tomov }
6387f5b9731SStan Tomov 
639940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
640940a72f1SSebastian Grimberg // Create non-tensor H^1
641940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
CeedBasisCreateH1_Magma(CeedElemTopology topo,CeedInt dim,CeedInt num_nodes,CeedInt num_qpts,const CeedScalar * interp,const CeedScalar * grad,const CeedScalar * q_ref,const CeedScalar * q_weight,CeedBasis basis)642940a72f1SSebastian Grimberg int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
64338293ee6SJeremy L Thompson                             const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
6447251047cSSebastian Grimberg   Ceed                      ceed;
645e0582403Sabdelfattah83   Ceed_Magma               *data;
64638293ee6SJeremy L Thompson   CeedBasisNonTensor_Magma *impl;
64738293ee6SJeremy L Thompson 
64838293ee6SJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
6492b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
6502b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &impl));
651023b8a51Sabdelfattah83 
652940a72f1SSebastian Grimberg   // Copy basis data to GPU
653097cc795SJames Wright   if (q_weight) {
65438293ee6SJeremy L Thompson     CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
65538293ee6SJeremy L Thompson     magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
656097cc795SJames Wright   }
6579d15e85bSSebastian Grimberg   if (interp) {
6589d15e85bSSebastian Grimberg     CeedInt q_comp_interp;
6599d15e85bSSebastian Grimberg 
6609d15e85bSSebastian Grimberg     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
6619d15e85bSSebastian Grimberg     CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0])));
6629d15e85bSSebastian Grimberg     magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
6639d15e85bSSebastian Grimberg   }
6649d15e85bSSebastian Grimberg   if (grad) {
6659d15e85bSSebastian Grimberg     CeedInt q_comp_grad;
6669d15e85bSSebastian Grimberg 
6679d15e85bSSebastian Grimberg     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
6689d15e85bSSebastian Grimberg     CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_nodes * q_comp_grad * sizeof(grad[0])));
6699d15e85bSSebastian Grimberg     magma_setvector(num_qpts * num_nodes * q_comp_grad, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue);
6709d15e85bSSebastian Grimberg   }
6719d15e85bSSebastian Grimberg 
6727251047cSSebastian Grimberg   // Compile the weight kernel if it won't be compiled later on
6737251047cSSebastian Grimberg   if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
6747251047cSSebastian Grimberg     Ceed        ceed_delegate;
67522070f95SJeremy L Thompson     char       *basis_kernel_source;
67622070f95SJeremy L Thompson     const char *weight_kernel_path;
6777251047cSSebastian Grimberg 
6789d15e85bSSebastian Grimberg     // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
6799d15e85bSSebastian Grimberg     CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
6809d15e85bSSebastian Grimberg 
6819d15e85bSSebastian Grimberg     // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply)
6829d15e85bSSebastian Grimberg     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
6839d15e85bSSebastian Grimberg     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
6849d15e85bSSebastian Grimberg     CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source));
6859d15e85bSSebastian Grimberg     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
6867251047cSSebastian Grimberg     CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts));
6877251047cSSebastian Grimberg     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));
6889d15e85bSSebastian Grimberg     CeedCallBackend(CeedFree(&weight_kernel_path));
6899d15e85bSSebastian Grimberg     CeedCallBackend(CeedFree(&basis_kernel_source));
6909bc66399SJeremy L Thompson     CeedCallBackend(CeedDestroy(&ceed_delegate));
6917251047cSSebastian Grimberg   }
6929d15e85bSSebastian Grimberg 
6939d15e85bSSebastian Grimberg   CeedCallBackend(CeedBasisSetData(basis, impl));
6949d15e85bSSebastian Grimberg 
6959d15e85bSSebastian Grimberg   // Register backend functions
6969d15e85bSSebastian Grimberg   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
697db2becc9SJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
6989d15e85bSSebastian Grimberg   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
6999bc66399SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
7009d15e85bSSebastian Grimberg   return CEED_ERROR_SUCCESS;
7019d15e85bSSebastian Grimberg }
7029d15e85bSSebastian Grimberg 
7039d15e85bSSebastian Grimberg //------------------------------------------------------------------------------
7049d15e85bSSebastian Grimberg // Create non-tensor H(div)
7059d15e85bSSebastian Grimberg //------------------------------------------------------------------------------
CeedBasisCreateHdiv_Magma(CeedElemTopology topo,CeedInt dim,CeedInt num_nodes,CeedInt num_qpts,const CeedScalar * interp,const CeedScalar * div,const CeedScalar * q_ref,const CeedScalar * q_weight,CeedBasis basis)7069d15e85bSSebastian Grimberg int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
7079d15e85bSSebastian Grimberg                               const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
7087251047cSSebastian Grimberg   Ceed                      ceed;
7099d15e85bSSebastian Grimberg   Ceed_Magma               *data;
7109d15e85bSSebastian Grimberg   CeedBasisNonTensor_Magma *impl;
7119d15e85bSSebastian Grimberg 
7129d15e85bSSebastian Grimberg   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
7139d15e85bSSebastian Grimberg   CeedCallBackend(CeedGetData(ceed, &data));
7149d15e85bSSebastian Grimberg   CeedCallBackend(CeedCalloc(1, &impl));
7159d15e85bSSebastian Grimberg 
7169d15e85bSSebastian Grimberg   // Copy basis data to GPU
717097cc795SJames Wright   if (q_weight) {
7189d15e85bSSebastian Grimberg     CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
7199d15e85bSSebastian Grimberg     magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
720097cc795SJames Wright   }
7219d15e85bSSebastian Grimberg   if (interp) {
7229d15e85bSSebastian Grimberg     CeedInt q_comp_interp;
7239d15e85bSSebastian Grimberg 
7249d15e85bSSebastian Grimberg     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
7259d15e85bSSebastian Grimberg     CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0])));
7269d15e85bSSebastian Grimberg     magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
7279d15e85bSSebastian Grimberg   }
7289d15e85bSSebastian Grimberg   if (div) {
7299d15e85bSSebastian Grimberg     CeedInt q_comp_div;
7309d15e85bSSebastian Grimberg 
7319d15e85bSSebastian Grimberg     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
7329d15e85bSSebastian Grimberg     CeedCallBackend(magma_malloc((void **)&impl->d_div, num_qpts * num_nodes * q_comp_div * sizeof(div[0])));
7339d15e85bSSebastian Grimberg     magma_setvector(num_qpts * num_nodes * q_comp_div, sizeof(div[0]), div, 1, impl->d_div, 1, data->queue);
7349d15e85bSSebastian Grimberg   }
7359d15e85bSSebastian Grimberg 
7367251047cSSebastian Grimberg   // Compile the weight kernel if it won't be compiled later on
7377251047cSSebastian Grimberg   if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
7387251047cSSebastian Grimberg     Ceed        ceed_delegate;
73922070f95SJeremy L Thompson     char       *basis_kernel_source;
74022070f95SJeremy L Thompson     const char *weight_kernel_path;
7417251047cSSebastian Grimberg 
7429d15e85bSSebastian Grimberg     // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
7439d15e85bSSebastian Grimberg     CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
7449d15e85bSSebastian Grimberg 
7459d15e85bSSebastian Grimberg     // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply)
7469d15e85bSSebastian Grimberg     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
7479d15e85bSSebastian Grimberg     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
7489d15e85bSSebastian Grimberg     CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source));
7499d15e85bSSebastian Grimberg     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
7507251047cSSebastian Grimberg     CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts));
7517251047cSSebastian Grimberg     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));
7529d15e85bSSebastian Grimberg     CeedCallBackend(CeedFree(&weight_kernel_path));
7539d15e85bSSebastian Grimberg     CeedCallBackend(CeedFree(&basis_kernel_source));
7549bc66399SJeremy L Thompson     CeedCallBackend(CeedDestroy(&ceed_delegate));
7557251047cSSebastian Grimberg   }
7569d15e85bSSebastian Grimberg 
7579d15e85bSSebastian Grimberg   CeedCallBackend(CeedBasisSetData(basis, impl));
7589d15e85bSSebastian Grimberg 
7599d15e85bSSebastian Grimberg   // Register backend functions
7609d15e85bSSebastian Grimberg   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
761db2becc9SJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
7629d15e85bSSebastian Grimberg   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
7639bc66399SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
7649d15e85bSSebastian Grimberg   return CEED_ERROR_SUCCESS;
7659d15e85bSSebastian Grimberg }
7669d15e85bSSebastian Grimberg 
7679d15e85bSSebastian Grimberg //------------------------------------------------------------------------------
7689d15e85bSSebastian Grimberg // Create non-tensor H(curl)
7699d15e85bSSebastian Grimberg //------------------------------------------------------------------------------
// Backend constructor for a non-tensor H(curl) basis.
//
// Allocates the CeedBasisNonTensor_Magma backend data, copies whichever of q_weight, interp and curl are non-NULL to the GPU
// through the MAGMA queue stored in the Ceed_Magma data, attaches the backend data to the basis and registers the
// "Apply", "ApplyAdd" and "Destroy" backend functions.
// The weight kernel is compiled here only when num_nodes or num_qpts exceeds the MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P/Q limits.
// topo, dim and q_ref are not read by this function.
// Returns CEED_ERROR_SUCCESS when every step succeeds.
// NOTE(review): CeedCallBackend presumably returns early with the failing call's error code - confirm against its definition.
int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
                               const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
  Ceed                      ceed;
  Ceed_Magma               *data;
  CeedBasisNonTensor_Magma *impl;
  // True when either dimension is above the custom kernel limits
  const bool compile_weight_now = num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedGetData(ceed, &data));
  CeedCallBackend(CeedCalloc(1, &impl));

  // Quadrature weights -> GPU (optional)
  if (q_weight) {
    CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
    magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
  }

  // Interpolation matrix -> GPU (optional)
  if (interp) {
    CeedInt num_comp, num_entries;

    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &num_comp));
    num_entries = num_qpts * num_nodes * num_comp;
    CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_entries * sizeof(interp[0])));
    magma_setvector(num_entries, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
  }

  // Curl matrix -> GPU (optional)
  if (curl) {
    CeedInt num_comp, num_entries;

    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &num_comp));
    num_entries = num_qpts * num_nodes * num_comp;
    CeedCallBackend(magma_malloc((void **)&impl->d_curl, num_entries * sizeof(curl[0])));
    magma_setvector(num_entries, sizeof(curl[0]), curl, 1, impl->d_curl, 1, data->queue);
  }

  // Compile the weight kernel now if it won't be compiled later on
  // (the remainder of kernel compilation happens at first call to CeedBasisApply)
  if (compile_weight_now) {
    Ceed        delegate;
    char       *weight_source;
    const char *weight_path;

    // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
    CeedCallBackend(CeedGetDelegate(ceed, &delegate));

    // Locate and load the weight kernel source
    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_path));
    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
    CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_path, &weight_source));
    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");

    // Build the module with BASIS_Q defined as num_qpts and fetch the weight kernel from it
    CeedCallBackend(CeedCompileMagma(delegate, weight_source, &impl->module[0], 1, "BASIS_Q", num_qpts));
    CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));

    // Release the temporaries acquired in this scope
    CeedCallBackend(CeedFree(&weight_path));
    CeedCallBackend(CeedFree(&weight_source));
    CeedCallBackend(CeedDestroy(&delegate));
  }

  // Attach the backend data to the basis
  CeedCallBackend(CeedBasisSetData(basis, impl));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
830940a72f1SSebastian Grimberg 
831940a72f1SSebastian Grimberg //------------------------------------------------------------------------------
832