xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma-basis.c (revision 49aac155e7a09736f56fb3abac0f57dab29f7cbf)
13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
37f5b9731SStan Tomov //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
57f5b9731SStan Tomov //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
77f5b9731SStan Tomov 
8*49aac155SJeremy L Thompson #include <ceed.h>
9ec3da8bcSJed Brown #include <ceed/backend.h>
10f6af633fSnbeams #include <ceed/jit-tools.h>
11f6af633fSnbeams #include <string.h>
122b730f8bSJeremy L Thompson 
137f5b9731SStan Tomov #include "ceed-magma.h"
14e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
15f6af633fSnbeams #include "../hip/ceed-hip-common.h"
16f6af633fSnbeams #include "../hip/ceed-hip-compile.h"
17f6af633fSnbeams #else
18f6af633fSnbeams #include "../cuda/ceed-cuda-common.h"
19f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h"
20f6af633fSnbeams #endif
217f5b9731SStan Tomov 
227f5b9731SStan Tomov #ifdef __cplusplus
237f5b9731SStan Tomov CEED_INTERN "C"
247f5b9731SStan Tomov #endif
252b730f8bSJeremy L Thompson     int
262b730f8bSJeremy L Thompson     CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) {
277f5b9731SStan Tomov   Ceed ceed;
282b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
29e0582403Sabdelfattah83   CeedInt dim, ncomp, ndof;
302b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
312b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
322b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof));
33e0582403Sabdelfattah83 
34e0582403Sabdelfattah83   Ceed_Magma *data;
352b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
36e0582403Sabdelfattah83 
377f5b9731SStan Tomov   const CeedScalar *u;
387f5b9731SStan Tomov   CeedScalar       *v;
39868539c2SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
402b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u));
417f5b9731SStan Tomov   } else if (emode != CEED_EVAL_WEIGHT) {
427f5b9731SStan Tomov     // LCOV_EXCL_START
432b730f8bSJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
447f5b9731SStan Tomov     // LCOV_EXCL_STOP
457f5b9731SStan Tomov   }
462b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &v));
477f5b9731SStan Tomov 
487f5b9731SStan Tomov   CeedBasis_Magma *impl;
492b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
507f5b9731SStan Tomov 
517f5b9731SStan Tomov   CeedInt P1d, Q1d;
522b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P1d));
532b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q1d));
547f5b9731SStan Tomov 
552b730f8bSJeremy L Thompson   CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * CeedIntPow(P1d, dim), ncomp);
567f5b9731SStan Tomov 
577f5b9731SStan Tomov   if (tmode == CEED_TRANSPOSE) {
581f9221feSJeremy L Thompson     CeedSize length;
592b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(V, &length));
6080a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
612b730f8bSJeremy L Thompson       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)v, length, data->queue);
6280a9ef05SNatalie Beams     } else {
632b730f8bSJeremy L Thompson       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)v, length, data->queue);
6480a9ef05SNatalie Beams     }
65e0582403Sabdelfattah83     ceed_magma_queue_sync(data->queue);
667f5b9731SStan Tomov   }
67f6af633fSnbeams 
683513a710Sjeremylt   switch (emode) {
693513a710Sjeremylt     case CEED_EVAL_INTERP: {
707f5b9731SStan Tomov       CeedInt P = P1d, Q = Q1d;
717f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
722b730f8bSJeremy L Thompson         P = Q1d;
732b730f8bSJeremy L Thompson         Q = P1d;
747f5b9731SStan Tomov       }
757f5b9731SStan Tomov 
767f5b9731SStan Tomov       // Define element sizes for dofs/quad
777f5b9731SStan Tomov       CeedInt elquadsize = CeedIntPow(Q1d, dim);
787f5b9731SStan Tomov       CeedInt eldofssize = CeedIntPow(P1d, dim);
797f5b9731SStan Tomov 
807f5b9731SStan Tomov       // E-vector ordering -------------- Q-vector ordering
81868539c2SNatalie Beams       //  component                        component
82868539c2SNatalie Beams       //    elem                             elem
837f5b9731SStan Tomov       //       node                            node
847f5b9731SStan Tomov 
857f5b9731SStan Tomov       // ---  Define strides for NOTRANSPOSE mode: ---
867f5b9731SStan Tomov       // Input (u) is E-vector, output (v) is Q-vector
877f5b9731SStan Tomov 
887f5b9731SStan Tomov       // Element strides
89868539c2SNatalie Beams       CeedInt u_elstride = eldofssize;
907f5b9731SStan Tomov       CeedInt v_elstride = elquadsize;
917f5b9731SStan Tomov       // Component strides
92868539c2SNatalie Beams       CeedInt u_compstride = nelem * eldofssize;
937f5b9731SStan Tomov       CeedInt v_compstride = nelem * elquadsize;
947f5b9731SStan Tomov 
957f5b9731SStan Tomov       // ---  Swap strides for TRANSPOSE mode: ---
967f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
977f5b9731SStan Tomov         // Input (u) is Q-vector, output (v) is E-vector
987f5b9731SStan Tomov         // Element strides
99868539c2SNatalie Beams         v_elstride = eldofssize;
1007f5b9731SStan Tomov         u_elstride = elquadsize;
1017f5b9731SStan Tomov         // Component strides
102868539c2SNatalie Beams         v_compstride = nelem * eldofssize;
1037f5b9731SStan Tomov         u_compstride = nelem * elquadsize;
1047f5b9731SStan Tomov       }
1057f5b9731SStan Tomov 
106f6af633fSnbeams       CeedInt nthreads = 1;
107f6af633fSnbeams       CeedInt ntcol    = 1;
108f6af633fSnbeams       CeedInt shmem    = 0;
109f6af633fSnbeams       CeedInt maxPQ    = CeedIntMax(P, Q);
110f6af633fSnbeams 
111f6af633fSnbeams       switch (dim) {
112f6af633fSnbeams         case 1:
113f6af633fSnbeams           nthreads = maxPQ;
114f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
115f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q));
116f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);
117f6af633fSnbeams           break;
118f6af633fSnbeams         case 2:
119f6af633fSnbeams           nthreads = maxPQ;
120f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
121f6af633fSnbeams           shmem += P * Q * sizeof(CeedScalar);                // for sT
1222b730f8bSJeremy L Thompson           shmem += ntcol * (P * maxPQ * sizeof(CeedScalar));  // for reforming rU we need PxP, and for the intermediate output we need PxQ
123f6af633fSnbeams           break;
124f6af633fSnbeams         case 3:
125f6af633fSnbeams           nthreads = maxPQ * maxPQ;
126f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
127f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);  // for sT
1282b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol *
1292b730f8bSJeremy L Thompson                    (CeedIntMax(P * P * maxPQ,
130f6af633fSnbeams                                P * Q * Q));  // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2)
131f6af633fSnbeams       }
132f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
1332b730f8bSJeremy L Thompson       void   *args[] = {&impl->dinterp1d, &u, &u_elstride, &u_compstride, &v, &v_elstride, &v_compstride, &nelem};
134f6af633fSnbeams 
135f6af633fSnbeams       if (tmode == CEED_TRANSPOSE) {
1362b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, nthreads, ntcol, 1, shmem, args));
137f6af633fSnbeams       } else {
1382b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, nthreads, ntcol, 1, shmem, args));
139f6af633fSnbeams       }
1402b730f8bSJeremy L Thompson     } break;
1413513a710Sjeremylt     case CEED_EVAL_GRAD: {
1427f5b9731SStan Tomov       CeedInt P = P1d, Q = Q1d;
1437f5b9731SStan Tomov       // In CEED_NOTRANSPOSE mode:
1447f5b9731SStan Tomov       // u is (P^dim x nc), column-major layout (nc = ncomp)
1457f5b9731SStan Tomov       // v is (Q^dim x nc x dim), column-major layout (nc = ncomp)
1467f5b9731SStan Tomov       // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
1477f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
1487f5b9731SStan Tomov         P = Q1d, Q = P1d;
1497f5b9731SStan Tomov       }
1507f5b9731SStan Tomov 
1517f5b9731SStan Tomov       // Define element sizes for dofs/quad
1527f5b9731SStan Tomov       CeedInt elquadsize = CeedIntPow(Q1d, dim);
1537f5b9731SStan Tomov       CeedInt eldofssize = CeedIntPow(P1d, dim);
1547f5b9731SStan Tomov 
1557f5b9731SStan Tomov       // E-vector ordering -------------- Q-vector ordering
1567f5b9731SStan Tomov       //                                  dim
157868539c2SNatalie Beams       //  component                        component
158868539c2SNatalie Beams       //    elem                              elem
1597f5b9731SStan Tomov       //       node                            node
1607f5b9731SStan Tomov 
1617f5b9731SStan Tomov       // ---  Define strides for NOTRANSPOSE mode: ---
1627f5b9731SStan Tomov       // Input (u) is E-vector, output (v) is Q-vector
1637f5b9731SStan Tomov 
1647f5b9731SStan Tomov       // Element strides
165868539c2SNatalie Beams       CeedInt u_elstride = eldofssize;
1667f5b9731SStan Tomov       CeedInt v_elstride = elquadsize;
1677f5b9731SStan Tomov       // Component strides
168868539c2SNatalie Beams       CeedInt u_compstride = nelem * eldofssize;
1697f5b9731SStan Tomov       CeedInt v_compstride = nelem * elquadsize;
1707f5b9731SStan Tomov       // Dimension strides
1717f5b9731SStan Tomov       CeedInt u_dimstride = 0;
1727f5b9731SStan Tomov       CeedInt v_dimstride = nelem * elquadsize * ncomp;
1737f5b9731SStan Tomov 
1747f5b9731SStan Tomov       // ---  Swap strides for TRANSPOSE mode: ---
1757f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
1767f5b9731SStan Tomov         // Input (u) is Q-vector, output (v) is E-vector
1777f5b9731SStan Tomov         // Element strides
178868539c2SNatalie Beams         v_elstride = eldofssize;
1797f5b9731SStan Tomov         u_elstride = elquadsize;
1807f5b9731SStan Tomov         // Component strides
181868539c2SNatalie Beams         v_compstride = nelem * eldofssize;
1827f5b9731SStan Tomov         u_compstride = nelem * elquadsize;
1837f5b9731SStan Tomov         // Dimension strides
1847f5b9731SStan Tomov         v_dimstride = 0;
1857f5b9731SStan Tomov         u_dimstride = nelem * elquadsize * ncomp;
1867f5b9731SStan Tomov       }
1877f5b9731SStan Tomov 
188f6af633fSnbeams       CeedInt nthreads = 1;
189f6af633fSnbeams       CeedInt ntcol    = 1;
190f6af633fSnbeams       CeedInt shmem    = 0;
191f6af633fSnbeams       CeedInt maxPQ    = CeedIntMax(P, Q);
192f6af633fSnbeams 
193f6af633fSnbeams       switch (dim) {
194f6af633fSnbeams         case 1:
195f6af633fSnbeams           nthreads = maxPQ;
196f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
197f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q));
198f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);
199f6af633fSnbeams           break;
200f6af633fSnbeams         case 2:
201f6af633fSnbeams           nthreads = maxPQ;
202f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
203f6af633fSnbeams           shmem += sizeof(CeedScalar) * 2 * P * Q;            // for sTinterp and sTgrad
2042b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol * (P * maxPQ);  // for reforming rU we need PxP, and for the intermediate output we need PxQ
205f6af633fSnbeams           break;
206f6af633fSnbeams         case 3:
207f6af633fSnbeams           nthreads = maxPQ * maxPQ;
208f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
209f6af633fSnbeams           shmem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
2102b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol *
2112b730f8bSJeremy L Thompson                    CeedIntMax(P * P * P,
2122b730f8bSJeremy L Thompson                               (P * P * Q) + (P * Q * Q));  // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2)
213f6af633fSnbeams       }
214f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
2152b730f8bSJeremy L Thompson       void   *args[] = {&impl->dinterp1d, &impl->dgrad1d, &u,           &u_elstride, &u_compstride, &u_dimstride, &v,
2162b730f8bSJeremy L Thompson                         &v_elstride,      &v_compstride,  &v_dimstride, &nelem};
217f6af633fSnbeams 
218f6af633fSnbeams       if (tmode == CEED_TRANSPOSE) {
2192b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, nthreads, ntcol, 1, shmem, args));
220f6af633fSnbeams       } else {
2212b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, nthreads, ntcol, 1, shmem, args));
222f6af633fSnbeams       }
2232b730f8bSJeremy L Thompson     } break;
2243513a710Sjeremylt     case CEED_EVAL_WEIGHT: {
2257f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE)
2267f5b9731SStan Tomov         // LCOV_EXCL_START
2272b730f8bSJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
2287f5b9731SStan Tomov       // LCOV_EXCL_STOP
2297f5b9731SStan Tomov       CeedInt Q          = Q1d;
230f6af633fSnbeams       CeedInt eldofssize = CeedIntPow(Q, dim);
231f6af633fSnbeams       CeedInt nthreads   = 1;
232f6af633fSnbeams       CeedInt ntcol      = 1;
233f6af633fSnbeams       CeedInt shmem      = 0;
234f6af633fSnbeams 
235f6af633fSnbeams       switch (dim) {
236f6af633fSnbeams         case 1:
237f6af633fSnbeams           nthreads = Q;
238f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
239f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;          // for dqweight1d
240f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * Q;  // for output
241f6af633fSnbeams           break;
242f6af633fSnbeams         case 2:
243f6af633fSnbeams           nthreads = Q;
244f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
245f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
246f6af633fSnbeams           break;
247f6af633fSnbeams         case 3:
248f6af633fSnbeams           nthreads = Q * Q;
249f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
250f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
251f6af633fSnbeams       }
252f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
253f6af633fSnbeams       void   *args[] = {&impl->dqweight1d, &v, &eldofssize, &nelem};
254f6af633fSnbeams 
2552b730f8bSJeremy L Thompson       CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, nthreads, ntcol, 1, shmem, args));
2562b730f8bSJeremy L Thompson     } break;
2573513a710Sjeremylt     // LCOV_EXCL_START
2583513a710Sjeremylt     case CEED_EVAL_DIV:
259e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
2603513a710Sjeremylt     case CEED_EVAL_CURL:
261e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
2623513a710Sjeremylt     case CEED_EVAL_NONE:
2632b730f8bSJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
2643513a710Sjeremylt       // LCOV_EXCL_STOP
2653513a710Sjeremylt   }
2667f5b9731SStan Tomov 
267e0582403Sabdelfattah83   // must sync to ensure completeness
268e0582403Sabdelfattah83   ceed_magma_queue_sync(data->queue);
269e0582403Sabdelfattah83 
2707f5b9731SStan Tomov   if (emode != CEED_EVAL_WEIGHT) {
2712b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorRestoreArrayRead(U, &u));
2727f5b9731SStan Tomov   }
2732b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(V, &v));
274e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2757f5b9731SStan Tomov }
2767f5b9731SStan Tomov 
2777f5b9731SStan Tomov #ifdef __cplusplus
2787f5b9731SStan Tomov CEED_INTERN "C"
2797f5b9731SStan Tomov #endif
2802b730f8bSJeremy L Thompson     int
281023b8a51Sabdelfattah83     CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) {
282868539c2SNatalie Beams   Ceed ceed;
2832b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
284e0582403Sabdelfattah83 
285e0582403Sabdelfattah83   Ceed_Magma *data;
2862b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
287e0582403Sabdelfattah83 
288023b8a51Sabdelfattah83   magma_int_t arch = magma_getdevice_arch();
289023b8a51Sabdelfattah83 
290868539c2SNatalie Beams   CeedInt dim, ncomp, ndof, nqpt;
2912b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
2922b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
2932b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof));
2942b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &nqpt));
295868539c2SNatalie Beams   const CeedScalar *du;
296868539c2SNatalie Beams   CeedScalar       *dv;
297868539c2SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
2982b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
299868539c2SNatalie Beams   } else if (emode != CEED_EVAL_WEIGHT) {
300868539c2SNatalie Beams     // LCOV_EXCL_START
3012b730f8bSJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
302868539c2SNatalie Beams     // LCOV_EXCL_STOP
303868539c2SNatalie Beams   }
3042b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
305868539c2SNatalie Beams 
306868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
3072b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
308868539c2SNatalie Beams 
3092b730f8bSJeremy L Thompson   CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * ndof, ncomp);
310868539c2SNatalie Beams 
311868539c2SNatalie Beams   if (tmode == CEED_TRANSPOSE) {
3121f9221feSJeremy L Thompson     CeedSize length;
3132b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(V, &length));
31480a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
3152b730f8bSJeremy L Thompson       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
31680a9ef05SNatalie Beams     } else {
3172b730f8bSJeremy L Thompson       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
31880a9ef05SNatalie Beams     }
319e0582403Sabdelfattah83     ceed_magma_queue_sync(data->queue);
320868539c2SNatalie Beams   }
32180a9ef05SNatalie Beams 
322023b8a51Sabdelfattah83   CeedInt            P = ndof, Q = nqpt, N = nelem * ncomp;
323023b8a51Sabdelfattah83   CeedInt            NB = 1;
324023b8a51Sabdelfattah83   CeedMagmaFunction *interp, *grad;
325868539c2SNatalie Beams 
326023b8a51Sabdelfattah83   CeedInt Narray[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
327023b8a51Sabdelfattah83   CeedInt iN                                       = 0;
328023b8a51Sabdelfattah83   CeedInt diff                                     = abs(Narray[iN] - N);
329023b8a51Sabdelfattah83   for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
330023b8a51Sabdelfattah83     CeedInt idiff = abs(Narray[in] - N);
331023b8a51Sabdelfattah83     if (idiff < diff) {
332023b8a51Sabdelfattah83       iN   = in;
333023b8a51Sabdelfattah83       diff = idiff;
334868539c2SNatalie Beams     }
33580a9ef05SNatalie Beams   }
33680a9ef05SNatalie Beams 
337023b8a51Sabdelfattah83   NB     = nontensor_rtc_get_nb(arch, 'd', emode, tmode, P, Narray[iN], Q);
338023b8a51Sabdelfattah83   interp = (tmode == CEED_TRANSPOSE) ? &impl->magma_interp_tr_nontensor[iN] : &impl->magma_interp_nontensor[iN];
339023b8a51Sabdelfattah83   grad   = (tmode == CEED_TRANSPOSE) ? &impl->magma_grad_tr_nontensor[iN] : &impl->magma_grad_nontensor[iN];
34080a9ef05SNatalie Beams 
34180a9ef05SNatalie Beams   switch (emode) {
34280a9ef05SNatalie Beams     case CEED_EVAL_INTERP: {
34380a9ef05SNatalie Beams       CeedInt P = ndof, Q = nqpt;
344023b8a51Sabdelfattah83       if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
345023b8a51Sabdelfattah83         CeedInt M     = (tmode == CEED_TRANSPOSE) ? P : Q;
346023b8a51Sabdelfattah83         CeedInt K     = (tmode == CEED_TRANSPOSE) ? Q : P;
347023b8a51Sabdelfattah83         CeedInt ntcol = MAGMA_NONTENSOR_BASIS_NTCOL(M);
348023b8a51Sabdelfattah83         CeedInt shmem = 0, shmemA = 0, shmemB = 0;
349023b8a51Sabdelfattah83         shmemB += ntcol * K * NB * sizeof(CeedScalar);
350023b8a51Sabdelfattah83         shmemA += (tmode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
351023b8a51Sabdelfattah83         shmem = (tmode == CEED_TRANSPOSE) ? (shmemA + shmemB) : CeedIntMax(shmemA, shmemB);
352023b8a51Sabdelfattah83 
353023b8a51Sabdelfattah83         CeedInt       grid   = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), ntcol);
354023b8a51Sabdelfattah83         magma_trans_t transA = (tmode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
355023b8a51Sabdelfattah83         magma_trans_t transB = MagmaNoTrans;
356023b8a51Sabdelfattah83         CeedScalar    alpha = 1.0, beta = 0.0;
357023b8a51Sabdelfattah83 
358023b8a51Sabdelfattah83         void *args[] = {&transA, &transB, &N, &alpha, &impl->dinterp, &P, &du, &K, &beta, &dv, &M};
359023b8a51Sabdelfattah83         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *interp, grid, M, ntcol, 1, shmem, args));
360023b8a51Sabdelfattah83       } else {
36180a9ef05SNatalie Beams         if (tmode == CEED_TRANSPOSE)
362023b8a51Sabdelfattah83           magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, impl->dinterp, P, du, Q, 0.0, dv, P, data->queue);
363023b8a51Sabdelfattah83         else magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, impl->dinterp, P, du, P, 0.0, dv, Q, data->queue);
364023b8a51Sabdelfattah83       }
3652b730f8bSJeremy L Thompson     } break;
36680a9ef05SNatalie Beams 
36780a9ef05SNatalie Beams     case CEED_EVAL_GRAD: {
36880a9ef05SNatalie Beams       CeedInt P = ndof, Q = nqpt;
369023b8a51Sabdelfattah83       if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
370023b8a51Sabdelfattah83         CeedInt M     = (tmode == CEED_TRANSPOSE) ? P : Q;
371023b8a51Sabdelfattah83         CeedInt K     = (tmode == CEED_TRANSPOSE) ? Q : P;
372023b8a51Sabdelfattah83         CeedInt ntcol = MAGMA_NONTENSOR_BASIS_NTCOL(M);
373023b8a51Sabdelfattah83         CeedInt shmem = 0, shmemA = 0, shmemB = 0;
374023b8a51Sabdelfattah83         shmemB += ntcol * K * NB * sizeof(CeedScalar);
375023b8a51Sabdelfattah83         shmemA += (tmode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
376023b8a51Sabdelfattah83         shmem = shmemA + shmemB;
377023b8a51Sabdelfattah83 
378023b8a51Sabdelfattah83         CeedInt       grid   = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), ntcol);
379023b8a51Sabdelfattah83         magma_trans_t transA = (tmode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
380023b8a51Sabdelfattah83         magma_trans_t transB = MagmaNoTrans;
381023b8a51Sabdelfattah83 
382023b8a51Sabdelfattah83         void *args[] = {&transA, &transB, &N, &impl->dgrad, &P, &du, &K, &dv, &M};
383023b8a51Sabdelfattah83         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *grad, grid, M, ntcol, 1, shmem, args));
384023b8a51Sabdelfattah83       } else {
38580a9ef05SNatalie Beams         if (tmode == CEED_TRANSPOSE) {
38680a9ef05SNatalie Beams           CeedScalar beta = 0.0;
38780a9ef05SNatalie Beams           for (int d = 0; d < dim; d++) {
3882b730f8bSJeremy L Thompson             if (d > 0) beta = 1.0;
389023b8a51Sabdelfattah83             magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, impl->dgrad + d * P * Q, P, du + d * nelem * ncomp * Q, Q,
390023b8a51Sabdelfattah83                                  beta, dv, P, data->queue);
39180a9ef05SNatalie Beams           }
39280a9ef05SNatalie Beams         } else {
39380a9ef05SNatalie Beams           for (int d = 0; d < dim; d++)
394023b8a51Sabdelfattah83             magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, impl->dgrad + d * P * Q, P, du, P, 0.0,
395023b8a51Sabdelfattah83                                  dv + d * nelem * ncomp * Q, Q, data->queue);
396023b8a51Sabdelfattah83         }
397868539c2SNatalie Beams       }
3982b730f8bSJeremy L Thompson     } break;
399868539c2SNatalie Beams 
400868539c2SNatalie Beams     case CEED_EVAL_WEIGHT: {
401868539c2SNatalie Beams       if (tmode == CEED_TRANSPOSE)
402868539c2SNatalie Beams         // LCOV_EXCL_START
4032b730f8bSJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
404868539c2SNatalie Beams       // LCOV_EXCL_STOP
405868539c2SNatalie Beams 
406868539c2SNatalie Beams       int elemsPerBlock = 1;  // basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
4072b730f8bSJeremy L Thompson       int grid          = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0);
4082b730f8bSJeremy L Thompson       magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, data->queue);
4092b730f8bSJeremy L Thompson     } break;
410868539c2SNatalie Beams 
411868539c2SNatalie Beams     // LCOV_EXCL_START
412868539c2SNatalie Beams     case CEED_EVAL_DIV:
413e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
414868539c2SNatalie Beams     case CEED_EVAL_CURL:
415e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
416868539c2SNatalie Beams     case CEED_EVAL_NONE:
4172b730f8bSJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
418868539c2SNatalie Beams       // LCOV_EXCL_STOP
419868539c2SNatalie Beams   }
420868539c2SNatalie Beams 
421e0582403Sabdelfattah83   // must sync to ensure completeness
422e0582403Sabdelfattah83   ceed_magma_queue_sync(data->queue);
423e0582403Sabdelfattah83 
424868539c2SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
4252b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
426868539c2SNatalie Beams   }
4272b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(V, &dv));
428e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
429868539c2SNatalie Beams }
430868539c2SNatalie Beams 
431868539c2SNatalie Beams #ifdef __cplusplus
432868539c2SNatalie Beams CEED_INTERN "C"
433868539c2SNatalie Beams #endif
4342b730f8bSJeremy L Thompson     int
4352b730f8bSJeremy L Thompson     CeedBasisDestroy_Magma(CeedBasis basis) {
4367f5b9731SStan Tomov   CeedBasis_Magma *impl;
4372b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
4387f5b9731SStan Tomov 
4392b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqref1d));
4402b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dinterp1d));
4412b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dgrad1d));
4422b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqweight1d));
443f6af633fSnbeams   Ceed ceed;
4442b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
445e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
4462b730f8bSJeremy L Thompson   CeedCallHip(ceed, hipModuleUnload(impl->module));
447f6af633fSnbeams #else
4482b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuModuleUnload(impl->module));
449f6af633fSnbeams #endif
4507f5b9731SStan Tomov 
4512b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
4527f5b9731SStan Tomov 
453e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
4547f5b9731SStan Tomov }
4557f5b9731SStan Tomov 
4567f5b9731SStan Tomov #ifdef __cplusplus
4577f5b9731SStan Tomov CEED_INTERN "C"
4587f5b9731SStan Tomov #endif
4592b730f8bSJeremy L Thompson     int
4602b730f8bSJeremy L Thompson     CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
461868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
4622b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
463868539c2SNatalie Beams 
4642b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqref));
4652b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dinterp));
4662b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dgrad));
4672b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqweight));
468023b8a51Sabdelfattah83   Ceed ceed;
469023b8a51Sabdelfattah83   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
470023b8a51Sabdelfattah83 #ifdef CEED_MAGMA_USE_HIP
471023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
472023b8a51Sabdelfattah83     CeedCallHip(ceed, hipModuleUnload(impl->module[in]));
473023b8a51Sabdelfattah83   }
474023b8a51Sabdelfattah83 #else
475023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
476023b8a51Sabdelfattah83     CeedCallCuda(ceed, cuModuleUnload(impl->module[in]));
477023b8a51Sabdelfattah83   }
478023b8a51Sabdelfattah83 #endif
4792b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
480868539c2SNatalie Beams 
481e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
482868539c2SNatalie Beams }
483868539c2SNatalie Beams 
484868539c2SNatalie Beams #ifdef __cplusplus
485868539c2SNatalie Beams CEED_INTERN "C"
486868539c2SNatalie Beams #endif
4872b730f8bSJeremy L Thompson     int
4882b730f8bSJeremy L Thompson     CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d,
4892b730f8bSJeremy L Thompson                                   const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis) {
4907f5b9731SStan Tomov   CeedBasis_Magma *impl;
4912b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &impl));
4927f5b9731SStan Tomov   Ceed ceed;
4932b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
4947f5b9731SStan Tomov 
495c9f8acf2SJeremy L Thompson   // Check for supported parameters
496c9f8acf2SJeremy L Thompson   CeedInt ncomp = 0;
4972b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
498e0582403Sabdelfattah83   Ceed_Magma *data;
4992b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
500e0582403Sabdelfattah83 
501f6af633fSnbeams   // Compile kernels
502f6af633fSnbeams   char *magma_common_path;
503f6af633fSnbeams   char *interp_path, *grad_path, *weight_path;
504f6af633fSnbeams   char *basis_kernel_source;
505023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
506f6af633fSnbeams   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
5072b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
508023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_tensor.h", &magma_common_path));
509023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
510f6af633fSnbeams   char   *interp_name_base = "ceed/jit-source/magma/interp";
511f6af633fSnbeams   CeedInt interp_name_len  = strlen(interp_name_base) + 6;
512f6af633fSnbeams   char    interp_name[interp_name_len];
5132b730f8bSJeremy L Thompson   snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim);
5142b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path));
5152b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
516f6af633fSnbeams   char   *grad_name_base = "ceed/jit-source/magma/grad";
517f6af633fSnbeams   CeedInt grad_name_len  = strlen(grad_name_base) + 6;
518f6af633fSnbeams   char    grad_name[grad_name_len];
5192b730f8bSJeremy L Thompson   snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim);
5202b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path));
5212b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
522f6af633fSnbeams   char   *weight_name_base = "ceed/jit-source/magma/weight";
523f6af633fSnbeams   CeedInt weight_name_len  = strlen(weight_name_base) + 6;
524f6af633fSnbeams   char    weight_name[weight_name_len];
5252b730f8bSJeremy L Thompson   snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim);
5262b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path));
5272b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source));
5282b730f8bSJeremy L Thompson   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n");
529f6af633fSnbeams   // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip
530f6af633fSnbeams   // data
531f6af633fSnbeams   Ceed delegate;
5322b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetDelegate(ceed, &delegate));
5332b730f8bSJeremy L Thompson   CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", ncomp, "P", P1d, "Q", Q1d, "MAXPQ",
5342b730f8bSJeremy L Thompson                                    CeedIntMax(P1d, Q1d)));
535f6af633fSnbeams 
536f6af633fSnbeams   // Kernel setup
537f6af633fSnbeams   switch (dim) {
538f6af633fSnbeams     case 1:
5392b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->magma_interp));
5402b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->magma_interp_tr));
5412b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->magma_grad));
5422b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->magma_grad_tr));
5432b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->magma_weight));
544f6af633fSnbeams       break;
545f6af633fSnbeams     case 2:
5462b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->magma_interp));
5472b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->magma_interp_tr));
5482b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->magma_grad));
5492b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->magma_grad_tr));
5502b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->magma_weight));
551f6af633fSnbeams       break;
552f6af633fSnbeams     case 3:
5532b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->magma_interp));
5542b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->magma_interp_tr));
5552b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->magma_grad));
5562b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->magma_grad_tr));
5572b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->magma_weight));
558f6af633fSnbeams   }
559f6af633fSnbeams 
5602b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
5612b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
5627f5b9731SStan Tomov 
5637f5b9731SStan Tomov   // Copy qref1d to the GPU
5642b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqref1d, Q1d * sizeof(qref1d[0])));
5652b730f8bSJeremy L Thompson   magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, data->queue);
5667f5b9731SStan Tomov 
5677f5b9731SStan Tomov   // Copy interp1d to the GPU
5682b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dinterp1d, Q1d * P1d * sizeof(interp1d[0])));
5692b730f8bSJeremy L Thompson   magma_setvector(Q1d * P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, data->queue);
5707f5b9731SStan Tomov 
5717f5b9731SStan Tomov   // Copy grad1d to the GPU
5722b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dgrad1d, Q1d * P1d * sizeof(grad1d[0])));
5732b730f8bSJeremy L Thompson   magma_setvector(Q1d * P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, data->queue);
5747f5b9731SStan Tomov 
5757f5b9731SStan Tomov   // Copy qweight1d to the GPU
5762b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqweight1d, Q1d * sizeof(qweight1d[0])));
5772b730f8bSJeremy L Thompson   magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, data->queue);
5787f5b9731SStan Tomov 
5792b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisSetData(basis, impl));
5802b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&magma_common_path));
5812b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&interp_path));
5822b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&grad_path));
5832b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&weight_path));
5842b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&basis_kernel_source));
585f6af633fSnbeams 
586e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
5877f5b9731SStan Tomov }
5887f5b9731SStan Tomov 
5897f5b9731SStan Tomov #ifdef __cplusplus
5907f5b9731SStan Tomov CEED_INTERN "C"
5917f5b9731SStan Tomov #endif
5922b730f8bSJeremy L Thompson     int
5932b730f8bSJeremy L Thompson     CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, const CeedScalar *grad,
5942b730f8bSJeremy L Thompson                             const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis) {
595868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
5967f5b9731SStan Tomov   Ceed                      ceed;
5972b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
5987f5b9731SStan Tomov 
599e0582403Sabdelfattah83   Ceed_Magma *data;
6002b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
601023b8a51Sabdelfattah83   magma_int_t arch = magma_getdevice_arch();
6022b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &impl));
603023b8a51Sabdelfattah83   // Compile kernels
604023b8a51Sabdelfattah83   char *magma_common_path;
605023b8a51Sabdelfattah83   char *interp_path, *grad_path;
606023b8a51Sabdelfattah83   char *basis_kernel_source;
607023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
608023b8a51Sabdelfattah83   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
609023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
610023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_nontensor.h", &magma_common_path));
611023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
612023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/interp-nontensor.h", &interp_path));
613023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
614023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/grad-nontensor.h", &grad_path));
615023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
616023b8a51Sabdelfattah83 
617023b8a51Sabdelfattah83   // tuning parameters for nb
618023b8a51Sabdelfattah83   CeedInt nb_interp_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
619023b8a51Sabdelfattah83   CeedInt nb_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
620023b8a51Sabdelfattah83   CeedInt nb_grad_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
621023b8a51Sabdelfattah83   CeedInt nb_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
622023b8a51Sabdelfattah83   CeedInt P = ndof, Q = nqpts;
623023b8a51Sabdelfattah83   CeedInt Narray[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
624023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
625023b8a51Sabdelfattah83     nb_interp_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_NOTRANSPOSE, P, Narray[in], Q);
626023b8a51Sabdelfattah83     nb_interp_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_TRANSPOSE, P, Narray[in], Q);
627023b8a51Sabdelfattah83     nb_grad_n[in]   = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_NOTRANSPOSE, P, Narray[in], Q);
628023b8a51Sabdelfattah83     nb_grad_t[in]   = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_TRANSPOSE, P, Narray[in], Q);
629023b8a51Sabdelfattah83   }
630023b8a51Sabdelfattah83 
631023b8a51Sabdelfattah83   // compile
632023b8a51Sabdelfattah83   Ceed delegate;
633023b8a51Sabdelfattah83   CeedCallBackend(CeedGetDelegate(ceed, &delegate));
634023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
635023b8a51Sabdelfattah83     CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module[in], 7, "DIM", dim, "P", P, "Q", Q, "NB_INTERP_N", nb_interp_n[in],
636023b8a51Sabdelfattah83                                      "NB_INTERP_T", nb_interp_t[in], "NB_GRAD_N", nb_grad_n[in], "NB_GRAD_T", nb_grad_t[in]));
637023b8a51Sabdelfattah83   }
638023b8a51Sabdelfattah83 
639023b8a51Sabdelfattah83   // get kernels
640023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
641023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_n", &impl->magma_interp_nontensor[in]));
642023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_t", &impl->magma_interp_tr_nontensor[in]));
643023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_n", &impl->magma_grad_nontensor[in]));
644023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_t", &impl->magma_grad_tr_nontensor[in]));
645023b8a51Sabdelfattah83   }
646023b8a51Sabdelfattah83 
647023b8a51Sabdelfattah83   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
648023b8a51Sabdelfattah83   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
649868539c2SNatalie Beams 
650868539c2SNatalie Beams   // Copy qref to the GPU
6512b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqref, nqpts * sizeof(qref[0])));
652e0582403Sabdelfattah83   magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue);
653868539c2SNatalie Beams 
654868539c2SNatalie Beams   // Copy interp to the GPU
6552b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dinterp, nqpts * ndof * sizeof(interp[0])));
6562b730f8bSJeremy L Thompson   magma_setvector(nqpts * ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, data->queue);
657868539c2SNatalie Beams 
658868539c2SNatalie Beams   // Copy grad to the GPU
6592b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dgrad, nqpts * ndof * dim * sizeof(grad[0])));
6602b730f8bSJeremy L Thompson   magma_setvector(nqpts * ndof * dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, data->queue);
661868539c2SNatalie Beams 
662868539c2SNatalie Beams   // Copy qweight to the GPU
6632b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqweight, nqpts * sizeof(qweight[0])));
6642b730f8bSJeremy L Thompson   magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, data->queue);
665868539c2SNatalie Beams 
666023b8a51Sabdelfattah83   CeedCallBackend(CeedBasisSetData(basis, impl));
667023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&magma_common_path));
668023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&interp_path));
669023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&grad_path));
670023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&basis_kernel_source));
671e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
6727f5b9731SStan Tomov }
673