xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma-basis.c (revision 00fb7a044a7fd1c8bfdb0605078b0c7ba7a4ad58)
13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
37f5b9731SStan Tomov //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
57f5b9731SStan Tomov //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
77f5b9731SStan Tomov 
849aac155SJeremy L Thompson #include <ceed.h>
9ec3da8bcSJed Brown #include <ceed/backend.h>
10f6af633fSnbeams #include <ceed/jit-tools.h>
11f6af633fSnbeams #include <string.h>
122b730f8bSJeremy L Thompson 
13e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
14f6af633fSnbeams #include "../hip/ceed-hip-common.h"
15f6af633fSnbeams #include "../hip/ceed-hip-compile.h"
16f6af633fSnbeams #else
17f6af633fSnbeams #include "../cuda/ceed-cuda-common.h"
18f6af633fSnbeams #include "../cuda/ceed-cuda-compile.h"
19f6af633fSnbeams #endif
20*00fb7a04SSebastian Grimberg #include "ceed-magma-common.h"
21*00fb7a04SSebastian Grimberg #include "ceed-magma.h"
227f5b9731SStan Tomov 
237f5b9731SStan Tomov #ifdef __cplusplus
247f5b9731SStan Tomov CEED_INTERN "C"
257f5b9731SStan Tomov #endif
262b730f8bSJeremy L Thompson     int
272b730f8bSJeremy L Thompson     CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) {
287f5b9731SStan Tomov   Ceed ceed;
292b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
30e0582403Sabdelfattah83   CeedInt dim, ncomp, ndof;
312b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
322b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
332b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof));
34e0582403Sabdelfattah83 
35e0582403Sabdelfattah83   Ceed_Magma *data;
362b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
37e0582403Sabdelfattah83 
386574a04fSJeremy L Thompson   const CeedScalar *du;
396574a04fSJeremy L Thompson   CeedScalar       *dv;
406574a04fSJeremy L Thompson   if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
416574a04fSJeremy L Thompson   else CeedCheck(emode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
426574a04fSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
437f5b9731SStan Tomov 
447f5b9731SStan Tomov   CeedBasis_Magma *impl;
452b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
467f5b9731SStan Tomov 
477f5b9731SStan Tomov   CeedInt P1d, Q1d;
482b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P1d));
492b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q1d));
507f5b9731SStan Tomov 
512b730f8bSJeremy L Thompson   CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * CeedIntPow(P1d, dim), ncomp);
527f5b9731SStan Tomov 
537f5b9731SStan Tomov   if (tmode == CEED_TRANSPOSE) {
541f9221feSJeremy L Thompson     CeedSize length;
552b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(V, &length));
5680a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
576574a04fSJeremy L Thompson       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
5880a9ef05SNatalie Beams     } else {
596574a04fSJeremy L Thompson       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
6080a9ef05SNatalie Beams     }
61e0582403Sabdelfattah83     ceed_magma_queue_sync(data->queue);
627f5b9731SStan Tomov   }
63f6af633fSnbeams 
643513a710Sjeremylt   switch (emode) {
653513a710Sjeremylt     case CEED_EVAL_INTERP: {
667f5b9731SStan Tomov       CeedInt P = P1d, Q = Q1d;
677f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
682b730f8bSJeremy L Thompson         P = Q1d;
692b730f8bSJeremy L Thompson         Q = P1d;
707f5b9731SStan Tomov       }
717f5b9731SStan Tomov 
727f5b9731SStan Tomov       // Define element sizes for dofs/quad
737f5b9731SStan Tomov       CeedInt elquadsize = CeedIntPow(Q1d, dim);
747f5b9731SStan Tomov       CeedInt eldofssize = CeedIntPow(P1d, dim);
757f5b9731SStan Tomov 
767f5b9731SStan Tomov       // E-vector ordering -------------- Q-vector ordering
77868539c2SNatalie Beams       //  component                        component
78868539c2SNatalie Beams       //    elem                             elem
797f5b9731SStan Tomov       //       node                            node
807f5b9731SStan Tomov 
817f5b9731SStan Tomov       // ---  Define strides for NOTRANSPOSE mode: ---
826574a04fSJeremy L Thompson       // Input (du) is E-vector, output (dv) is Q-vector
837f5b9731SStan Tomov 
847f5b9731SStan Tomov       // Element strides
85868539c2SNatalie Beams       CeedInt u_elstride = eldofssize;
867f5b9731SStan Tomov       CeedInt v_elstride = elquadsize;
877f5b9731SStan Tomov       // Component strides
88868539c2SNatalie Beams       CeedInt u_compstride = nelem * eldofssize;
897f5b9731SStan Tomov       CeedInt v_compstride = nelem * elquadsize;
907f5b9731SStan Tomov 
917f5b9731SStan Tomov       // ---  Swap strides for TRANSPOSE mode: ---
927f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
936574a04fSJeremy L Thompson         // Input (du) is Q-vector, output (dv) is E-vector
947f5b9731SStan Tomov         // Element strides
95868539c2SNatalie Beams         v_elstride = eldofssize;
967f5b9731SStan Tomov         u_elstride = elquadsize;
977f5b9731SStan Tomov         // Component strides
98868539c2SNatalie Beams         v_compstride = nelem * eldofssize;
997f5b9731SStan Tomov         u_compstride = nelem * elquadsize;
1007f5b9731SStan Tomov       }
1017f5b9731SStan Tomov 
102f6af633fSnbeams       CeedInt nthreads = 1;
103f6af633fSnbeams       CeedInt ntcol    = 1;
104f6af633fSnbeams       CeedInt shmem    = 0;
105f6af633fSnbeams       CeedInt maxPQ    = CeedIntMax(P, Q);
106f6af633fSnbeams 
107f6af633fSnbeams       switch (dim) {
108f6af633fSnbeams         case 1:
109f6af633fSnbeams           nthreads = maxPQ;
110f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
111f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q));
112f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);
113f6af633fSnbeams           break;
114f6af633fSnbeams         case 2:
115f6af633fSnbeams           nthreads = maxPQ;
116f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
117f6af633fSnbeams           shmem += P * Q * sizeof(CeedScalar);                // for sT
1182b730f8bSJeremy L Thompson           shmem += ntcol * (P * maxPQ * sizeof(CeedScalar));  // for reforming rU we need PxP, and for the intermediate output we need PxQ
119f6af633fSnbeams           break;
120f6af633fSnbeams         case 3:
121f6af633fSnbeams           nthreads = maxPQ * maxPQ;
122f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
123f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);  // for sT
1242b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol *
1252b730f8bSJeremy L Thompson                    (CeedIntMax(P * P * maxPQ,
126f6af633fSnbeams                                P * Q * Q));  // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2)
127f6af633fSnbeams       }
128f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
1296574a04fSJeremy L Thompson       void   *args[] = {&impl->dinterp1d, &du, &u_elstride, &u_compstride, &dv, &v_elstride, &v_compstride, &nelem};
130f6af633fSnbeams 
131f6af633fSnbeams       if (tmode == CEED_TRANSPOSE) {
1322b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, nthreads, ntcol, 1, shmem, args));
133f6af633fSnbeams       } else {
1342b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, nthreads, ntcol, 1, shmem, args));
135f6af633fSnbeams       }
1362b730f8bSJeremy L Thompson     } break;
1373513a710Sjeremylt     case CEED_EVAL_GRAD: {
1387f5b9731SStan Tomov       CeedInt P = P1d, Q = Q1d;
1397f5b9731SStan Tomov       // In CEED_NOTRANSPOSE mode:
1406574a04fSJeremy L Thompson       // du is (P^dim x nc), column-major layout (nc = ncomp)
1416574a04fSJeremy L Thompson       // dv is (Q^dim x nc x dim), column-major layout (nc = ncomp)
1426574a04fSJeremy L Thompson       // In CEED_TRANSPOSE mode, the sizes of du and dv are switched.
1437f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
1447f5b9731SStan Tomov         P = Q1d, Q = P1d;
1457f5b9731SStan Tomov       }
1467f5b9731SStan Tomov 
1477f5b9731SStan Tomov       // Define element sizes for dofs/quad
1487f5b9731SStan Tomov       CeedInt elquadsize = CeedIntPow(Q1d, dim);
1497f5b9731SStan Tomov       CeedInt eldofssize = CeedIntPow(P1d, dim);
1507f5b9731SStan Tomov 
1517f5b9731SStan Tomov       // E-vector ordering -------------- Q-vector ordering
1527f5b9731SStan Tomov       //                                  dim
153868539c2SNatalie Beams       //  component                        component
154868539c2SNatalie Beams       //    elem                              elem
1557f5b9731SStan Tomov       //       node                            node
1567f5b9731SStan Tomov 
1577f5b9731SStan Tomov       // ---  Define strides for NOTRANSPOSE mode: ---
1586574a04fSJeremy L Thompson       // Input (du) is E-vector, output (dv) is Q-vector
1597f5b9731SStan Tomov 
1607f5b9731SStan Tomov       // Element strides
161868539c2SNatalie Beams       CeedInt u_elstride = eldofssize;
1627f5b9731SStan Tomov       CeedInt v_elstride = elquadsize;
1637f5b9731SStan Tomov       // Component strides
164868539c2SNatalie Beams       CeedInt u_compstride = nelem * eldofssize;
1657f5b9731SStan Tomov       CeedInt v_compstride = nelem * elquadsize;
1667f5b9731SStan Tomov       // Dimension strides
1677f5b9731SStan Tomov       CeedInt u_dimstride = 0;
1687f5b9731SStan Tomov       CeedInt v_dimstride = nelem * elquadsize * ncomp;
1697f5b9731SStan Tomov 
1707f5b9731SStan Tomov       // ---  Swap strides for TRANSPOSE mode: ---
1717f5b9731SStan Tomov       if (tmode == CEED_TRANSPOSE) {
1726574a04fSJeremy L Thompson         // Input (du) is Q-vector, output (dv) is E-vector
1737f5b9731SStan Tomov         // Element strides
174868539c2SNatalie Beams         v_elstride = eldofssize;
1757f5b9731SStan Tomov         u_elstride = elquadsize;
1767f5b9731SStan Tomov         // Component strides
177868539c2SNatalie Beams         v_compstride = nelem * eldofssize;
1787f5b9731SStan Tomov         u_compstride = nelem * elquadsize;
1797f5b9731SStan Tomov         // Dimension strides
1807f5b9731SStan Tomov         v_dimstride = 0;
1817f5b9731SStan Tomov         u_dimstride = nelem * elquadsize * ncomp;
1827f5b9731SStan Tomov       }
1837f5b9731SStan Tomov 
184f6af633fSnbeams       CeedInt nthreads = 1;
185f6af633fSnbeams       CeedInt ntcol    = 1;
186f6af633fSnbeams       CeedInt shmem    = 0;
187f6af633fSnbeams       CeedInt maxPQ    = CeedIntMax(P, Q);
188f6af633fSnbeams 
189f6af633fSnbeams       switch (dim) {
190f6af633fSnbeams         case 1:
191f6af633fSnbeams           nthreads = maxPQ;
192f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
193f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q));
194f6af633fSnbeams           shmem += sizeof(CeedScalar) * (P * Q);
195f6af633fSnbeams           break;
196f6af633fSnbeams         case 2:
197f6af633fSnbeams           nthreads = maxPQ;
198f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
199f6af633fSnbeams           shmem += sizeof(CeedScalar) * 2 * P * Q;            // for sTinterp and sTgrad
2002b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol * (P * maxPQ);  // for reforming rU we need PxP, and for the intermediate output we need PxQ
201f6af633fSnbeams           break;
202f6af633fSnbeams         case 3:
203f6af633fSnbeams           nthreads = maxPQ * maxPQ;
204f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
205f6af633fSnbeams           shmem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
2062b730f8bSJeremy L Thompson           shmem += sizeof(CeedScalar) * ntcol *
2072b730f8bSJeremy L Thompson                    CeedIntMax(P * P * P,
2082b730f8bSJeremy L Thompson                               (P * P * Q) + (P * Q * Q));  // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2)
209f6af633fSnbeams       }
210f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
2116574a04fSJeremy L Thompson       void   *args[] = {&impl->dinterp1d, &impl->dgrad1d, &du,          &u_elstride, &u_compstride, &u_dimstride, &dv,
2122b730f8bSJeremy L Thompson                         &v_elstride,      &v_compstride,  &v_dimstride, &nelem};
213f6af633fSnbeams 
214f6af633fSnbeams       if (tmode == CEED_TRANSPOSE) {
2152b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, nthreads, ntcol, 1, shmem, args));
216f6af633fSnbeams       } else {
2172b730f8bSJeremy L Thompson         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, nthreads, ntcol, 1, shmem, args));
218f6af633fSnbeams       }
2192b730f8bSJeremy L Thompson     } break;
2203513a710Sjeremylt     case CEED_EVAL_WEIGHT: {
2216574a04fSJeremy L Thompson       CeedCheck(tmode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
2227f5b9731SStan Tomov       CeedInt Q          = Q1d;
223f6af633fSnbeams       CeedInt eldofssize = CeedIntPow(Q, dim);
224f6af633fSnbeams       CeedInt nthreads   = 1;
225f6af633fSnbeams       CeedInt ntcol      = 1;
226f6af633fSnbeams       CeedInt shmem      = 0;
227f6af633fSnbeams 
228f6af633fSnbeams       switch (dim) {
229f6af633fSnbeams         case 1:
230f6af633fSnbeams           nthreads = Q;
231f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D);
232f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;          // for dqweight1d
233f6af633fSnbeams           shmem += sizeof(CeedScalar) * ntcol * Q;  // for output
234f6af633fSnbeams           break;
235f6af633fSnbeams         case 2:
236f6af633fSnbeams           nthreads = Q;
237f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D);
238f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
239f6af633fSnbeams           break;
240f6af633fSnbeams         case 3:
241f6af633fSnbeams           nthreads = Q * Q;
242f6af633fSnbeams           ntcol    = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D);
243f6af633fSnbeams           shmem += sizeof(CeedScalar) * Q;  // for dqweight1d
244f6af633fSnbeams       }
245f6af633fSnbeams       CeedInt grid   = (nelem + ntcol - 1) / ntcol;
2466574a04fSJeremy L Thompson       void   *args[] = {&impl->dqweight1d, &dv, &eldofssize, &nelem};
247f6af633fSnbeams 
2482b730f8bSJeremy L Thompson       CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, nthreads, ntcol, 1, shmem, args));
2492b730f8bSJeremy L Thompson     } break;
2503513a710Sjeremylt     // LCOV_EXCL_START
2513513a710Sjeremylt     case CEED_EVAL_DIV:
252e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
2533513a710Sjeremylt     case CEED_EVAL_CURL:
254e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
2553513a710Sjeremylt     case CEED_EVAL_NONE:
2562b730f8bSJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
2573513a710Sjeremylt       // LCOV_EXCL_STOP
2583513a710Sjeremylt   }
2597f5b9731SStan Tomov 
260e0582403Sabdelfattah83   // must sync to ensure completeness
261e0582403Sabdelfattah83   ceed_magma_queue_sync(data->queue);
262e0582403Sabdelfattah83 
2637f5b9731SStan Tomov   if (emode != CEED_EVAL_WEIGHT) {
2646574a04fSJeremy L Thompson     CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
2657f5b9731SStan Tomov   }
2666574a04fSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(V, &dv));
267e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2687f5b9731SStan Tomov }
2697f5b9731SStan Tomov 
2707f5b9731SStan Tomov #ifdef __cplusplus
2717f5b9731SStan Tomov CEED_INTERN "C"
2727f5b9731SStan Tomov #endif
2732b730f8bSJeremy L Thompson     int
274023b8a51Sabdelfattah83     CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) {
275868539c2SNatalie Beams   Ceed ceed;
2762b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
277e0582403Sabdelfattah83 
278e0582403Sabdelfattah83   Ceed_Magma *data;
2792b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
280e0582403Sabdelfattah83 
281023b8a51Sabdelfattah83   magma_int_t arch = magma_getdevice_arch();
282023b8a51Sabdelfattah83 
283868539c2SNatalie Beams   CeedInt dim, ncomp, ndof, nqpt;
2842b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
2852b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
2862b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof));
2872b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &nqpt));
288868539c2SNatalie Beams   const CeedScalar *du;
289868539c2SNatalie Beams   CeedScalar       *dv;
2906574a04fSJeremy L Thompson   if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
2916574a04fSJeremy L Thompson   else CeedCheck(emode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
2922b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
293868539c2SNatalie Beams 
294868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
2952b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
296868539c2SNatalie Beams 
2972b730f8bSJeremy L Thompson   CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * ndof, ncomp);
298868539c2SNatalie Beams 
299868539c2SNatalie Beams   if (tmode == CEED_TRANSPOSE) {
3001f9221feSJeremy L Thompson     CeedSize length;
3012b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorGetLength(V, &length));
30280a9ef05SNatalie Beams     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
3032b730f8bSJeremy L Thompson       magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
30480a9ef05SNatalie Beams     } else {
3052b730f8bSJeremy L Thompson       magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
30680a9ef05SNatalie Beams     }
307e0582403Sabdelfattah83     ceed_magma_queue_sync(data->queue);
308868539c2SNatalie Beams   }
30980a9ef05SNatalie Beams 
310023b8a51Sabdelfattah83   CeedInt            P = ndof, Q = nqpt, N = nelem * ncomp;
311023b8a51Sabdelfattah83   CeedInt            NB = 1;
312023b8a51Sabdelfattah83   CeedMagmaFunction *interp, *grad;
313868539c2SNatalie Beams 
314023b8a51Sabdelfattah83   CeedInt Narray[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
315023b8a51Sabdelfattah83   CeedInt iN                                       = 0;
316023b8a51Sabdelfattah83   CeedInt diff                                     = abs(Narray[iN] - N);
317023b8a51Sabdelfattah83   for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
318023b8a51Sabdelfattah83     CeedInt idiff = abs(Narray[in] - N);
319023b8a51Sabdelfattah83     if (idiff < diff) {
320023b8a51Sabdelfattah83       iN   = in;
321023b8a51Sabdelfattah83       diff = idiff;
322868539c2SNatalie Beams     }
32380a9ef05SNatalie Beams   }
32480a9ef05SNatalie Beams 
325023b8a51Sabdelfattah83   NB     = nontensor_rtc_get_nb(arch, 'd', emode, tmode, P, Narray[iN], Q);
326023b8a51Sabdelfattah83   interp = (tmode == CEED_TRANSPOSE) ? &impl->magma_interp_tr_nontensor[iN] : &impl->magma_interp_nontensor[iN];
327023b8a51Sabdelfattah83   grad   = (tmode == CEED_TRANSPOSE) ? &impl->magma_grad_tr_nontensor[iN] : &impl->magma_grad_nontensor[iN];
32880a9ef05SNatalie Beams 
32980a9ef05SNatalie Beams   switch (emode) {
33080a9ef05SNatalie Beams     case CEED_EVAL_INTERP: {
33180a9ef05SNatalie Beams       CeedInt P = ndof, Q = nqpt;
332023b8a51Sabdelfattah83       if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
333023b8a51Sabdelfattah83         CeedInt M     = (tmode == CEED_TRANSPOSE) ? P : Q;
334023b8a51Sabdelfattah83         CeedInt K     = (tmode == CEED_TRANSPOSE) ? Q : P;
335023b8a51Sabdelfattah83         CeedInt ntcol = MAGMA_NONTENSOR_BASIS_NTCOL(M);
336023b8a51Sabdelfattah83         CeedInt shmem = 0, shmemA = 0, shmemB = 0;
337023b8a51Sabdelfattah83         shmemB += ntcol * K * NB * sizeof(CeedScalar);
338023b8a51Sabdelfattah83         shmemA += (tmode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
339023b8a51Sabdelfattah83         shmem = (tmode == CEED_TRANSPOSE) ? (shmemA + shmemB) : CeedIntMax(shmemA, shmemB);
340023b8a51Sabdelfattah83 
341023b8a51Sabdelfattah83         CeedInt       grid   = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), ntcol);
342023b8a51Sabdelfattah83         magma_trans_t transA = (tmode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
343023b8a51Sabdelfattah83         magma_trans_t transB = MagmaNoTrans;
344023b8a51Sabdelfattah83         CeedScalar    alpha = 1.0, beta = 0.0;
345023b8a51Sabdelfattah83 
346023b8a51Sabdelfattah83         void *args[] = {&transA, &transB, &N, &alpha, &impl->dinterp, &P, &du, &K, &beta, &dv, &M};
347023b8a51Sabdelfattah83         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *interp, grid, M, ntcol, 1, shmem, args));
348023b8a51Sabdelfattah83       } else {
34980a9ef05SNatalie Beams         if (tmode == CEED_TRANSPOSE)
350023b8a51Sabdelfattah83           magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, impl->dinterp, P, du, Q, 0.0, dv, P, data->queue);
351023b8a51Sabdelfattah83         else magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, impl->dinterp, P, du, P, 0.0, dv, Q, data->queue);
352023b8a51Sabdelfattah83       }
3532b730f8bSJeremy L Thompson     } break;
35480a9ef05SNatalie Beams 
35580a9ef05SNatalie Beams     case CEED_EVAL_GRAD: {
35680a9ef05SNatalie Beams       CeedInt P = ndof, Q = nqpt;
357023b8a51Sabdelfattah83       if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
358023b8a51Sabdelfattah83         CeedInt M     = (tmode == CEED_TRANSPOSE) ? P : Q;
359023b8a51Sabdelfattah83         CeedInt K     = (tmode == CEED_TRANSPOSE) ? Q : P;
360023b8a51Sabdelfattah83         CeedInt ntcol = MAGMA_NONTENSOR_BASIS_NTCOL(M);
361023b8a51Sabdelfattah83         CeedInt shmem = 0, shmemA = 0, shmemB = 0;
362023b8a51Sabdelfattah83         shmemB += ntcol * K * NB * sizeof(CeedScalar);
363023b8a51Sabdelfattah83         shmemA += (tmode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
364023b8a51Sabdelfattah83         shmem = shmemA + shmemB;
365023b8a51Sabdelfattah83 
366023b8a51Sabdelfattah83         CeedInt       grid   = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), ntcol);
367023b8a51Sabdelfattah83         magma_trans_t transA = (tmode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
368023b8a51Sabdelfattah83         magma_trans_t transB = MagmaNoTrans;
369023b8a51Sabdelfattah83 
370023b8a51Sabdelfattah83         void *args[] = {&transA, &transB, &N, &impl->dgrad, &P, &du, &K, &dv, &M};
371023b8a51Sabdelfattah83         CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *grad, grid, M, ntcol, 1, shmem, args));
372023b8a51Sabdelfattah83       } else {
37380a9ef05SNatalie Beams         if (tmode == CEED_TRANSPOSE) {
37480a9ef05SNatalie Beams           CeedScalar beta = 0.0;
37580a9ef05SNatalie Beams           for (int d = 0; d < dim; d++) {
3762b730f8bSJeremy L Thompson             if (d > 0) beta = 1.0;
377023b8a51Sabdelfattah83             magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, impl->dgrad + d * P * Q, P, du + d * nelem * ncomp * Q, Q,
378023b8a51Sabdelfattah83                                  beta, dv, P, data->queue);
37980a9ef05SNatalie Beams           }
38080a9ef05SNatalie Beams         } else {
38180a9ef05SNatalie Beams           for (int d = 0; d < dim; d++)
382023b8a51Sabdelfattah83             magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, impl->dgrad + d * P * Q, P, du, P, 0.0,
383023b8a51Sabdelfattah83                                  dv + d * nelem * ncomp * Q, Q, data->queue);
384023b8a51Sabdelfattah83         }
385868539c2SNatalie Beams       }
3862b730f8bSJeremy L Thompson     } break;
387868539c2SNatalie Beams 
388868539c2SNatalie Beams     case CEED_EVAL_WEIGHT: {
3896574a04fSJeremy L Thompson       CeedCheck(tmode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
390868539c2SNatalie Beams 
391868539c2SNatalie Beams       int elemsPerBlock = 1;  // basis->Q1d < 7 ? optElems[basis->Q1d] : 1;
3922b730f8bSJeremy L Thompson       int grid          = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0);
3932b730f8bSJeremy L Thompson       magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, data->queue);
3942b730f8bSJeremy L Thompson     } break;
395868539c2SNatalie Beams 
396868539c2SNatalie Beams     // LCOV_EXCL_START
397868539c2SNatalie Beams     case CEED_EVAL_DIV:
398e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
399868539c2SNatalie Beams     case CEED_EVAL_CURL:
400e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
401868539c2SNatalie Beams     case CEED_EVAL_NONE:
4022b730f8bSJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
403868539c2SNatalie Beams       // LCOV_EXCL_STOP
404868539c2SNatalie Beams   }
405868539c2SNatalie Beams 
406e0582403Sabdelfattah83   // must sync to ensure completeness
407e0582403Sabdelfattah83   ceed_magma_queue_sync(data->queue);
408e0582403Sabdelfattah83 
409868539c2SNatalie Beams   if (emode != CEED_EVAL_WEIGHT) {
4102b730f8bSJeremy L Thompson     CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
411868539c2SNatalie Beams   }
4122b730f8bSJeremy L Thompson   CeedCallBackend(CeedVectorRestoreArray(V, &dv));
413e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
414868539c2SNatalie Beams }
415868539c2SNatalie Beams 
416868539c2SNatalie Beams #ifdef __cplusplus
417868539c2SNatalie Beams CEED_INTERN "C"
418868539c2SNatalie Beams #endif
4192b730f8bSJeremy L Thompson     int
4202b730f8bSJeremy L Thompson     CeedBasisDestroy_Magma(CeedBasis basis) {
4217f5b9731SStan Tomov   CeedBasis_Magma *impl;
4222b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
4237f5b9731SStan Tomov 
4242b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqref1d));
4252b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dinterp1d));
4262b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dgrad1d));
4272b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqweight1d));
428f6af633fSnbeams   Ceed ceed;
4292b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
430e5f091ebSnbeams #ifdef CEED_MAGMA_USE_HIP
4312b730f8bSJeremy L Thompson   CeedCallHip(ceed, hipModuleUnload(impl->module));
432f6af633fSnbeams #else
4332b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuModuleUnload(impl->module));
434f6af633fSnbeams #endif
4357f5b9731SStan Tomov 
4362b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
4377f5b9731SStan Tomov 
438e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
4397f5b9731SStan Tomov }
4407f5b9731SStan Tomov 
4417f5b9731SStan Tomov #ifdef __cplusplus
4427f5b9731SStan Tomov CEED_INTERN "C"
4437f5b9731SStan Tomov #endif
4442b730f8bSJeremy L Thompson     int
4452b730f8bSJeremy L Thompson     CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
446868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
4472b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetData(basis, &impl));
448868539c2SNatalie Beams 
4492b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqref));
4502b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dinterp));
4512b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dgrad));
4522b730f8bSJeremy L Thompson   CeedCallBackend(magma_free(impl->dqweight));
453023b8a51Sabdelfattah83   Ceed ceed;
454023b8a51Sabdelfattah83   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
455023b8a51Sabdelfattah83 #ifdef CEED_MAGMA_USE_HIP
456023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
457023b8a51Sabdelfattah83     CeedCallHip(ceed, hipModuleUnload(impl->module[in]));
458023b8a51Sabdelfattah83   }
459023b8a51Sabdelfattah83 #else
460023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
461023b8a51Sabdelfattah83     CeedCallCuda(ceed, cuModuleUnload(impl->module[in]));
462023b8a51Sabdelfattah83   }
463023b8a51Sabdelfattah83 #endif
4642b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&impl));
465868539c2SNatalie Beams 
466e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
467868539c2SNatalie Beams }
468868539c2SNatalie Beams 
469868539c2SNatalie Beams #ifdef __cplusplus
470868539c2SNatalie Beams CEED_INTERN "C"
471868539c2SNatalie Beams #endif
4722b730f8bSJeremy L Thompson     int
4732b730f8bSJeremy L Thompson     CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d,
4742b730f8bSJeremy L Thompson                                   const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis) {
4757f5b9731SStan Tomov   CeedBasis_Magma *impl;
4762b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &impl));
4777f5b9731SStan Tomov   Ceed ceed;
4782b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
4797f5b9731SStan Tomov 
480c9f8acf2SJeremy L Thompson   // Check for supported parameters
481c9f8acf2SJeremy L Thompson   CeedInt ncomp = 0;
4822b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp));
483e0582403Sabdelfattah83   Ceed_Magma *data;
4842b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
485e0582403Sabdelfattah83 
486f6af633fSnbeams   // Compile kernels
487f6af633fSnbeams   char *magma_common_path;
488f6af633fSnbeams   char *interp_path, *grad_path, *weight_path;
489f6af633fSnbeams   char *basis_kernel_source;
490023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
49123d4529eSJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
4922b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
493023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_tensor.h", &magma_common_path));
494023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
495f6af633fSnbeams   char   *interp_name_base = "ceed/jit-source/magma/interp";
496f6af633fSnbeams   CeedInt interp_name_len  = strlen(interp_name_base) + 6;
497f6af633fSnbeams   char    interp_name[interp_name_len];
4982b730f8bSJeremy L Thompson   snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim);
4992b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path));
5002b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
501f6af633fSnbeams   char   *grad_name_base = "ceed/jit-source/magma/grad";
502f6af633fSnbeams   CeedInt grad_name_len  = strlen(grad_name_base) + 6;
503f6af633fSnbeams   char    grad_name[grad_name_len];
5042b730f8bSJeremy L Thompson   snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim);
5052b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path));
5062b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
507f6af633fSnbeams   char   *weight_name_base = "ceed/jit-source/magma/weight";
508f6af633fSnbeams   CeedInt weight_name_len  = strlen(weight_name_base) + 6;
509f6af633fSnbeams   char    weight_name[weight_name_len];
5102b730f8bSJeremy L Thompson   snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim);
5112b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path));
5122b730f8bSJeremy L Thompson   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source));
51323d4529eSJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
514f6af633fSnbeams   // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip
515f6af633fSnbeams   // data
516f6af633fSnbeams   Ceed delegate;
5172b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetDelegate(ceed, &delegate));
5182b730f8bSJeremy L Thompson   CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", ncomp, "P", P1d, "Q", Q1d, "MAXPQ",
5192b730f8bSJeremy L Thompson                                    CeedIntMax(P1d, Q1d)));
520f6af633fSnbeams 
521f6af633fSnbeams   // Kernel setup
522f6af633fSnbeams   switch (dim) {
523f6af633fSnbeams     case 1:
5242b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->magma_interp));
5252b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->magma_interp_tr));
5262b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->magma_grad));
5272b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->magma_grad_tr));
5282b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->magma_weight));
529f6af633fSnbeams       break;
530f6af633fSnbeams     case 2:
5312b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->magma_interp));
5322b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->magma_interp_tr));
5332b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->magma_grad));
5342b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->magma_grad_tr));
5352b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->magma_weight));
536f6af633fSnbeams       break;
537f6af633fSnbeams     case 3:
5382b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->magma_interp));
5392b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->magma_interp_tr));
5402b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->magma_grad));
5412b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->magma_grad_tr));
5422b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->magma_weight));
543f6af633fSnbeams   }
544f6af633fSnbeams 
5452b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
5462b730f8bSJeremy L Thompson   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
5477f5b9731SStan Tomov 
5487f5b9731SStan Tomov   // Copy qref1d to the GPU
5492b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqref1d, Q1d * sizeof(qref1d[0])));
5502b730f8bSJeremy L Thompson   magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, data->queue);
5517f5b9731SStan Tomov 
5527f5b9731SStan Tomov   // Copy interp1d to the GPU
5532b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dinterp1d, Q1d * P1d * sizeof(interp1d[0])));
5542b730f8bSJeremy L Thompson   magma_setvector(Q1d * P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, data->queue);
5557f5b9731SStan Tomov 
5567f5b9731SStan Tomov   // Copy grad1d to the GPU
5572b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dgrad1d, Q1d * P1d * sizeof(grad1d[0])));
5582b730f8bSJeremy L Thompson   magma_setvector(Q1d * P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, data->queue);
5597f5b9731SStan Tomov 
5607f5b9731SStan Tomov   // Copy qweight1d to the GPU
5612b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqweight1d, Q1d * sizeof(qweight1d[0])));
5622b730f8bSJeremy L Thompson   magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, data->queue);
5637f5b9731SStan Tomov 
5642b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisSetData(basis, impl));
5652b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&magma_common_path));
5662b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&interp_path));
5672b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&grad_path));
5682b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&weight_path));
5692b730f8bSJeremy L Thompson   CeedCallBackend(CeedFree(&basis_kernel_source));
570f6af633fSnbeams 
571e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
5727f5b9731SStan Tomov }
5737f5b9731SStan Tomov 
5747f5b9731SStan Tomov #ifdef __cplusplus
5757f5b9731SStan Tomov CEED_INTERN "C"
5767f5b9731SStan Tomov #endif
5772b730f8bSJeremy L Thompson     int
5782b730f8bSJeremy L Thompson     CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, const CeedScalar *grad,
5792b730f8bSJeremy L Thompson                             const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis) {
580868539c2SNatalie Beams   CeedBasisNonTensor_Magma *impl;
5817f5b9731SStan Tomov   Ceed                      ceed;
5822b730f8bSJeremy L Thompson   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
5837f5b9731SStan Tomov 
584e0582403Sabdelfattah83   Ceed_Magma *data;
5852b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &data));
586023b8a51Sabdelfattah83   magma_int_t arch = magma_getdevice_arch();
5872b730f8bSJeremy L Thompson   CeedCallBackend(CeedCalloc(1, &impl));
588023b8a51Sabdelfattah83   // Compile kernels
589023b8a51Sabdelfattah83   char *magma_common_path;
590023b8a51Sabdelfattah83   char *interp_path, *grad_path;
591023b8a51Sabdelfattah83   char *basis_kernel_source;
592023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
59323d4529eSJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
594023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
595023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_nontensor.h", &magma_common_path));
596023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
597023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/interp-nontensor.h", &interp_path));
598023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
599023b8a51Sabdelfattah83   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/grad-nontensor.h", &grad_path));
600023b8a51Sabdelfattah83   CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
601023b8a51Sabdelfattah83 
602023b8a51Sabdelfattah83   // tuning parameters for nb
603023b8a51Sabdelfattah83   CeedInt nb_interp_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
604023b8a51Sabdelfattah83   CeedInt nb_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
605023b8a51Sabdelfattah83   CeedInt nb_grad_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
606023b8a51Sabdelfattah83   CeedInt nb_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
607023b8a51Sabdelfattah83   CeedInt P = ndof, Q = nqpts;
608023b8a51Sabdelfattah83   CeedInt Narray[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
609023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
610023b8a51Sabdelfattah83     nb_interp_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_NOTRANSPOSE, P, Narray[in], Q);
611023b8a51Sabdelfattah83     nb_interp_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_TRANSPOSE, P, Narray[in], Q);
612023b8a51Sabdelfattah83     nb_grad_n[in]   = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_NOTRANSPOSE, P, Narray[in], Q);
613023b8a51Sabdelfattah83     nb_grad_t[in]   = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_TRANSPOSE, P, Narray[in], Q);
614023b8a51Sabdelfattah83   }
615023b8a51Sabdelfattah83 
616023b8a51Sabdelfattah83   // compile
617023b8a51Sabdelfattah83   Ceed delegate;
618023b8a51Sabdelfattah83   CeedCallBackend(CeedGetDelegate(ceed, &delegate));
619023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
620023b8a51Sabdelfattah83     CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module[in], 7, "DIM", dim, "P", P, "Q", Q, "NB_INTERP_N", nb_interp_n[in],
621023b8a51Sabdelfattah83                                      "NB_INTERP_T", nb_interp_t[in], "NB_GRAD_N", nb_grad_n[in], "NB_GRAD_T", nb_grad_t[in]));
622023b8a51Sabdelfattah83   }
623023b8a51Sabdelfattah83 
624023b8a51Sabdelfattah83   // get kernels
625023b8a51Sabdelfattah83   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
626023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_n", &impl->magma_interp_nontensor[in]));
627023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_t", &impl->magma_interp_tr_nontensor[in]));
628023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_n", &impl->magma_grad_nontensor[in]));
629023b8a51Sabdelfattah83     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_t", &impl->magma_grad_tr_nontensor[in]));
630023b8a51Sabdelfattah83   }
631023b8a51Sabdelfattah83 
632023b8a51Sabdelfattah83   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
633023b8a51Sabdelfattah83   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
634868539c2SNatalie Beams 
635868539c2SNatalie Beams   // Copy qref to the GPU
6362b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqref, nqpts * sizeof(qref[0])));
637e0582403Sabdelfattah83   magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue);
638868539c2SNatalie Beams 
639868539c2SNatalie Beams   // Copy interp to the GPU
6402b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dinterp, nqpts * ndof * sizeof(interp[0])));
6412b730f8bSJeremy L Thompson   magma_setvector(nqpts * ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, data->queue);
642868539c2SNatalie Beams 
643868539c2SNatalie Beams   // Copy grad to the GPU
6442b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dgrad, nqpts * ndof * dim * sizeof(grad[0])));
6452b730f8bSJeremy L Thompson   magma_setvector(nqpts * ndof * dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, data->queue);
646868539c2SNatalie Beams 
647868539c2SNatalie Beams   // Copy qweight to the GPU
6482b730f8bSJeremy L Thompson   CeedCallBackend(magma_malloc((void **)&impl->dqweight, nqpts * sizeof(qweight[0])));
6492b730f8bSJeremy L Thompson   magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, data->queue);
650868539c2SNatalie Beams 
651023b8a51Sabdelfattah83   CeedCallBackend(CeedBasisSetData(basis, impl));
652023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&magma_common_path));
653023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&interp_path));
654023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&grad_path));
655023b8a51Sabdelfattah83   CeedCallBackend(CeedFree(&basis_kernel_source));
656e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
6577f5b9731SStan Tomov }
658