xref: /libCEED/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h (revision db2becc9f302fe8eb3a32ace50ce3f3a5d42e6c4)
15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
29d15e85bSSebastian Grimberg // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39d15e85bSSebastian Grimberg //
49d15e85bSSebastian Grimberg // SPDX-License-Identifier: BSD-2-Clause
59d15e85bSSebastian Grimberg //
69d15e85bSSebastian Grimberg // This file is part of CEED:  http://github.com/ceed
79d15e85bSSebastian Grimberg 
89d15e85bSSebastian Grimberg /// @file
99d15e85bSSebastian Grimberg /// Internal header for MAGMA non-tensor basis interpolation
109d15e85bSSebastian Grimberg 
119d15e85bSSebastian Grimberg #include "magma-common-nontensor.h"
129d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Multi-component "notrans" apply for a non-tensor basis.
//
// For each of the Q_COMP components d, reads the d-th P x Q slice of dA
// (transposed into registers), multiplies it against a shared P x NB tile of
// dB, and writes a Q x NB tile into the d-th Q x n output plane of dC
// (dA advances by Q * P and dC by Q * n per component).
//
// Work decomposition (from the pointer arithmetic below): each ty "column
// group" of the block handles NB columns of B/C, starting at column id * NB,
// where id = blockIdx.x * blockDim.y + ty.
//
// Shared-memory layout: blockDim.y per-ty B tiles (P * NB scalars each),
// followed by a block-wide staging area for A used by the read helper.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;          // per-ty B tile
  CeedScalar *sA = shared_data + blockDim.y * P * NB;  // A staging area, shared by the whole block

  // read B once for all C's
  // (groups past the tail skip the global read but still reach the barriers below)
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    // NOTE(review): sB is consumed here with no explicit barrier after the
    // write above; this relies on read_A_trans_g2r_1D_nosync synchronizing
    // the block internally -- confirm against magma-common-nontensor.h.
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // advance to the next component's A matrix and C output plane
    dA += Q * P;
    dC += Q * n;

    // make the A staging area safe to overwrite on the next iteration
    __syncthreads();
  }
}
559d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Multi-component "trans" apply for a non-tensor basis.
//
// For each of the Q_COMP components d, reads the d-th P x Q slice of dA
// (non-transposed into registers) and a shared Q x NB tile of the d-th
// Q x n plane of dB, and accumulates into a per-thread P x NB register tile
// (addmul); the accumulated result is written to dC once after the loop.
//
// Work decomposition: each ty "column group" of the block handles NB columns,
// starting at column id * NB, where id = blockIdx.x * blockDim.y + ty.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the per-ty sB tiles share the same shared-memory region (for
  // ty == 0 they coincide): A is staged and copied to registers first, then
  // the space is reused for B; the barriers below order the two uses.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // per-thread output accumulator, summed over components
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    // (groups past the tail skip the global read but still reach the barriers)
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component's A matrix and B input plane
    dA += P * Q;
    dB += Q * n;

    // make the shared region safe to overwrite on the next iteration
    __syncthreads();
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
1019d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Multi-component "trans-add" apply for a non-tensor basis.
//
// Identical to magma_basis_nontensor_device_t except that the accumulated
// P x NB result is ADDED into dC (sum_C_r2g) instead of overwriting it.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the per-ty sB tiles share the same shared-memory region: A is
  // staged and copied to registers first, then the space is reused for B;
  // the barriers below order the two uses.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // per-thread output accumulator, summed over components
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    // (groups past the tail skip the global read but still reach the barriers)
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component's A matrix and B input plane
    dA += P * Q;
    dB += Q * n;

    // make the shared region safe to overwrite on the next iteration
    __syncthreads();
  }

  // sum into C
  if (id < nblocks) {
    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
147*db2becc9SJeremy L Thompson 
////////////////////////////////////////////////////////////////////////////////
// Single-component "notrans" apply for a non-tensor basis (the Q_COMP == 1
// specialization of magma_basis_nontensor_device_n: no component loop).
//
// Reads the P x Q matrix dA transposed into registers, stages a P x NB tile
// of dB in shared memory, and writes a Q x NB tile of dC. Each ty "column
// group" of the block handles NB columns, starting at column id * NB, where
// id = blockIdx.x * blockDim.y + ty.
//
// Fix: the original terminated no-work threads with `if (id >= nblocks)
// return;` before a later __syncthreads(); a barrier reached under divergent
// control flow is undefined behavior per the CUDA programming guide. All
// threads now reach every barrier and only the global loads/stores are
// guarded, matching the style of magma_basis_nontensor_device_n.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q; the A staging area and the per-ty B tiles share the same
  // shared-memory region -- the barriers below order the two uses.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // read A using all threads
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // read B (groups past the tail skip the global read but still hit the barrier)
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }
  __syncthreads();

  // multiply and write C; no barrier follows, so the whole tail is guarded
  if (id < nblocks) {
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);
    write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
  }
}
18386ad04ccSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Single-component "trans" apply for a non-tensor basis (the Q_COMP == 1
// specialization of magma_basis_nontensor_device_t: no component loop and no
// accumulation).
//
// Reads the P x Q matrix dA non-transposed into registers, stages a Q x NB
// tile of dB in shared memory, and writes a P x NB tile of dC. Each ty
// "column group" of the block handles NB columns, starting at column id * NB,
// where id = blockIdx.x * blockDim.y + ty.
//
// Fix: the original terminated no-work threads with `if (id >= nblocks)
// return;` before a later __syncthreads(); a barrier reached under divergent
// control flow is undefined behavior per the CUDA programming guide. All
// threads now reach every barrier and only the global loads/stores are
// guarded, matching the style of magma_basis_nontensor_device_t.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q; the A staging area and the per-ty B tiles share the same
  // shared-memory region -- the barriers below order the two uses.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // read B (groups past the tail skip the global read but still hit the barrier)
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  }
  __syncthreads();

  // multiply and write C; no barrier follows, so the whole tail is guarded
  if (id < nblocks) {
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
21986ad04ccSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Single-component "trans-add" apply for a non-tensor basis: identical to
// magma_basis_nontensor_device_t1 except the P x NB result is ADDED into dC
// (sum_C_r2g) instead of overwriting it.
//
// Fix: the original terminated no-work threads with `if (id >= nblocks)
// return;` before a later __syncthreads(); a barrier reached under divergent
// control flow is undefined behavior per the CUDA programming guide. All
// threads now reach every barrier and only the global loads/stores are
// guarded, matching the style of magma_basis_nontensor_device_ta.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                   CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q; the A staging area and the per-ty B tiles share the same
  // shared-memory region -- the barriers below order the two uses.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // read B (groups past the tail skip the global read but still hit the barrier)
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  }
  __syncthreads();

  // multiply and sum into C; no barrier follows, so the whole tail is guarded
  if (id < nblocks) {
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);
    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
255*db2becc9SJeremy L Thompson 
256*db2becc9SJeremy L Thompson ////////////////////////////////////////////////////////////////////////////////
2579d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
2589d15e85bSSebastian Grimberg     void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
2599d15e85bSSebastian Grimberg   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
2609d15e85bSSebastian Grimberg 
26186ad04ccSSebastian Grimberg #if BASIS_Q_COMP_INTERP == 1
2629d15e85bSSebastian Grimberg   magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
26386ad04ccSSebastian Grimberg #else
2649d15e85bSSebastian Grimberg   magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
26586ad04ccSSebastian Grimberg #endif
2669d15e85bSSebastian Grimberg }
2679d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Kernel entry point: non-tensor interpolation, "trans" direction (overwrites
// the output). Dispatches at JIT-compile time on BASIS_Q_COMP_INTERP.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
2799d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Kernel entry point: non-tensor interpolation, "trans" direction with
// accumulation (sums into the output). Dispatches at JIT-compile time on
// BASIS_Q_COMP_INTERP.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
291*db2becc9SJeremy L Thompson 
292*db2becc9SJeremy L Thompson ////////////////////////////////////////////////////////////////////////////////
2939d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
2949d15e85bSSebastian Grimberg     void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
2959d15e85bSSebastian Grimberg   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
2969d15e85bSSebastian Grimberg 
29786ad04ccSSebastian Grimberg #if BASIS_Q_COMP_DERIV == 1
2989d15e85bSSebastian Grimberg   magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
29986ad04ccSSebastian Grimberg #else
3009d15e85bSSebastian Grimberg   magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
30186ad04ccSSebastian Grimberg #endif
3029d15e85bSSebastian Grimberg }
3039d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Kernel entry point: non-tensor derivative, "trans" direction (overwrites
// the output). Dispatches at JIT-compile time on BASIS_Q_COMP_DERIV.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
315*db2becc9SJeremy L Thompson 
////////////////////////////////////////////////////////////////////////////////
// Kernel entry point: non-tensor derivative, "trans" direction with
// accumulation (sums into the output). Dispatches at JIT-compile time on
// BASIS_Q_COMP_DERIV.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
327