// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for MAGMA non-tensor basis interpolation
#include "magma-common-nontensor.h"

////////////////////////////////////////////////////////////////////////////////
// NoTranspose apply over Q_COMP components: for each component d, applies the
// d-th P x Q matrix of dA to this thread column's P x NB slice of dB and writes
// the resulting Q x NB block of dC. B is staged into shared memory once and
// reused for every component.
//
//   n           - total number of columns in B and C
//   dA          - Q_COMP component matrices, each P x Q, stored consecutively
//   dB          - input; this thread column handles columns [id*NB, id*NB + myn)
//   dC          - output; advanced by Q * n per component
//   shared_data - layout: blockDim.y slices of P*NB for B, then P*Q for A
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  // global NB-column-block index handled by this thread column (ty)
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // ceil(n / NB): total column blocks
  const int myn = min(NB, n - id * NB);   // columns in this (possibly partial tail) block

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;          // per-ty slice of B
  CeedScalar *sA = shared_data + blockDim.y * P * NB;  // A staging, shared by the whole thread block

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    // rC = rA applied to sB for this thread's output row
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // advance to the next component of A / next component block of C
    dA += Q * P;
    dC += Q * n;

    // barrier before sA is rewritten in the next iteration
    __syncthreads();
  }
}
549d15e85bSSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// Transpose apply over Q_COMP components: accumulates the contribution of every
// component into registers rC (addmul per iteration) and writes the P x NB
// output block to dC exactly once at the end.
//
//   n           - total number of columns in B and C
//   dA          - Q_COMP component matrices, each P x Q, stored consecutively
//   dB          - input; advanced by Q * n per component
//   dC          - output; this thread column handles columns [id*NB, id*NB + myn)
//   shared_data - A staging at offset 0; per-ty B slices at ty * Q * NB.
//                 NOTE(review): the ty == 0 B slice aliases sA, so the
//                 __syncthreads() after read_A must separate the two uses.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  // global NB-column-block index handled by this thread column (ty)
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // ceil(n / NB): total column blocks
  const int myn = min(NB, n - id * NB);   // columns in this (possibly partial tail) block

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // accumulator across all components; written to C only after the loop
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    // rC += rA applied to sB
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component of A / next component block of B
    dA += P * Q;
    dB += Q * n;

    // barrier before sA/sB are rewritten in the next iteration
    __syncthreads();
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
1009d15e85bSSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// Transpose apply over Q_COMP components, accumulate variant: identical
// structure to magma_basis_nontensor_device_t, but the final result is summed
// into dC (sum_C_r2g) rather than the plain write used by the _t variant.
//
//   n           - total number of columns in B and C
//   dA          - Q_COMP component matrices, each P x Q, stored consecutively
//   dB          - input; advanced by Q * n per component
//   dC          - output, accumulated into; this thread column handles columns [id*NB, id*NB + myn)
//   shared_data - A staging at offset 0; per-ty B slices at ty * Q * NB.
//                 NOTE(review): the ty == 0 B slice aliases sA, so the
//                 __syncthreads() after read_A must separate the two uses.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  // global NB-column-block index handled by this thread column (ty)
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // ceil(n / NB): total column blocks
  const int myn = min(NB, n - id * NB);   // columns in this (possibly partial tail) block

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // accumulator across all components; summed into C only after the loop
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    // rC += rA applied to sB
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component of A / next component block of B
    dA += P * Q;
    dB += Q * n;

    // barrier before sA/sB are rewritten in the next iteration
    __syncthreads();
  }

  // sum into C
  if (id < nblocks) {
    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
146db2becc9SJeremy L Thompson
////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) specialization of the NoTranspose apply:
// no component loop, so after A is staged, thread columns with no block
// assigned can exit early.
//
//   n           - total number of columns in B and C
//   dA          - one P x Q matrix
//   dB          - input; this thread column handles columns [id*NB, id*NB + myn)
//   dC          - output
//   shared_data - A staging at offset 0; per-ty B slices at ty * P * NB
//                 (NOTE(review): ty == 0 slice aliases sA; the barriers below
//                 keep the A and B uses apart)
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  // global NB-column-block index handled by this thread column (ty)
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // ceil(n / NB): total column blocks
  const int myn = min(NB, n - id * NB);   // columns in this (possibly partial tail) block

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // read A using all threads
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // (exited threads do not take part in the __syncthreads() below; all
  //  surviving threads reach it together)
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();

  // rC = rA applied to sB for this thread's output row
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}
18286ad04ccSSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) specialization of the Transpose apply:
// no component loop and no accumulation across components; result is written
// once. Thread columns with no block assigned exit after A is staged.
//
//   n           - total number of columns in B and C
//   dA          - one P x Q matrix
//   dB          - input; this thread column handles columns [id*NB, id*NB + myn)
//   dC          - output
//   shared_data - A staging at offset 0; per-ty B slices at ty * Q * NB
//                 (NOTE(review): ty == 0 slice aliases sA; the barriers below
//                 keep the A and B uses apart)
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  // global NB-column-block index handled by this thread column (ty)
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // ceil(n / NB): total column blocks
  const int myn = min(NB, n - id * NB);   // columns in this (possibly partial tail) block

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // (exited threads do not take part in the __syncthreads() below; all
  //  surviving threads reach it together)
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  // rC = rA applied to sB for this thread's output row
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}
21886ad04ccSSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) specialization of the Transpose apply,
// accumulate variant: identical structure to magma_basis_nontensor_device_t1,
// but the result is summed into dC (sum_C_r2g) rather than overwriting it.
//
//   n           - total number of columns in B and C
//   dA          - one P x Q matrix
//   dB          - input; this thread column handles columns [id*NB, id*NB + myn)
//   dC          - output, accumulated into
//   shared_data - A staging at offset 0; per-ty B slices at ty * Q * NB
//                 (NOTE(review): ty == 0 slice aliases sA; the barriers below
//                 keep the A and B uses apart)
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                   CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  // global NB-column-block index handled by this thread column (ty)
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // ceil(n / NB): total column blocks
  const int myn = min(NB, n - id * NB);   // columns in this (possibly partial tail) block

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // (exited threads do not take part in the __syncthreads() below; all
  //  surviving threads reach it together)
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  // rC = rA applied to sB for this thread's output row
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // sum into C
  sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}
254db2becc9SJeremy L Thompson
////////////////////////////////////////////////////////////////////////////////
// Kernel entry: non-tensor interp apply, NoTranspose direction. A thin
// dispatcher resolved at JIT-compile time: the multi-component device routine
// is used unless BASIS_Q_COMP_INTERP is 1, in which case the loop-free
// single-component specialization is selected.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
2669d15e85bSSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// Kernel entry: non-tensor interp apply, Transpose direction. A thin
// dispatcher resolved at JIT-compile time: the multi-component device routine
// is used unless BASIS_Q_COMP_INTERP is 1, in which case the loop-free
// single-component specialization is selected.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
2789d15e85bSSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// Kernel entry: non-tensor interp apply, Transpose direction with accumulation
// into the output. Dispatches at JIT-compile time to the multi-component device
// routine, or to the single-component specialization when BASIS_Q_COMP_INTERP
// is 1. Uses the same NB tuning constant as the plain Transpose kernel.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
290db2becc9SJeremy L Thompson
////////////////////////////////////////////////////////////////////////////////
// Kernel entry: non-tensor deriv apply, NoTranspose direction. A thin
// dispatcher resolved at JIT-compile time: the multi-component device routine
// is used unless BASIS_Q_COMP_DERIV is 1, in which case the loop-free
// single-component specialization is selected.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
3029d15e85bSSebastian Grimberg
////////////////////////////////////////////////////////////////////////////////
// Kernel entry: non-tensor deriv apply, Transpose direction. A thin
// dispatcher resolved at JIT-compile time: the multi-component device routine
// is used unless BASIS_Q_COMP_DERIV is 1, in which case the loop-free
// single-component specialization is selected.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
314db2becc9SJeremy L Thompson
////////////////////////////////////////////////////////////////////////////////
// Kernel entry: non-tensor deriv apply, Transpose direction with accumulation
// into the output. Dispatches at JIT-compile time to the multi-component device
// routine, or to the single-component specialization when BASIS_Q_COMP_DERIV
// is 1. Uses the same NB tuning constant as the plain Transpose kernel.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
326