// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for MAGMA non-tensor basis interpolation

#include "magma-common-nontensor.h"

////////////////////////////////////////////////////////////////////////////////
// Non-tensor "forward" (no-transpose) apply, multi-component version.
//
// For each of the Q_COMP components, applies the basis matrix A (P x Q) to the
// P x n input dB, producing the Q x n output dC.  (The exact transpose
// convention of the product is defined by mul_rAsBrC_1D_nosync in
// magma-common-nontensor.h.)
//
// Thread/work mapping (from the index arithmetic below):
//   - the n columns are split into ceil(n / NB) blocks of NB columns;
//   - each ty-slice of the thread block handles one column block, indexed
//     globally by id = blockIdx.x * blockDim.y + ty;
//   - myn is the (possibly partial) width of this thread's column block.
//
// Shared-memory layout: one P x NB staging area for B per ty-slice
// (blockDim.y of them), followed by a single staging area for A shared by the
// whole thread block — so sB and sA never alias, which is what allows B to be
// read into shared memory once, before the component loop.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // total number of NB-wide column blocks
  const int myn     = min(NB, n - id * NB);  // width of this thread's block (partial at the tail)

  // advance to this thread's column block
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;            // per-ty B staging
  CeedScalar *sA = shared_data + blockDim.y * P * NB;    // block-wide A staging, after all sB slices

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }
  // NOTE(review): no explicit __syncthreads() between the sB write above and
  // the sB read in mul_rAsBrC below — presumably read_A_trans_g2r_1D_nosync
  // contains an internal barrier that also covers it; verify in
  // magma-common-nontensor.h.

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads (staged through sA into registers rA)
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C (only threads that own a column block)
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // advance to the next component: A matrices are packed Q*P apart,
    // C components are packed Q*n apart
    dA += Q * P;
    dC += Q * n;

    __syncthreads();
  }
}

////////////////////////////////////////////////////////////////////////////////
// Non-tensor "transpose" apply, multi-component version.
//
// For each of the Q_COMP components, applies the basis matrix A (P x Q) in the
// transposed direction: the Q x n input dB contributes to the P x n output dC,
// accumulating across components into the register tile rC (addmul) and
// writing C once at the end.
//
// Shared-memory layout: sA starts at shared_data and the per-ty sB slice for
// ty == 0 ALSO starts at shared_data — A and B deliberately reuse the same
// shared memory.  This is safe because A is first staged through sA into
// registers rA, then the __syncthreads() below lets sB overwrite that space.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // total number of NB-wide column blocks
  const int myn     = min(NB, n - id * NB);  // width of this thread's block (partial at the tail)

  // advance to this thread's column block
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;                  // block-wide A staging (reused by sB below)
  CeedScalar *sB = shared_data + ty * Q * NB;    // per-ty B staging

  CeedScalar rC[NB] = {0.0};  // accumulates across all Q_COMP components

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads (staged through sA into registers rA)
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();  // A is safely in registers; sB may now overwrite the shared space

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component: A matrices are packed P*Q apart,
    // B components are packed Q*n apart
    dA += P * Q;
    dB += Q * n;

    __syncthreads();
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}


////////////////////////////////////////////////////////////////////////////////
// Non-tensor "transpose-add" apply, multi-component version.
//
// Identical structure to magma_basis_nontensor_device_t above, except the
// final write uses sum_C_r2g_1D_nosync — presumably accumulating (summing)
// into dC instead of overwriting it; confirm against the helper in
// magma-common-nontensor.h.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // total number of NB-wide column blocks
  const int myn     = min(NB, n - id * NB);  // width of this thread's block (partial at the tail)

  // advance to this thread's column block
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;                  // block-wide A staging (reused by sB below)
  CeedScalar *sB = shared_data + ty * Q * NB;    // per-ty B staging; ty == 0 aliases sA

  CeedScalar rC[NB] = {0.0};  // accumulates across all Q_COMP components

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads (staged through sA into registers rA)
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();  // A is safely in registers; sB may now overwrite the shared space

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component: A matrices are packed P*Q apart,
    // B components are packed Q*n apart
    dA += P * Q;
    dB += Q * n;

    __syncthreads();
  }

  // sum into C
  if (id < nblocks) {
    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Non-tensor "forward" apply, single-component fast path (Q_COMP == 1).
//
// Same work mapping as magma_basis_nontensor_device_n, but with no component
// loop: A is read once, then idle ty-slices exit early.
// Shared-memory note: sB for ty == 0 aliases sA; safe because A is staged into
// registers rA and the __syncthreads() below precedes the sB write.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // total number of NB-wide column blocks
  const int myn     = min(NB, n - id * NB);  // width of this thread's block (partial at the tail)

  // advance to this thread's column block
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // read A using all threads
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // NOTE(review): the __syncthreads() after read_B below is then executed by a
  // subset of the block; this relies on exited threads not being required at
  // the barrier — verify for all targeted architectures.
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Non-tensor "transpose" apply, single-component fast path (Q_COMP == 1).
// Mirror of magma_basis_nontensor_device_n1 with the P/Q roles exchanged and
// A read in the no-transpose direction.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // total number of NB-wide column blocks
  const int myn     = min(NB, n - id * NB);  // width of this thread's block (partial at the tail)

  // advance to this thread's column block
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Non-tensor "transpose-add" apply, single-component fast path (Q_COMP == 1).
// Identical to magma_basis_nontensor_device_t1 except the final write uses
// sum_C_r2g_1D_nosync (accumulate into dC rather than overwrite).
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                   CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // total number of NB-wide column blocks
  const int myn     = min(NB, n - id * NB);  // width of this thread's block (partial at the tail)

  // advance to this thread's column block
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // sum into C
  sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: interpolation, forward direction.
// Dispatches at compile time to the single-component fast path when
// BASIS_Q_COMP_INTERP == 1.  Dynamic shared memory is declared via
// MAGMA_DEVICE_SHARED; launch bounds come from MAGMA_BASIS_BOUNDS.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: interpolation, transpose direction (overwrites dC).
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: interpolation, transpose-add direction (accumulates into dC).
// Reuses the transpose blocking factor BASIS_NB_INTERP_T.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: derivative, forward direction.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: derivative, transpose direction (overwrites dC).
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: derivative, transpose-add direction (accumulates into dC).
// Reuses the transpose blocking factor BASIS_NB_DERIV_T.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}