xref: /libCEED/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h (revision db2becc9f302fe8eb3a32ace50ce3f3a5d42e6c4)
15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
29d15e85bSSebastian Grimberg // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39d15e85bSSebastian Grimberg //
49d15e85bSSebastian Grimberg // SPDX-License-Identifier: BSD-2-Clause
59d15e85bSSebastian Grimberg //
69d15e85bSSebastian Grimberg // This file is part of CEED:  http://github.com/ceed
79d15e85bSSebastian Grimberg 
89d15e85bSSebastian Grimberg /// @file
99d15e85bSSebastian Grimberg /// Internal header for MAGMA non-tensor basis interpolation
109d15e85bSSebastian Grimberg 
119d15e85bSSebastian Grimberg #include "magma-common-nontensor.h"
129d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Multi-component "notrans" apply for a non-tensor basis.
//
// For each of the Q_COMP components d, reads the d-th P x Q slice of dA
// (transposed into registers), multiplies it against a shared P x NB tile of
// dB, and writes a Q x NB tile into the d-th Q x n output plane of dC
// (dA advances by Q * P and dC by Q * n per component).
//
// Work decomposition (from the pointer arithmetic below): each ty "column
// group" of the block handles NB columns of B/C, starting at column id * NB,
// where id = blockIdx.x * blockDim.y + ty.
//
// Shared-memory layout: blockDim.y per-ty B tiles (P * NB scalars each),
// followed by a block-wide staging area for A used by the read helper.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;          // per-ty B tile
  CeedScalar *sA = shared_data + blockDim.y * P * NB;  // A staging area, shared by the whole block

  // read B once for all C's
  // (groups past the tail skip the global read but still reach the barriers below)
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    // NOTE(review): sB is consumed here with no explicit barrier after the
    // write above; this relies on read_A_trans_g2r_1D_nosync synchronizing
    // the block internally -- confirm against magma-common-nontensor.h.
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // advance to the next component's A matrix and C output plane
    dA += Q * P;
    dC += Q * n;

    // make the A staging area safe to overwrite on the next iteration
    __syncthreads();
  }
}
559d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Multi-component "trans" apply for a non-tensor basis.
//
// For each of the Q_COMP components d, reads the d-th P x Q slice of dA
// (non-transposed into registers) and a shared Q x NB tile of the d-th
// Q x n plane of dB, and accumulates into a per-thread P x NB register tile
// (addmul); the accumulated result is written to dC once after the loop.
//
// Work decomposition: each ty "column group" of the block handles NB columns,
// starting at column id * NB, where id = blockIdx.x * blockDim.y + ty.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the per-ty sB tiles share the same shared-memory region (for
  // ty == 0 they coincide): A is staged and copied to registers first, then
  // the space is reused for B; the barriers below order the two uses.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // per-thread output accumulator, summed over components
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    // (groups past the tail skip the global read but still reach the barriers)
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component's A matrix and B input plane
    dA += P * Q;
    dB += Q * n;

    // make the shared region safe to overwrite on the next iteration
    __syncthreads();
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
1019d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Multi-component "trans-add" apply for a non-tensor basis.
//
// Identical to magma_basis_nontensor_device_t except that the accumulated
// P x NB result is ADDED into dC (sum_C_r2g) instead of overwriting it.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the per-ty sB tiles share the same shared-memory region: A is
  // staged and copied to registers first, then the space is reused for B;
  // the barriers below order the two uses.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // per-thread output accumulator, summed over components
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    // (groups past the tail skip the global read but still reach the barriers)
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component's A matrix and B input plane
    dA += P * Q;
    dB += Q * n;

    // make the shared region safe to overwrite on the next iteration
    __syncthreads();
  }

  // sum into C
  if (id < nblocks) {
    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
147*db2becc9SJeremy L Thompson 
////////////////////////////////////////////////////////////////////////////////
// Single-component "notrans" apply for a non-tensor basis (the Q_COMP == 1
// specialization of magma_basis_nontensor_device_n: no component loop).
//
// Reads the P x Q matrix dA transposed into registers, stages a P x NB tile
// of dB in shared memory, and writes a Q x NB tile of dC. Each ty "column
// group" of the block handles NB columns, starting at column id * NB, where
// id = blockIdx.x * blockDim.y + ty.
//
// Fix: the original terminated no-work threads with `if (id >= nblocks)
// return;` before a later __syncthreads(); a barrier reached under divergent
// control flow is undefined behavior per the CUDA programming guide. All
// threads now reach every barrier and only the global loads/stores are
// guarded, matching the style of magma_basis_nontensor_device_n.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q; the A staging area and the per-ty B tiles share the same
  // shared-memory region -- the barriers below order the two uses.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // read A using all threads
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // read B (groups past the tail skip the global read but still hit the barrier)
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }
  __syncthreads();

  // multiply and write C; no barrier follows, so the whole tail is guarded
  if (id < nblocks) {
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);
    write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
  }
}
18386ad04ccSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Single-component "trans" apply for a non-tensor basis (the Q_COMP == 1
// specialization of magma_basis_nontensor_device_t: no component loop and no
// accumulation).
//
// Reads the P x Q matrix dA non-transposed into registers, stages a Q x NB
// tile of dB in shared memory, and writes a P x NB tile of dC. Each ty
// "column group" of the block handles NB columns, starting at column id * NB,
// where id = blockIdx.x * blockDim.y + ty.
//
// Fix: the original terminated no-work threads with `if (id >= nblocks)
// return;` before a later __syncthreads(); a barrier reached under divergent
// control flow is undefined behavior per the CUDA programming guide. All
// threads now reach every barrier and only the global loads/stores are
// guarded, matching the style of magma_basis_nontensor_device_t.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q; the A staging area and the per-ty B tiles share the same
  // shared-memory region -- the barriers below order the two uses.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // read B (groups past the tail skip the global read but still hit the barrier)
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  }
  __syncthreads();

  // multiply and write C; no barrier follows, so the whole tail is guarded
  if (id < nblocks) {
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
21986ad04ccSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Single-component "trans-add" apply for a non-tensor basis: identical to
// magma_basis_nontensor_device_t1 except the P x NB result is ADDED into dC
// (sum_C_r2g) instead of overwriting it.
//
// Fix: the original terminated no-work threads with `if (id >= nblocks)
// return;` before a later __syncthreads(); a barrier reached under divergent
// control flow is undefined behavior per the CUDA programming guide. All
// threads now reach every barrier and only the global loads/stores are
// guarded, matching the style of magma_basis_nontensor_device_ta.
//
// NOTE: template parameter T is unused; CeedScalar is used throughout.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                   CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global column-group index
  const int nblocks = (n + NB - 1) / NB;             // number of NB-column groups covering n
  const int myn     = min(NB, n - id * NB);          // columns this group actually owns (tail clamp)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q; the A staging area and the per-ty B tiles share the same
  // shared-memory region -- the barriers below order the two uses.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // read B (groups past the tail skip the global read but still hit the barrier)
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  }
  __syncthreads();

  // multiply and sum into C; no barrier follows, so the whole tail is guarded
  if (id < nblocks) {
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);
    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
255*db2becc9SJeremy L Thompson 
256*db2becc9SJeremy L Thompson ////////////////////////////////////////////////////////////////////////////////
2579d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
2589d15e85bSSebastian Grimberg     void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
2599d15e85bSSebastian Grimberg   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
2609d15e85bSSebastian Grimberg 
26186ad04ccSSebastian Grimberg #if BASIS_Q_COMP_INTERP == 1
2629d15e85bSSebastian Grimberg   magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
26386ad04ccSSebastian Grimberg #else
2649d15e85bSSebastian Grimberg   magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
26586ad04ccSSebastian Grimberg #endif
2669d15e85bSSebastian Grimberg }
2679d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Kernel entry point: non-tensor interpolation, "trans" direction (overwrites
// the output). Dispatches at JIT-compile time on BASIS_Q_COMP_INTERP.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
2799d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Kernel entry point: non-tensor interpolation, "trans" direction with
// accumulation (sums into the output). Dispatches at JIT-compile time on
// BASIS_Q_COMP_INTERP.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
291*db2becc9SJeremy L Thompson 
292*db2becc9SJeremy L Thompson ////////////////////////////////////////////////////////////////////////////////
2939d15e85bSSebastian Grimberg extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
2949d15e85bSSebastian Grimberg     void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
2959d15e85bSSebastian Grimberg   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
2969d15e85bSSebastian Grimberg 
29786ad04ccSSebastian Grimberg #if BASIS_Q_COMP_DERIV == 1
2989d15e85bSSebastian Grimberg   magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
29986ad04ccSSebastian Grimberg #else
3009d15e85bSSebastian Grimberg   magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
30186ad04ccSSebastian Grimberg #endif
3029d15e85bSSebastian Grimberg }
3039d15e85bSSebastian Grimberg 
////////////////////////////////////////////////////////////////////////////////
// Kernel entry point: non-tensor derivative, "trans" direction (overwrites
// the output). Dispatches at JIT-compile time on BASIS_Q_COMP_DERIV.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
315*db2becc9SJeremy L Thompson 
////////////////////////////////////////////////////////////////////////////////
// Kernel entry point: non-tensor derivative, "trans" direction with
// accumulation (sums into the output). Dispatches at JIT-compile time on
// BASIS_Q_COMP_DERIV.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
327