xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h (revision d4cc18453651bd0f94c1a2e078b2646a92dafdcc)
1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e1d4b82SJeremy L Thompson //
49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
59e1d4b82SJeremy L Thompson //
69e1d4b82SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
79e1d4b82SJeremy L Thompson 
89e1d4b82SJeremy L Thompson /// @file
99e1d4b82SJeremy L Thompson /// Internal header for HIP shared memory tensor product basis AtPoints templates
109e1d4b82SJeremy L Thompson #include <ceed/types.h>
119e1d4b82SJeremy L Thompson 
129e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
139e1d4b82SJeremy L Thompson // Chebyshev values
149e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
159e1d4b82SJeremy L Thompson template <int Q_1D>
ChebyshevPolynomialsAtPoint(const CeedScalar x,CeedScalar * chebyshev_x)169e1d4b82SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
179e1d4b82SJeremy L Thompson   chebyshev_x[0] = 1.0;
189e1d4b82SJeremy L Thompson   chebyshev_x[1] = 2 * x;
199e1d4b82SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
209e1d4b82SJeremy L Thompson }
219e1d4b82SJeremy L Thompson 
229e1d4b82SJeremy L Thompson template <int Q_1D>
ChebyshevDerivativeAtPoint(const CeedScalar x,CeedScalar * chebyshev_dx)239e1d4b82SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
249e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[3];
259e1d4b82SJeremy L Thompson 
269e1d4b82SJeremy L Thompson   chebyshev_x[1]  = 1.0;
279e1d4b82SJeremy L Thompson   chebyshev_x[2]  = 2 * x;
289e1d4b82SJeremy L Thompson   chebyshev_dx[0] = 0.0;
299e1d4b82SJeremy L Thompson   chebyshev_dx[1] = 2.0;
309e1d4b82SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) {
319e1d4b82SJeremy L Thompson     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
329e1d4b82SJeremy L Thompson     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
339e1d4b82SJeremy L Thompson   }
349e1d4b82SJeremy L Thompson }
359e1d4b82SJeremy L Thompson 
369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
379e1d4b82SJeremy L Thompson // 1D
389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
399e1d4b82SJeremy L Thompson 
409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
419e1d4b82SJeremy L Thompson // 1D interpolate to points
429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
43f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)449e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
459e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
469e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
479e1d4b82SJeremy L Thompson 
489e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
499e1d4b82SJeremy L Thompson   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
509e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
519e1d4b82SJeremy L Thompson     // Load coefficients
52d6c19ee8SJeremy L Thompson     __syncthreads();
539e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
549e1d4b82SJeremy L Thompson     __syncthreads();
559e1d4b82SJeremy L Thompson     // Contract x direction
569e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
579e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * data.slice[i];
589e1d4b82SJeremy L Thompson     }
599e1d4b82SJeremy L Thompson   }
609e1d4b82SJeremy L Thompson }
619e1d4b82SJeremy L Thompson 
629e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
639e1d4b82SJeremy L Thompson // 1D interpolate transpose
649e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
65f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)669e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
679e1d4b82SJeremy L Thompson                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
689e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
699e1d4b82SJeremy L Thompson 
709e1d4b82SJeremy L Thompson   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
719e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
729e1d4b82SJeremy L Thompson     // Clear shared memory
739e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
749e1d4b82SJeremy L Thompson     __syncthreads();
759e1d4b82SJeremy L Thompson     // Contract x direction
769e1d4b82SJeremy L Thompson     if (p < NUM_POINTS) {
779e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
78a24d84eaSJeremy L Thompson         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
799e1d4b82SJeremy L Thompson       }
809e1d4b82SJeremy L Thompson     }
819e1d4b82SJeremy L Thompson     // Pull from shared to register
829e1d4b82SJeremy L Thompson     __syncthreads();
834eda27c2SJeremy L Thompson     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
849e1d4b82SJeremy L Thompson   }
859e1d4b82SJeremy L Thompson }
869e1d4b82SJeremy L Thompson 
879e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
889e1d4b82SJeremy L Thompson // 1D derivatives at points
899e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
90f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)919e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
929e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
939e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
949e1d4b82SJeremy L Thompson 
959e1d4b82SJeremy L Thompson   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
969e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
979e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
98d6c19ee8SJeremy L Thompson     __syncthreads();
999e1d4b82SJeremy L Thompson     // Load coefficients
1009e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
1019e1d4b82SJeremy L Thompson     __syncthreads();
1029e1d4b82SJeremy L Thompson     // Contract x direction
1039e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1049e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * data.slice[i];
1059e1d4b82SJeremy L Thompson     }
1069e1d4b82SJeremy L Thompson   }
1079e1d4b82SJeremy L Thompson }
1089e1d4b82SJeremy L Thompson 
1099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1109e1d4b82SJeremy L Thompson // 1D derivatives transpose
1119e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
112f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)1139e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
1149e1d4b82SJeremy L Thompson                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
1159e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
1169e1d4b82SJeremy L Thompson 
1179e1d4b82SJeremy L Thompson   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
1189e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1199e1d4b82SJeremy L Thompson     // Clear shared memory
1209e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
1219e1d4b82SJeremy L Thompson     __syncthreads();
1229e1d4b82SJeremy L Thompson     // Contract x direction
1239e1d4b82SJeremy L Thompson     if (p < NUM_POINTS) {
1249e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
125a24d84eaSJeremy L Thompson         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
1269e1d4b82SJeremy L Thompson       }
1279e1d4b82SJeremy L Thompson     }
1289e1d4b82SJeremy L Thompson     // Pull from shared to register
1299e1d4b82SJeremy L Thompson     __syncthreads();
1304eda27c2SJeremy L Thompson     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
1319e1d4b82SJeremy L Thompson   }
1329e1d4b82SJeremy L Thompson }
1339e1d4b82SJeremy L Thompson 
1349e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1359e1d4b82SJeremy L Thompson // 2D
1369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1379e1d4b82SJeremy L Thompson 
1389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1399e1d4b82SJeremy L Thompson // 2D interpolate to points
1409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
141f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)1429e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
1439e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
1449e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
1459e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1469e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
1479e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
1489e1d4b82SJeremy L Thompson 
1499e1d4b82SJeremy L Thompson     // Load coefficients
150d6c19ee8SJeremy L Thompson     __syncthreads();
1519e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
1529e1d4b82SJeremy L Thompson     __syncthreads();
1539e1d4b82SJeremy L Thompson     // Contract x direction
1549e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
1559e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1569e1d4b82SJeremy L Thompson       buffer[i] = 0.0;
1579e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < Q_1D; j++) {
1589e1d4b82SJeremy L Thompson         buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
1599e1d4b82SJeremy L Thompson       }
1609e1d4b82SJeremy L Thompson     }
1619e1d4b82SJeremy L Thompson     // Contract y direction
1629e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
1639e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1649e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * buffer[i];
1659e1d4b82SJeremy L Thompson     }
1669e1d4b82SJeremy L Thompson   }
1679e1d4b82SJeremy L Thompson }
1689e1d4b82SJeremy L Thompson 
1699e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1709e1d4b82SJeremy L Thompson // 2D interpolate transpose
1719e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
172f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)1739e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
1749e1d4b82SJeremy L Thompson                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
1759e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1769e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
1779e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
1789e1d4b82SJeremy L Thompson 
1799e1d4b82SJeremy L Thompson     // Clear shared memory
1809e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
1819e1d4b82SJeremy L Thompson     __syncthreads();
1829e1d4b82SJeremy L Thompson     // Contract y direction
1839e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
184360be29cSJeremy L Thompson     const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
185360be29cSJeremy L Thompson 
1869e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
187360be29cSJeremy L Thompson       buffer[i] = chebyshev_x[i] * r_u;
1889e1d4b82SJeremy L Thompson     }
1899e1d4b82SJeremy L Thompson     // Contract x direction
1909e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
1919e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1929e1d4b82SJeremy L Thompson       // Note: shifting to avoid atomic adds
193c6cb50faSJeremy L Thompson       const CeedInt ii = (i + data.t_id_y) % Q_1D;
1949e1d4b82SJeremy L Thompson 
1959e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < Q_1D; j++) {
196c6cb50faSJeremy L Thompson         const CeedInt jj = (j + data.t_id_x) % Q_1D;
1979e1d4b82SJeremy L Thompson 
19849337e26SJeremy L Thompson         if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
1999e1d4b82SJeremy L Thompson       }
2009e1d4b82SJeremy L Thompson     }
2019e1d4b82SJeremy L Thompson     // Pull from shared to register
2029e1d4b82SJeremy L Thompson     __syncthreads();
2039e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
2049e1d4b82SJeremy L Thompson   }
2059e1d4b82SJeremy L Thompson }
2069e1d4b82SJeremy L Thompson 
2079e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2089e1d4b82SJeremy L Thompson // 2D derivatives at points
2099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
210f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)2119e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
2129e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
2139e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
2149e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2159e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
2169e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
2179e1d4b82SJeremy L Thompson 
2189e1d4b82SJeremy L Thompson     // Load coefficients
219d6c19ee8SJeremy L Thompson     __syncthreads();
2209e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
2219e1d4b82SJeremy L Thompson     __syncthreads();
2229e1d4b82SJeremy L Thompson     for (CeedInt dim = 0; dim < 2; dim++) {
2239e1d4b82SJeremy L Thompson       // Contract x direction
2249e1d4b82SJeremy L Thompson       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
2259e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
2269e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
2279e1d4b82SJeremy L Thompson         buffer[i] = 0.0;
2289e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
2299e1d4b82SJeremy L Thompson           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
2309e1d4b82SJeremy L Thompson         }
2319e1d4b82SJeremy L Thompson       }
2329e1d4b82SJeremy L Thompson       // Contract y direction
2339e1d4b82SJeremy L Thompson       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
2349e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
2359e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
2369e1d4b82SJeremy L Thompson         r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
2379e1d4b82SJeremy L Thompson       }
2389e1d4b82SJeremy L Thompson     }
2399e1d4b82SJeremy L Thompson   }
2409e1d4b82SJeremy L Thompson }
2419e1d4b82SJeremy L Thompson 
2429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2439e1d4b82SJeremy L Thompson // 2D derivatives transpose
2449e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
245f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)2469e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
2479e1d4b82SJeremy L Thompson                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
2489e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2499e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
2509e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
2519e1d4b82SJeremy L Thompson 
2529e1d4b82SJeremy L Thompson     // Clear shared memory
2539e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
2549e1d4b82SJeremy L Thompson     __syncthreads();
2559e1d4b82SJeremy L Thompson     for (CeedInt dim = 0; dim < 2; dim++) {
2569e1d4b82SJeremy L Thompson       // Contract y direction
2579e1d4b82SJeremy L Thompson       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
2589e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
259360be29cSJeremy L Thompson       const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
260360be29cSJeremy L Thompson 
2619e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
262360be29cSJeremy L Thompson         buffer[i] = chebyshev_x[i] * r_u;
2639e1d4b82SJeremy L Thompson       }
2649e1d4b82SJeremy L Thompson       // Contract x direction
2659e1d4b82SJeremy L Thompson       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
2669e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
2679e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
2689e1d4b82SJeremy L Thompson         // Note: shifting to avoid atomic adds
269c6cb50faSJeremy L Thompson         const CeedInt ii = (i + data.t_id_y) % Q_1D;
2709e1d4b82SJeremy L Thompson 
2719e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
272c6cb50faSJeremy L Thompson           const CeedInt jj = (j + data.t_id_x) % Q_1D;
2739e1d4b82SJeremy L Thompson 
27449337e26SJeremy L Thompson           if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
2759e1d4b82SJeremy L Thompson         }
2769e1d4b82SJeremy L Thompson       }
2779e1d4b82SJeremy L Thompson     }
2789e1d4b82SJeremy L Thompson     // Pull from shared to register
2799e1d4b82SJeremy L Thompson     __syncthreads();
2809e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
2819e1d4b82SJeremy L Thompson   }
2829e1d4b82SJeremy L Thompson }
2839e1d4b82SJeremy L Thompson 
2849e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2859e1d4b82SJeremy L Thompson // 3D
2869e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2879e1d4b82SJeremy L Thompson 
2889e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2899e1d4b82SJeremy L Thompson // 3D interpolate to points
2909e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
291f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)2929e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
2939e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
2949e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
2959e1d4b82SJeremy L Thompson   for (CeedInt k = 0; k < Q_1D; k++) {
2969e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
2979e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
2989e1d4b82SJeremy L Thompson 
299802d760aSJeremy L Thompson     // Get z contraction value
300802d760aSJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
301802d760aSJeremy L Thompson     const CeedScalar z = chebyshev_x[k];
302802d760aSJeremy L Thompson 
303802d760aSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3049e1d4b82SJeremy L Thompson       // Load coefficients
305d6c19ee8SJeremy L Thompson       __syncthreads();
3069e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
3079e1d4b82SJeremy L Thompson       __syncthreads();
3089e1d4b82SJeremy L Thompson       // Contract x direction
3099e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3109e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
3119e1d4b82SJeremy L Thompson         buffer[i] = 0.0;
3129e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
3139e1d4b82SJeremy L Thompson           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
3149e1d4b82SJeremy L Thompson         }
3159e1d4b82SJeremy L Thompson       }
3169e1d4b82SJeremy L Thompson       // Contract y and z direction
3179e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
3189e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
3199e1d4b82SJeremy L Thompson         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
3209e1d4b82SJeremy L Thompson       }
3219e1d4b82SJeremy L Thompson     }
3229e1d4b82SJeremy L Thompson   }
3239e1d4b82SJeremy L Thompson }
3249e1d4b82SJeremy L Thompson 
3259e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3269e1d4b82SJeremy L Thompson // 3D interpolate transpose
3279e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
328f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)3299e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
3309e1d4b82SJeremy L Thompson                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
3319e1d4b82SJeremy L Thompson   for (CeedInt k = 0; k < Q_1D; k++) {
3329e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
3339e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
3349e1d4b82SJeremy L Thompson 
335802d760aSJeremy L Thompson     // Get z contraction value
336802d760aSJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
337802d760aSJeremy L Thompson     const CeedScalar z = chebyshev_x[k];
338802d760aSJeremy L Thompson 
339802d760aSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3409e1d4b82SJeremy L Thompson       // Clear shared memory
3419e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
3429e1d4b82SJeremy L Thompson       __syncthreads();
3439e1d4b82SJeremy L Thompson       // Contract y and z direction
3449e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
345360be29cSJeremy L Thompson       const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
346360be29cSJeremy L Thompson 
3479e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
348360be29cSJeremy L Thompson         buffer[i] = chebyshev_x[i] * r_u * z;
3499e1d4b82SJeremy L Thompson       }
3509e1d4b82SJeremy L Thompson       // Contract x direction
3519e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3529e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
3539e1d4b82SJeremy L Thompson         // Note: shifting to avoid atomic adds
354c6cb50faSJeremy L Thompson         const CeedInt ii = (i + data.t_id_y) % Q_1D;
3559e1d4b82SJeremy L Thompson 
3569e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
357c6cb50faSJeremy L Thompson           const CeedInt jj = (j + data.t_id_x) % Q_1D;
3589e1d4b82SJeremy L Thompson 
35949337e26SJeremy L Thompson           if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
3609e1d4b82SJeremy L Thompson         }
3619e1d4b82SJeremy L Thompson       }
3629e1d4b82SJeremy L Thompson       // Pull from shared to register
3639e1d4b82SJeremy L Thompson       __syncthreads();
3649e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
3659e1d4b82SJeremy L Thompson     }
3669e1d4b82SJeremy L Thompson   }
3679e1d4b82SJeremy L Thompson }
3689e1d4b82SJeremy L Thompson 
3699e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3709e1d4b82SJeremy L Thompson // 3D derivatives at points
3719e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
372f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)3739e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
3749e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
3759e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
3769e1d4b82SJeremy L Thompson   for (CeedInt k = 0; k < Q_1D; k++) {
3779e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
3789e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
3799e1d4b82SJeremy L Thompson 
380dc7b9553SJeremy L Thompson     // Get z contraction values
381802d760aSJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
382dc7b9553SJeremy L Thompson     const CeedScalar z = chebyshev_x[k];
383802d760aSJeremy L Thompson 
384dc7b9553SJeremy L Thompson     ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
385dc7b9553SJeremy L Thompson     const CeedScalar dz = chebyshev_x[k];
386dc7b9553SJeremy L Thompson 
387dc7b9553SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3889e1d4b82SJeremy L Thompson       // Load coefficients
389d6c19ee8SJeremy L Thompson       __syncthreads();
3909e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
3919e1d4b82SJeremy L Thompson       __syncthreads();
392802d760aSJeremy L Thompson       // Gradient directions
3939e1d4b82SJeremy L Thompson       for (CeedInt dim = 0; dim < 3; dim++) {
3949e1d4b82SJeremy L Thompson         // Contract x direction
3959e1d4b82SJeremy L Thompson         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
3969e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3979e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
3989e1d4b82SJeremy L Thompson           buffer[i] = 0.0;
3999e1d4b82SJeremy L Thompson           for (CeedInt j = 0; j < Q_1D; j++) {
4009e1d4b82SJeremy L Thompson             buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
4019e1d4b82SJeremy L Thompson           }
4029e1d4b82SJeremy L Thompson         }
4039e1d4b82SJeremy L Thompson         // Contract y and z direction
4049e1d4b82SJeremy L Thompson         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
4059e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
406dc7b9553SJeremy L Thompson         const CeedScalar zz = dim == 2 ? dz : z;
407dc7b9553SJeremy L Thompson 
4089e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
409dc7b9553SJeremy L Thompson           r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
4109e1d4b82SJeremy L Thompson         }
4119e1d4b82SJeremy L Thompson       }
4129e1d4b82SJeremy L Thompson     }
4139e1d4b82SJeremy L Thompson   }
4149e1d4b82SJeremy L Thompson }
4159e1d4b82SJeremy L Thompson 
4169e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
4179e1d4b82SJeremy L Thompson // 3D derivatives transpose
4189e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
419f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)4209e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
4219e1d4b82SJeremy L Thompson                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
4229e1d4b82SJeremy L Thompson   for (CeedInt k = 0; k < Q_1D; k++) {
4239e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
4249e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
4259e1d4b82SJeremy L Thompson 
426dc7b9553SJeremy L Thompson     // Get z contraction values
427802d760aSJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
428dc7b9553SJeremy L Thompson     const CeedScalar z = chebyshev_x[k];
429802d760aSJeremy L Thompson 
430dc7b9553SJeremy L Thompson     ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
431dc7b9553SJeremy L Thompson     const CeedScalar dz = chebyshev_x[k];
432dc7b9553SJeremy L Thompson 
433dc7b9553SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4349e1d4b82SJeremy L Thompson       // Clear shared memory
4359e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
4369e1d4b82SJeremy L Thompson       __syncthreads();
437802d760aSJeremy L Thompson       // Gradient directions
4389e1d4b82SJeremy L Thompson       for (CeedInt dim = 0; dim < 3; dim++) {
4399e1d4b82SJeremy L Thompson         // Contract y and z direction
4409e1d4b82SJeremy L Thompson         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
4419e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
442dc7b9553SJeremy L Thompson         const CeedScalar zz  = dim == 2 ? dz : z;
443360be29cSJeremy L Thompson         const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
444dc7b9553SJeremy L Thompson 
4459e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
446360be29cSJeremy L Thompson           buffer[i] = chebyshev_x[i] * r_u * zz;
4479e1d4b82SJeremy L Thompson         }
4489e1d4b82SJeremy L Thompson         // Contract x direction
4499e1d4b82SJeremy L Thompson         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
4509e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
4519e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
4529e1d4b82SJeremy L Thompson           // Note: shifting to avoid atomic adds
453c6cb50faSJeremy L Thompson           const CeedInt ii = (i + data.t_id_y) % Q_1D;
4549e1d4b82SJeremy L Thompson 
4559e1d4b82SJeremy L Thompson           for (CeedInt j = 0; j < Q_1D; j++) {
456c6cb50faSJeremy L Thompson             const CeedInt jj = (j + data.t_id_x) % Q_1D;
4579e1d4b82SJeremy L Thompson 
45849337e26SJeremy L Thompson             if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
4599e1d4b82SJeremy L Thompson           }
4609e1d4b82SJeremy L Thompson         }
4619e1d4b82SJeremy L Thompson       }
4629e1d4b82SJeremy L Thompson       // Pull from shared to register
4639e1d4b82SJeremy L Thompson       __syncthreads();
4649e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
4659e1d4b82SJeremy L Thompson     }
4669e1d4b82SJeremy L Thompson   }
4679e1d4b82SJeremy L Thompson }
468