1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e1d4b82SJeremy L Thompson //
49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
59e1d4b82SJeremy L Thompson //
69e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed
79e1d4b82SJeremy L Thompson
89e1d4b82SJeremy L Thompson /// @file
99e1d4b82SJeremy L Thompson /// Internal header for HIP shared memory tensor product basis AtPoints templates
109e1d4b82SJeremy L Thompson #include <ceed/types.h>
119e1d4b82SJeremy L Thompson
129e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
139e1d4b82SJeremy L Thompson // Chebyshev values
149e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
159e1d4b82SJeremy L Thompson template <int Q_1D>
ChebyshevPolynomialsAtPoint(const CeedScalar x,CeedScalar * chebyshev_x)169e1d4b82SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
179e1d4b82SJeremy L Thompson chebyshev_x[0] = 1.0;
189e1d4b82SJeremy L Thompson chebyshev_x[1] = 2 * x;
199e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
209e1d4b82SJeremy L Thompson }
219e1d4b82SJeremy L Thompson
229e1d4b82SJeremy L Thompson template <int Q_1D>
ChebyshevDerivativeAtPoint(const CeedScalar x,CeedScalar * chebyshev_dx)239e1d4b82SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
249e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[3];
259e1d4b82SJeremy L Thompson
269e1d4b82SJeremy L Thompson chebyshev_x[1] = 1.0;
279e1d4b82SJeremy L Thompson chebyshev_x[2] = 2 * x;
289e1d4b82SJeremy L Thompson chebyshev_dx[0] = 0.0;
299e1d4b82SJeremy L Thompson chebyshev_dx[1] = 2.0;
309e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) {
319e1d4b82SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
329e1d4b82SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
339e1d4b82SJeremy L Thompson }
349e1d4b82SJeremy L Thompson }
359e1d4b82SJeremy L Thompson
369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
379e1d4b82SJeremy L Thompson // 1D
389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
399e1d4b82SJeremy L Thompson
409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
419e1d4b82SJeremy L Thompson // 1D interpolate to points
429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
43f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)449e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
459e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
469e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
479e1d4b82SJeremy L Thompson
489e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
499e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
509e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
519e1d4b82SJeremy L Thompson // Load coefficients
52d6c19ee8SJeremy L Thompson __syncthreads();
539e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
549e1d4b82SJeremy L Thompson __syncthreads();
559e1d4b82SJeremy L Thompson // Contract x direction
569e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
579e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i];
589e1d4b82SJeremy L Thompson }
599e1d4b82SJeremy L Thompson }
609e1d4b82SJeremy L Thompson }
619e1d4b82SJeremy L Thompson
629e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
639e1d4b82SJeremy L Thompson // 1D interpolate transpose
649e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
65f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)669e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
679e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
689e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
699e1d4b82SJeremy L Thompson
709e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
719e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
729e1d4b82SJeremy L Thompson // Clear shared memory
739e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
749e1d4b82SJeremy L Thompson __syncthreads();
759e1d4b82SJeremy L Thompson // Contract x direction
769e1d4b82SJeremy L Thompson if (p < NUM_POINTS) {
779e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
78a24d84eaSJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
799e1d4b82SJeremy L Thompson }
809e1d4b82SJeremy L Thompson }
819e1d4b82SJeremy L Thompson // Pull from shared to register
829e1d4b82SJeremy L Thompson __syncthreads();
834eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
849e1d4b82SJeremy L Thompson }
859e1d4b82SJeremy L Thompson }
869e1d4b82SJeremy L Thompson
879e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
889e1d4b82SJeremy L Thompson // 1D derivatives at points
899e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
90f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)919e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
929e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
939e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
949e1d4b82SJeremy L Thompson
959e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
969e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
979e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
98d6c19ee8SJeremy L Thompson __syncthreads();
999e1d4b82SJeremy L Thompson // Load coefficients
1009e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
1019e1d4b82SJeremy L Thompson __syncthreads();
1029e1d4b82SJeremy L Thompson // Contract x direction
1039e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
1049e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i];
1059e1d4b82SJeremy L Thompson }
1069e1d4b82SJeremy L Thompson }
1079e1d4b82SJeremy L Thompson }
1089e1d4b82SJeremy L Thompson
1099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1109e1d4b82SJeremy L Thompson // 1D derivatives transpose
1119e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
112f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)1139e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
1149e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
1159e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
1169e1d4b82SJeremy L Thompson
1179e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
1189e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1199e1d4b82SJeremy L Thompson // Clear shared memory
1209e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
1219e1d4b82SJeremy L Thompson __syncthreads();
1229e1d4b82SJeremy L Thompson // Contract x direction
1239e1d4b82SJeremy L Thompson if (p < NUM_POINTS) {
1249e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
125a24d84eaSJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
1269e1d4b82SJeremy L Thompson }
1279e1d4b82SJeremy L Thompson }
1289e1d4b82SJeremy L Thompson // Pull from shared to register
1299e1d4b82SJeremy L Thompson __syncthreads();
1304eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
1319e1d4b82SJeremy L Thompson }
1329e1d4b82SJeremy L Thompson }
1339e1d4b82SJeremy L Thompson
1349e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1359e1d4b82SJeremy L Thompson // 2D
1369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1379e1d4b82SJeremy L Thompson
1389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1399e1d4b82SJeremy L Thompson // 2D interpolate to points
1409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
141f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)1429e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
1439e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
1449e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
1459e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1469e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
1479e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
1489e1d4b82SJeremy L Thompson
1499e1d4b82SJeremy L Thompson // Load coefficients
150d6c19ee8SJeremy L Thompson __syncthreads();
1519e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
1529e1d4b82SJeremy L Thompson __syncthreads();
1539e1d4b82SJeremy L Thompson // Contract x direction
1549e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
1559e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
1569e1d4b82SJeremy L Thompson buffer[i] = 0.0;
1579e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
1589e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
1599e1d4b82SJeremy L Thompson }
1609e1d4b82SJeremy L Thompson }
1619e1d4b82SJeremy L Thompson // Contract y direction
1629e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
1639e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
1649e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i];
1659e1d4b82SJeremy L Thompson }
1669e1d4b82SJeremy L Thompson }
1679e1d4b82SJeremy L Thompson }
1689e1d4b82SJeremy L Thompson
1699e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1709e1d4b82SJeremy L Thompson // 2D interpolate transpose
1719e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
172f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)1739e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
1749e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
1759e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1769e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
1779e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
1789e1d4b82SJeremy L Thompson
1799e1d4b82SJeremy L Thompson // Clear shared memory
1809e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
1819e1d4b82SJeremy L Thompson __syncthreads();
1829e1d4b82SJeremy L Thompson // Contract y direction
1839e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
184360be29cSJeremy L Thompson const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
185360be29cSJeremy L Thompson
1869e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
187360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u;
1889e1d4b82SJeremy L Thompson }
1899e1d4b82SJeremy L Thompson // Contract x direction
1909e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
1919e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
1929e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds
193c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D;
1949e1d4b82SJeremy L Thompson
1959e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
196c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D;
1979e1d4b82SJeremy L Thompson
19849337e26SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
1999e1d4b82SJeremy L Thompson }
2009e1d4b82SJeremy L Thompson }
2019e1d4b82SJeremy L Thompson // Pull from shared to register
2029e1d4b82SJeremy L Thompson __syncthreads();
2039e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
2049e1d4b82SJeremy L Thompson }
2059e1d4b82SJeremy L Thompson }
2069e1d4b82SJeremy L Thompson
2079e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2089e1d4b82SJeremy L Thompson // 2D derivatives at points
2099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
210f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)2119e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
2129e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
2139e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
2149e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2159e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
2169e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
2179e1d4b82SJeremy L Thompson
2189e1d4b82SJeremy L Thompson // Load coefficients
219d6c19ee8SJeremy L Thompson __syncthreads();
2209e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
2219e1d4b82SJeremy L Thompson __syncthreads();
2229e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) {
2239e1d4b82SJeremy L Thompson // Contract x direction
2249e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
2259e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
2269e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
2279e1d4b82SJeremy L Thompson buffer[i] = 0.0;
2289e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
2299e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
2309e1d4b82SJeremy L Thompson }
2319e1d4b82SJeremy L Thompson }
2329e1d4b82SJeremy L Thompson // Contract y direction
2339e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
2349e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
2359e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
2369e1d4b82SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
2379e1d4b82SJeremy L Thompson }
2389e1d4b82SJeremy L Thompson }
2399e1d4b82SJeremy L Thompson }
2409e1d4b82SJeremy L Thompson }
2419e1d4b82SJeremy L Thompson
2429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2439e1d4b82SJeremy L Thompson // 2D derivatives transpose
2449e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
245f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)2469e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
2479e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
2489e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2499e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
2509e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
2519e1d4b82SJeremy L Thompson
2529e1d4b82SJeremy L Thompson // Clear shared memory
2539e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
2549e1d4b82SJeremy L Thompson __syncthreads();
2559e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) {
2569e1d4b82SJeremy L Thompson // Contract y direction
2579e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
2589e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
259360be29cSJeremy L Thompson const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
260360be29cSJeremy L Thompson
2619e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
262360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u;
2639e1d4b82SJeremy L Thompson }
2649e1d4b82SJeremy L Thompson // Contract x direction
2659e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
2669e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
2679e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
2689e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds
269c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D;
2709e1d4b82SJeremy L Thompson
2719e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
272c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D;
2739e1d4b82SJeremy L Thompson
27449337e26SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
2759e1d4b82SJeremy L Thompson }
2769e1d4b82SJeremy L Thompson }
2779e1d4b82SJeremy L Thompson }
2789e1d4b82SJeremy L Thompson // Pull from shared to register
2799e1d4b82SJeremy L Thompson __syncthreads();
2809e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
2819e1d4b82SJeremy L Thompson }
2829e1d4b82SJeremy L Thompson }
2839e1d4b82SJeremy L Thompson
2849e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2859e1d4b82SJeremy L Thompson // 3D
2869e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2879e1d4b82SJeremy L Thompson
2889e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2899e1d4b82SJeremy L Thompson // 3D interpolate to points
2909e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
291f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)2929e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
2939e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
2949e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
2959e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) {
2969e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
2979e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
2989e1d4b82SJeremy L Thompson
299802d760aSJeremy L Thompson // Get z contraction value
300802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
301802d760aSJeremy L Thompson const CeedScalar z = chebyshev_x[k];
302802d760aSJeremy L Thompson
303802d760aSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3049e1d4b82SJeremy L Thompson // Load coefficients
305d6c19ee8SJeremy L Thompson __syncthreads();
3069e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
3079e1d4b82SJeremy L Thompson __syncthreads();
3089e1d4b82SJeremy L Thompson // Contract x direction
3099e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3109e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3119e1d4b82SJeremy L Thompson buffer[i] = 0.0;
3129e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
3139e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
3149e1d4b82SJeremy L Thompson }
3159e1d4b82SJeremy L Thompson }
3169e1d4b82SJeremy L Thompson // Contract y and z direction
3179e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
3189e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3199e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i] * z;
3209e1d4b82SJeremy L Thompson }
3219e1d4b82SJeremy L Thompson }
3229e1d4b82SJeremy L Thompson }
3239e1d4b82SJeremy L Thompson }
3249e1d4b82SJeremy L Thompson
3259e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3269e1d4b82SJeremy L Thompson // 3D interpolate transpose
3279e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
328f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)3299e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
3309e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
3319e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) {
3329e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
3339e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
3349e1d4b82SJeremy L Thompson
335802d760aSJeremy L Thompson // Get z contraction value
336802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
337802d760aSJeremy L Thompson const CeedScalar z = chebyshev_x[k];
338802d760aSJeremy L Thompson
339802d760aSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3409e1d4b82SJeremy L Thompson // Clear shared memory
3419e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
3429e1d4b82SJeremy L Thompson __syncthreads();
3439e1d4b82SJeremy L Thompson // Contract y and z direction
3449e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
345360be29cSJeremy L Thompson const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
346360be29cSJeremy L Thompson
3479e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
348360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u * z;
3499e1d4b82SJeremy L Thompson }
3509e1d4b82SJeremy L Thompson // Contract x direction
3519e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3529e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3539e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds
354c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D;
3559e1d4b82SJeremy L Thompson
3569e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
357c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D;
3589e1d4b82SJeremy L Thompson
35949337e26SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
3609e1d4b82SJeremy L Thompson }
3619e1d4b82SJeremy L Thompson }
3629e1d4b82SJeremy L Thompson // Pull from shared to register
3639e1d4b82SJeremy L Thompson __syncthreads();
3649e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
3659e1d4b82SJeremy L Thompson }
3669e1d4b82SJeremy L Thompson }
3679e1d4b82SJeremy L Thompson }
3689e1d4b82SJeremy L Thompson
3699e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3709e1d4b82SJeremy L Thompson // 3D derivatives at points
3719e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
372f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)3739e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
3749e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
3759e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
3769e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) {
3779e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
3789e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
3799e1d4b82SJeremy L Thompson
380dc7b9553SJeremy L Thompson // Get z contraction values
381802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
382dc7b9553SJeremy L Thompson const CeedScalar z = chebyshev_x[k];
383802d760aSJeremy L Thompson
384dc7b9553SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
385dc7b9553SJeremy L Thompson const CeedScalar dz = chebyshev_x[k];
386dc7b9553SJeremy L Thompson
387dc7b9553SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3889e1d4b82SJeremy L Thompson // Load coefficients
389d6c19ee8SJeremy L Thompson __syncthreads();
3909e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
3919e1d4b82SJeremy L Thompson __syncthreads();
392802d760aSJeremy L Thompson // Gradient directions
3939e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) {
3949e1d4b82SJeremy L Thompson // Contract x direction
3959e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
3969e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3979e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3989e1d4b82SJeremy L Thompson buffer[i] = 0.0;
3999e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
4009e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
4019e1d4b82SJeremy L Thompson }
4029e1d4b82SJeremy L Thompson }
4039e1d4b82SJeremy L Thompson // Contract y and z direction
4049e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
4059e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
406dc7b9553SJeremy L Thompson const CeedScalar zz = dim == 2 ? dz : z;
407dc7b9553SJeremy L Thompson
4089e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
409dc7b9553SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
4109e1d4b82SJeremy L Thompson }
4119e1d4b82SJeremy L Thompson }
4129e1d4b82SJeremy L Thompson }
4139e1d4b82SJeremy L Thompson }
4149e1d4b82SJeremy L Thompson }
4159e1d4b82SJeremy L Thompson
4169e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
4179e1d4b82SJeremy L Thompson // 3D derivatives transpose
4189e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
419f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)4209e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
4219e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
4229e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) {
4239e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
4249e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
4259e1d4b82SJeremy L Thompson
426dc7b9553SJeremy L Thompson // Get z contraction values
427802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
428dc7b9553SJeremy L Thompson const CeedScalar z = chebyshev_x[k];
429802d760aSJeremy L Thompson
430dc7b9553SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
431dc7b9553SJeremy L Thompson const CeedScalar dz = chebyshev_x[k];
432dc7b9553SJeremy L Thompson
433dc7b9553SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4349e1d4b82SJeremy L Thompson // Clear shared memory
4359e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
4369e1d4b82SJeremy L Thompson __syncthreads();
437802d760aSJeremy L Thompson // Gradient directions
4389e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) {
4399e1d4b82SJeremy L Thompson // Contract y and z direction
4409e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
4419e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
442dc7b9553SJeremy L Thompson const CeedScalar zz = dim == 2 ? dz : z;
443360be29cSJeremy L Thompson const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
444dc7b9553SJeremy L Thompson
4459e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
446360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u * zz;
4479e1d4b82SJeremy L Thompson }
4489e1d4b82SJeremy L Thompson // Contract x direction
4499e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
4509e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
4519e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
4529e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds
453c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D;
4549e1d4b82SJeremy L Thompson
4559e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
456c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D;
4579e1d4b82SJeremy L Thompson
45849337e26SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
4599e1d4b82SJeremy L Thompson }
4609e1d4b82SJeremy L Thompson }
4619e1d4b82SJeremy L Thompson }
4629e1d4b82SJeremy L Thompson // Pull from shared to register
4639e1d4b82SJeremy L Thompson __syncthreads();
4649e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
4659e1d4b82SJeremy L Thompson }
4669e1d4b82SJeremy L Thompson }
4679e1d4b82SJeremy L Thompson }
468