1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e1d4b82SJeremy L Thompson //
49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
59e1d4b82SJeremy L Thompson //
69e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed
79e1d4b82SJeremy L Thompson
89e1d4b82SJeremy L Thompson /// @file
99e1d4b82SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis AtPoints templates
109e1d4b82SJeremy L Thompson #include <ceed/types.h>
119e1d4b82SJeremy L Thompson
129e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
139e1d4b82SJeremy L Thompson // Chebyshev values
149e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
159e1d4b82SJeremy L Thompson template <int Q_1D>
ChebyshevPolynomialsAtPoint(const CeedScalar x,CeedScalar * chebyshev_x)169e1d4b82SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
179e1d4b82SJeremy L Thompson chebyshev_x[0] = 1.0;
189e1d4b82SJeremy L Thompson chebyshev_x[1] = 2 * x;
199e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
209e1d4b82SJeremy L Thompson }
219e1d4b82SJeremy L Thompson
229e1d4b82SJeremy L Thompson template <int Q_1D>
ChebyshevDerivativeAtPoint(const CeedScalar x,CeedScalar * chebyshev_dx)239e1d4b82SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
249e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[3];
259e1d4b82SJeremy L Thompson
269e1d4b82SJeremy L Thompson chebyshev_x[1] = 1.0;
279e1d4b82SJeremy L Thompson chebyshev_x[2] = 2 * x;
289e1d4b82SJeremy L Thompson chebyshev_dx[0] = 0.0;
299e1d4b82SJeremy L Thompson chebyshev_dx[1] = 2.0;
309e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) {
319e1d4b82SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
329e1d4b82SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
339e1d4b82SJeremy L Thompson }
349e1d4b82SJeremy L Thompson }
359e1d4b82SJeremy L Thompson
369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
379e1d4b82SJeremy L Thompson // 1D
389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
399e1d4b82SJeremy L Thompson
409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
419e1d4b82SJeremy L Thompson // 1D interpolate to points
429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
43f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints1d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)449e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
459e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
469e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
479e1d4b82SJeremy L Thompson
489e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
499e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
509e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
519e1d4b82SJeremy L Thompson // Load coefficients
529e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
539e1d4b82SJeremy L Thompson __syncthreads();
549e1d4b82SJeremy L Thompson // Contract x direction
559e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
569e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i];
579e1d4b82SJeremy L Thompson }
589e1d4b82SJeremy L Thompson }
599e1d4b82SJeremy L Thompson }
609e1d4b82SJeremy L Thompson
619e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
629e1d4b82SJeremy L Thompson // 1D interpolate transpose
639e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
64f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints1d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)659e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
669e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) {
679e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
689e1d4b82SJeremy L Thompson
699e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
709e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
719e1d4b82SJeremy L Thompson // Clear shared memory
729e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
739e1d4b82SJeremy L Thompson __syncthreads();
749e1d4b82SJeremy L Thompson // Contract x direction
759e1d4b82SJeremy L Thompson if (p < NUM_POINTS) {
769e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
77dc7b9553SJeremy L Thompson atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
789e1d4b82SJeremy L Thompson }
799e1d4b82SJeremy L Thompson }
809e1d4b82SJeremy L Thompson // Pull from shared to register
819e1d4b82SJeremy L Thompson __syncthreads();
824eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
839e1d4b82SJeremy L Thompson }
849e1d4b82SJeremy L Thompson }
859e1d4b82SJeremy L Thompson
869e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
879e1d4b82SJeremy L Thompson // 1D derivatives at points
889e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
89f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints1d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)909e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
919e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
929e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
939e1d4b82SJeremy L Thompson
949e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
959e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
969e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
979e1d4b82SJeremy L Thompson // Load coefficients
98d6c19ee8SJeremy L Thompson __syncthreads();
999e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
1009e1d4b82SJeremy L Thompson __syncthreads();
1019e1d4b82SJeremy L Thompson // Contract x direction
1029e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
1039e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i];
1049e1d4b82SJeremy L Thompson }
1059e1d4b82SJeremy L Thompson }
1069e1d4b82SJeremy L Thompson }
1079e1d4b82SJeremy L Thompson
1089e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1099e1d4b82SJeremy L Thompson // 1D derivatives transpose
1109e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
111f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints1d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)1129e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
1139e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) {
1149e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
1159e1d4b82SJeremy L Thompson
1169e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
1179e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1189e1d4b82SJeremy L Thompson // Clear shared memory
1199e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
1209e1d4b82SJeremy L Thompson __syncthreads();
1219e1d4b82SJeremy L Thompson // Contract x direction
1229e1d4b82SJeremy L Thompson if (p < NUM_POINTS) {
1239e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
124dc7b9553SJeremy L Thompson atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
1259e1d4b82SJeremy L Thompson }
1269e1d4b82SJeremy L Thompson }
1279e1d4b82SJeremy L Thompson // Pull from shared to register
1289e1d4b82SJeremy L Thompson __syncthreads();
1294eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
1309e1d4b82SJeremy L Thompson }
1319e1d4b82SJeremy L Thompson }
1329e1d4b82SJeremy L Thompson
1339e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1349e1d4b82SJeremy L Thompson // 2D
1359e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1369e1d4b82SJeremy L Thompson
1379e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1389e1d4b82SJeremy L Thompson // 2D interpolate to points
1399e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
140f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints2d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)1419e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
1429e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
1439e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
1449e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1459e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
1469e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
1479e1d4b82SJeremy L Thompson
1489e1d4b82SJeremy L Thompson // Load coefficients
149d6c19ee8SJeremy L Thompson __syncthreads();
1509e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
1519e1d4b82SJeremy L Thompson __syncthreads();
1529e1d4b82SJeremy L Thompson // Contract x direction
1539e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
1549e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
1559e1d4b82SJeremy L Thompson buffer[i] = 0.0;
1569e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
1579e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
1589e1d4b82SJeremy L Thompson }
1599e1d4b82SJeremy L Thompson }
1609e1d4b82SJeremy L Thompson // Contract y direction
1619e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
1629e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
1639e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i];
1649e1d4b82SJeremy L Thompson }
1659e1d4b82SJeremy L Thompson }
1669e1d4b82SJeremy L Thompson }
1679e1d4b82SJeremy L Thompson
1689e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1699e1d4b82SJeremy L Thompson // 2D interpolate transpose
1709e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
171f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints2d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)1729e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
1739e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) {
1749e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1759e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
1769e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
1779e1d4b82SJeremy L Thompson
1789e1d4b82SJeremy L Thompson // Clear shared memory
1799e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
1809e1d4b82SJeremy L Thompson __syncthreads();
1819e1d4b82SJeremy L Thompson // Contract y direction
1829e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
183360be29cSJeremy L Thompson const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
184360be29cSJeremy L Thompson
1859e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
186360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u;
1879e1d4b82SJeremy L Thompson }
1889e1d4b82SJeremy L Thompson // Contract x direction
1899e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
1909e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
1919e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds
192c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D;
1939e1d4b82SJeremy L Thompson
1949e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
195c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D;
1969e1d4b82SJeremy L Thompson
19749337e26SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
1989e1d4b82SJeremy L Thompson }
1999e1d4b82SJeremy L Thompson }
2009e1d4b82SJeremy L Thompson // Pull from shared to register
2019e1d4b82SJeremy L Thompson __syncthreads();
2029e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
2039e1d4b82SJeremy L Thompson }
2049e1d4b82SJeremy L Thompson }
2059e1d4b82SJeremy L Thompson
2069e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2079e1d4b82SJeremy L Thompson // 2D derivatives at points
2089e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
209f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints2d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)2109e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
2119e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
2129e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
2139e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2149e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
2159e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
2169e1d4b82SJeremy L Thompson
2179e1d4b82SJeremy L Thompson // Load coefficients
218d6c19ee8SJeremy L Thompson __syncthreads();
2199e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
2209e1d4b82SJeremy L Thompson __syncthreads();
2219e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) {
2229e1d4b82SJeremy L Thompson // Contract x direction
2239e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
2249e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
2259e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
2269e1d4b82SJeremy L Thompson buffer[i] = 0.0;
2279e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
2289e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
2299e1d4b82SJeremy L Thompson }
2309e1d4b82SJeremy L Thompson }
2319e1d4b82SJeremy L Thompson // Contract y direction
2329e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
2339e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
2349e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
2359e1d4b82SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
2369e1d4b82SJeremy L Thompson }
2379e1d4b82SJeremy L Thompson }
2389e1d4b82SJeremy L Thompson }
2399e1d4b82SJeremy L Thompson }
2409e1d4b82SJeremy L Thompson
2419e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2429e1d4b82SJeremy L Thompson // 2D derivatives transpose
2439e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
244f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints2d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)2459e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
2469e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) {
2479e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2489e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
2499e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
2509e1d4b82SJeremy L Thompson
2519e1d4b82SJeremy L Thompson // Clear shared memory
2529e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
2539e1d4b82SJeremy L Thompson __syncthreads();
2549e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) {
2559e1d4b82SJeremy L Thompson // Contract y direction
2569e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
2579e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
258360be29cSJeremy L Thompson const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
259360be29cSJeremy L Thompson
2609e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
261360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u;
2629e1d4b82SJeremy L Thompson }
2639e1d4b82SJeremy L Thompson // Contract x direction
2649e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
2659e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
2669e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
2679e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds
268c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D;
2699e1d4b82SJeremy L Thompson
2709e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
271c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D;
2729e1d4b82SJeremy L Thompson
27349337e26SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
2749e1d4b82SJeremy L Thompson }
2759e1d4b82SJeremy L Thompson }
2769e1d4b82SJeremy L Thompson }
2779e1d4b82SJeremy L Thompson // Pull from shared to register
2789e1d4b82SJeremy L Thompson __syncthreads();
2799e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
2809e1d4b82SJeremy L Thompson }
2819e1d4b82SJeremy L Thompson }
2829e1d4b82SJeremy L Thompson
2839e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2849e1d4b82SJeremy L Thompson // 3D
2859e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2869e1d4b82SJeremy L Thompson
2879e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2889e1d4b82SJeremy L Thompson // 3D interpolate to points
2899e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
290f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints3d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)2919e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
2929e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
2939e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
2949e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) {
2959e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
2969e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
2979e1d4b82SJeremy L Thompson
298802d760aSJeremy L Thompson // Get z contraction value
299802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
300802d760aSJeremy L Thompson const CeedScalar z = chebyshev_x[k];
301802d760aSJeremy L Thompson
302802d760aSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3039e1d4b82SJeremy L Thompson // Load coefficients
304d6c19ee8SJeremy L Thompson __syncthreads();
3059e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
3069e1d4b82SJeremy L Thompson __syncthreads();
3079e1d4b82SJeremy L Thompson // Contract x direction
3089e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3099e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3109e1d4b82SJeremy L Thompson buffer[i] = 0.0;
3119e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
3129e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
3139e1d4b82SJeremy L Thompson }
3149e1d4b82SJeremy L Thompson }
3159e1d4b82SJeremy L Thompson // Contract y and z direction
3169e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
3179e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3189e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i] * z;
3199e1d4b82SJeremy L Thompson }
3209e1d4b82SJeremy L Thompson }
3219e1d4b82SJeremy L Thompson }
3229e1d4b82SJeremy L Thompson }
3239e1d4b82SJeremy L Thompson
3249e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3259e1d4b82SJeremy L Thompson // 3D interpolate transpose
3269e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
327f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints3d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)3289e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
3299e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) {
3309e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) {
3319e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
3329e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
3339e1d4b82SJeremy L Thompson
334802d760aSJeremy L Thompson // Get z contraction value
335802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
336802d760aSJeremy L Thompson const CeedScalar z = chebyshev_x[k];
337802d760aSJeremy L Thompson
338802d760aSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3399e1d4b82SJeremy L Thompson // Clear shared memory
3409e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
3419e1d4b82SJeremy L Thompson __syncthreads();
3429e1d4b82SJeremy L Thompson // Contract y and z direction
3439e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
344360be29cSJeremy L Thompson const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
345360be29cSJeremy L Thompson
3469e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
347360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u * z;
3489e1d4b82SJeremy L Thompson }
3499e1d4b82SJeremy L Thompson // Contract x direction
3509e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3519e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3529e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds
353c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D;
3549e1d4b82SJeremy L Thompson
3559e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
356c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D;
3579e1d4b82SJeremy L Thompson
35849337e26SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
3599e1d4b82SJeremy L Thompson }
3609e1d4b82SJeremy L Thompson }
3619e1d4b82SJeremy L Thompson // Pull from shared to register
3629e1d4b82SJeremy L Thompson __syncthreads();
3639e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
3649e1d4b82SJeremy L Thompson }
3659e1d4b82SJeremy L Thompson }
3669e1d4b82SJeremy L Thompson }
3679e1d4b82SJeremy L Thompson
3689e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3699e1d4b82SJeremy L Thompson // 3D derivatives at points
3709e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
371f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints3d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)3729e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
3739e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) {
3749e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
3759e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) {
3769e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
3779e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
3789e1d4b82SJeremy L Thompson
379dc7b9553SJeremy L Thompson // Get z contraction values
380802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
381dc7b9553SJeremy L Thompson const CeedScalar z = chebyshev_x[k];
382802d760aSJeremy L Thompson
383dc7b9553SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
384dc7b9553SJeremy L Thompson const CeedScalar dz = chebyshev_x[k];
385dc7b9553SJeremy L Thompson
386dc7b9553SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3879e1d4b82SJeremy L Thompson // Load coefficients
388d6c19ee8SJeremy L Thompson __syncthreads();
3899e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
3909e1d4b82SJeremy L Thompson __syncthreads();
391802d760aSJeremy L Thompson // Gradient directions
3929e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) {
3939e1d4b82SJeremy L Thompson // Contract x direction
3949e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
3959e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3969e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3979e1d4b82SJeremy L Thompson buffer[i] = 0.0;
3989e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
3999e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
4009e1d4b82SJeremy L Thompson }
4019e1d4b82SJeremy L Thompson }
4029e1d4b82SJeremy L Thompson // Contract y and z direction
4039e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
4049e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
405dc7b9553SJeremy L Thompson const CeedScalar zz = dim == 2 ? dz : z;
406dc7b9553SJeremy L Thompson
4079e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
408dc7b9553SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
4099e1d4b82SJeremy L Thompson }
4109e1d4b82SJeremy L Thompson }
4119e1d4b82SJeremy L Thompson }
4129e1d4b82SJeremy L Thompson }
4139e1d4b82SJeremy L Thompson }
4149e1d4b82SJeremy L Thompson
4159e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
4169e1d4b82SJeremy L Thompson // 3D derivatives transpose
4179e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
418f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints3d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)4199e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
4209e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) {
4219e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) {
4229e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D];
4239e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D];
4249e1d4b82SJeremy L Thompson
425dc7b9553SJeremy L Thompson // Get z contraction values
426802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
427dc7b9553SJeremy L Thompson const CeedScalar z = chebyshev_x[k];
428802d760aSJeremy L Thompson
429dc7b9553SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
430dc7b9553SJeremy L Thompson const CeedScalar dz = chebyshev_x[k];
431dc7b9553SJeremy L Thompson
432dc7b9553SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4339e1d4b82SJeremy L Thompson // Clear shared memory
4349e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
4359e1d4b82SJeremy L Thompson __syncthreads();
436802d760aSJeremy L Thompson // Gradient directions
4379e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) {
4389e1d4b82SJeremy L Thompson // Contract y and z direction
4399e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
4409e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
441dc7b9553SJeremy L Thompson const CeedScalar zz = dim == 2 ? dz : z;
44249337e26SJeremy L Thompson const CeedScalar r_u = (p < NUM_POINTS) ? r_U[comp + dim * NUM_COMP] : 0.0;
443dc7b9553SJeremy L Thompson
4449e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
445360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u * zz;
4469e1d4b82SJeremy L Thompson }
44749337e26SJeremy L Thompson
4489e1d4b82SJeremy L Thompson // Contract x direction
4499e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
4509e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
4519e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
4529e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds
453c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D;
4549e1d4b82SJeremy L Thompson
4559e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) {
456c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D;
4579e1d4b82SJeremy L Thompson
45849337e26SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
4599e1d4b82SJeremy L Thompson }
4609e1d4b82SJeremy L Thompson }
4619e1d4b82SJeremy L Thompson }
4629e1d4b82SJeremy L Thompson // Pull from shared to register
4639e1d4b82SJeremy L Thompson __syncthreads();
4649e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
4659e1d4b82SJeremy L Thompson }
4669e1d4b82SJeremy L Thompson }
4679e1d4b82SJeremy L Thompson }
468