1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e1d4b82SJeremy L Thompson // 49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 59e1d4b82SJeremy L Thompson // 69e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 79e1d4b82SJeremy L Thompson 89e1d4b82SJeremy L Thompson /// @file 99e1d4b82SJeremy L Thompson /// Internal header for HIP shared memory tensor product basis AtPoints templates 109e1d4b82SJeremy L Thompson #include <ceed/types.h> 119e1d4b82SJeremy L Thompson 129e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 139e1d4b82SJeremy L Thompson // Chebyshev values 149e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 159e1d4b82SJeremy L Thompson template <int Q_1D> 169e1d4b82SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 179e1d4b82SJeremy L Thompson chebyshev_x[0] = 1.0; 189e1d4b82SJeremy L Thompson chebyshev_x[1] = 2 * x; 199e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 209e1d4b82SJeremy L Thompson } 219e1d4b82SJeremy L Thompson 229e1d4b82SJeremy L Thompson template <int Q_1D> 239e1d4b82SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 249e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[3]; 259e1d4b82SJeremy L Thompson 269e1d4b82SJeremy L Thompson chebyshev_x[1] = 1.0; 279e1d4b82SJeremy L Thompson chebyshev_x[2] = 2 * x; 289e1d4b82SJeremy L Thompson chebyshev_dx[0] = 0.0; 299e1d4b82SJeremy L Thompson chebyshev_dx[1] = 2.0; 309e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 319e1d4b82SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 329e1d4b82SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 339e1d4b82SJeremy L Thompson } 349e1d4b82SJeremy L Thompson } 359e1d4b82SJeremy L Thompson 369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 379e1d4b82SJeremy L Thompson // 1D 389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 399e1d4b82SJeremy L Thompson 409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 419e1d4b82SJeremy L Thompson // 1D interpolate to points 429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 43f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 449e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 459e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 469e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 479e1d4b82SJeremy L Thompson 489e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 499e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 509e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 519e1d4b82SJeremy L Thompson // Load coefficients 52d6c19ee8SJeremy L Thompson __syncthreads(); 539e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 549e1d4b82SJeremy L Thompson __syncthreads(); 559e1d4b82SJeremy L Thompson // Contract x direction 569e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 579e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i]; 589e1d4b82SJeremy L Thompson } 599e1d4b82SJeremy L Thompson } 609e1d4b82SJeremy L Thompson } 619e1d4b82SJeremy L Thompson 629e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 639e1d4b82SJeremy L Thompson // 1D interpolate transpose 649e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 65f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 669e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 679e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 689e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 699e1d4b82SJeremy L Thompson 709e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 719e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 729e1d4b82SJeremy L Thompson // Clear shared memory 739e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 749e1d4b82SJeremy L Thompson __syncthreads(); 759e1d4b82SJeremy L Thompson // Contract x direction 769e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 779e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 78a24d84eaSJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 799e1d4b82SJeremy L Thompson } 809e1d4b82SJeremy L Thompson } 819e1d4b82SJeremy L Thompson // Pull from shared to register 829e1d4b82SJeremy L Thompson __syncthreads(); 834eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 849e1d4b82SJeremy L Thompson } 859e1d4b82SJeremy L Thompson } 869e1d4b82SJeremy L Thompson 879e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 889e1d4b82SJeremy L Thompson // 1D derivatives at points 899e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 90f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 919e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 929e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 939e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 949e1d4b82SJeremy L Thompson 959e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 969e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 979e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 98d6c19ee8SJeremy L Thompson __syncthreads(); 999e1d4b82SJeremy L Thompson // Load coefficients 1009e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 1019e1d4b82SJeremy L Thompson __syncthreads(); 1029e1d4b82SJeremy L Thompson // Contract x direction 1039e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1049e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i]; 1059e1d4b82SJeremy L Thompson } 1069e1d4b82SJeremy L Thompson } 1079e1d4b82SJeremy L Thompson } 1089e1d4b82SJeremy L Thompson 1099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1109e1d4b82SJeremy L Thompson // 1D derivatives transpose 1119e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 112f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 1139e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 1149e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 1159e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1169e1d4b82SJeremy L Thompson 1179e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 1189e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1199e1d4b82SJeremy L Thompson // Clear shared memory 1209e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 1219e1d4b82SJeremy L Thompson __syncthreads(); 1229e1d4b82SJeremy L Thompson // Contract x direction 1239e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 1249e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 125a24d84eaSJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 1269e1d4b82SJeremy L Thompson } 1279e1d4b82SJeremy L Thompson } 1289e1d4b82SJeremy L Thompson // Pull from shared to register 1299e1d4b82SJeremy L Thompson __syncthreads(); 1304eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 1319e1d4b82SJeremy L Thompson } 1329e1d4b82SJeremy L Thompson } 1339e1d4b82SJeremy L Thompson 1349e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1359e1d4b82SJeremy L Thompson // 2D 1369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1379e1d4b82SJeremy L Thompson 1389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1399e1d4b82SJeremy L Thompson // 2D interpolate to points 1409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 141f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 1429e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 1439e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 1449e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 1459e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1469e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 1479e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1489e1d4b82SJeremy L Thompson 1499e1d4b82SJeremy L Thompson // Load coefficients 150d6c19ee8SJeremy L Thompson __syncthreads(); 1519e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 1529e1d4b82SJeremy L Thompson __syncthreads(); 1539e1d4b82SJeremy L Thompson // Contract x direction 1549e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 1559e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1569e1d4b82SJeremy L Thompson buffer[i] = 0.0; 1579e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 1589e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 1599e1d4b82SJeremy L Thompson } 1609e1d4b82SJeremy L Thompson } 1619e1d4b82SJeremy L Thompson // Contract y direction 1629e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 1639e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1649e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i]; 1659e1d4b82SJeremy L Thompson } 1669e1d4b82SJeremy L Thompson } 1679e1d4b82SJeremy L Thompson } 1689e1d4b82SJeremy L Thompson 1699e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1709e1d4b82SJeremy L Thompson // 2D interpolate transpose 1719e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 172f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 1739e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 1749e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 1759e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1769e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 1779e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1789e1d4b82SJeremy L Thompson 1799e1d4b82SJeremy L Thompson // Clear shared memory 1809e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 1819e1d4b82SJeremy L Thompson __syncthreads(); 1829e1d4b82SJeremy L Thompson // Contract y direction 1839e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 184360be29cSJeremy L Thompson const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0; 185360be29cSJeremy L Thompson 1869e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 187360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u; 1889e1d4b82SJeremy L Thompson } 1899e1d4b82SJeremy L Thompson // Contract x direction 1909e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 1919e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1929e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 193*c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D; 1949e1d4b82SJeremy L Thompson 1959e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 196*c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D; 1979e1d4b82SJeremy L Thompson 1989e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 1999e1d4b82SJeremy L Thompson } 2009e1d4b82SJeremy L Thompson } 2019e1d4b82SJeremy L Thompson // Pull from shared to register 2029e1d4b82SJeremy L Thompson __syncthreads(); 2039e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 2049e1d4b82SJeremy L Thompson } 2059e1d4b82SJeremy L Thompson } 2069e1d4b82SJeremy L Thompson 2079e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2089e1d4b82SJeremy L Thompson // 2D derivatives at points 2099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 210f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 2119e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 2129e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 2139e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; 2149e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2159e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2169e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2179e1d4b82SJeremy L Thompson 2189e1d4b82SJeremy L Thompson // Load coefficients 219d6c19ee8SJeremy L Thompson __syncthreads(); 2209e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 2219e1d4b82SJeremy L Thompson __syncthreads(); 2229e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) { 2239e1d4b82SJeremy L Thompson // Contract x direction 2249e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 2259e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 2269e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2279e1d4b82SJeremy L Thompson buffer[i] = 0.0; 2289e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 2299e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 2309e1d4b82SJeremy L Thompson } 2319e1d4b82SJeremy L Thompson } 2329e1d4b82SJeremy L Thompson // Contract y direction 2339e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 2349e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 2359e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2369e1d4b82SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i]; 2379e1d4b82SJeremy L Thompson } 2389e1d4b82SJeremy L Thompson } 2399e1d4b82SJeremy L Thompson } 2409e1d4b82SJeremy L Thompson } 2419e1d4b82SJeremy L Thompson 2429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2439e1d4b82SJeremy L Thompson // 2D derivatives transpose 2449e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 245f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 2469e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 2479e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 2489e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2499e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2509e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2519e1d4b82SJeremy L Thompson 2529e1d4b82SJeremy L Thompson // Clear shared memory 2539e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 2549e1d4b82SJeremy L Thompson __syncthreads(); 2559e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) { 2569e1d4b82SJeremy L Thompson // Contract y direction 2579e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 2589e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 259360be29cSJeremy L Thompson const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0; 260360be29cSJeremy L Thompson 2619e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 262360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u; 2639e1d4b82SJeremy L Thompson } 2649e1d4b82SJeremy L Thompson // Contract x direction 2659e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 2669e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 2679e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2689e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 269*c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D; 2709e1d4b82SJeremy L Thompson 2719e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 272*c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D; 2739e1d4b82SJeremy L Thompson 2749e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 2759e1d4b82SJeremy L Thompson } 2769e1d4b82SJeremy L Thompson } 2779e1d4b82SJeremy L Thompson } 2789e1d4b82SJeremy L Thompson // Pull from shared to register 2799e1d4b82SJeremy L Thompson __syncthreads(); 2809e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 2819e1d4b82SJeremy L Thompson } 2829e1d4b82SJeremy L Thompson } 2839e1d4b82SJeremy L Thompson 2849e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2859e1d4b82SJeremy L Thompson // 3D 2869e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2879e1d4b82SJeremy L Thompson 2889e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2899e1d4b82SJeremy L Thompson // 3D interpolate to points 2909e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 291f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 2929e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 2939e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 2949e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 2959e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 2969e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2979e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2989e1d4b82SJeremy L Thompson 299802d760aSJeremy L Thompson // Get z contraction value 300802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 301802d760aSJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 302802d760aSJeremy L Thompson 303802d760aSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 3049e1d4b82SJeremy L Thompson // Load coefficients 305d6c19ee8SJeremy L Thompson __syncthreads(); 3069e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 3079e1d4b82SJeremy L Thompson __syncthreads(); 3089e1d4b82SJeremy L Thompson // Contract x direction 3099e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3109e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3119e1d4b82SJeremy L Thompson buffer[i] = 0.0; 3129e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 3139e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 3149e1d4b82SJeremy L Thompson } 3159e1d4b82SJeremy L Thompson } 3169e1d4b82SJeremy L Thompson // Contract y and z direction 3179e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 3189e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3199e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i] * z; 3209e1d4b82SJeremy L Thompson } 3219e1d4b82SJeremy L Thompson } 3229e1d4b82SJeremy L Thompson } 3239e1d4b82SJeremy L Thompson } 3249e1d4b82SJeremy L Thompson 3259e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 3269e1d4b82SJeremy L Thompson // 3D interpolate transpose 3279e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 328f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 3299e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 3309e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 3319e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 3329e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 3339e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 3349e1d4b82SJeremy L Thompson 335802d760aSJeremy L Thompson // Get z contraction value 336802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 337802d760aSJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 338802d760aSJeremy L Thompson 339802d760aSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 3409e1d4b82SJeremy L Thompson // Clear shared memory 3419e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 3429e1d4b82SJeremy L Thompson __syncthreads(); 3439e1d4b82SJeremy L Thompson // Contract y and z direction 3449e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 345360be29cSJeremy L Thompson const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0; 346360be29cSJeremy L Thompson 3479e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 348360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u * z; 3499e1d4b82SJeremy L Thompson } 3509e1d4b82SJeremy L Thompson // Contract x direction 3519e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3529e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3539e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 354*c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D; 3559e1d4b82SJeremy L Thompson 3569e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 357*c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D; 3589e1d4b82SJeremy L Thompson 3599e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 3609e1d4b82SJeremy L Thompson } 3619e1d4b82SJeremy L Thompson } 3629e1d4b82SJeremy L Thompson // Pull from shared to register 3639e1d4b82SJeremy L Thompson __syncthreads(); 3649e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 3659e1d4b82SJeremy L Thompson } 3669e1d4b82SJeremy L Thompson } 3679e1d4b82SJeremy L Thompson } 3689e1d4b82SJeremy L Thompson 3699e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 3709e1d4b82SJeremy L Thompson // 3D derivatives at points 3719e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 372f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 3739e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 3749e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 3759e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; 3769e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 3779e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 3789e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 3799e1d4b82SJeremy L Thompson 380dc7b9553SJeremy L Thompson // Get z contraction values 381802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 382dc7b9553SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 383802d760aSJeremy L Thompson 384dc7b9553SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 385dc7b9553SJeremy L Thompson const CeedScalar dz = chebyshev_x[k]; 386dc7b9553SJeremy L Thompson 387dc7b9553SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 3889e1d4b82SJeremy L Thompson // Load coefficients 389d6c19ee8SJeremy L Thompson __syncthreads(); 3909e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 3919e1d4b82SJeremy L Thompson __syncthreads(); 392802d760aSJeremy L Thompson // Gradient directions 3939e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) { 3949e1d4b82SJeremy L Thompson // Contract x direction 3959e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 3969e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3979e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3989e1d4b82SJeremy L Thompson buffer[i] = 0.0; 3999e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 4009e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 4019e1d4b82SJeremy L Thompson } 4029e1d4b82SJeremy L Thompson } 4039e1d4b82SJeremy L Thompson // Contract y and z direction 4049e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 4059e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 406dc7b9553SJeremy L Thompson const CeedScalar zz = dim == 2 ? dz : z; 407dc7b9553SJeremy L Thompson 4089e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 409dc7b9553SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz; 4109e1d4b82SJeremy L Thompson } 4119e1d4b82SJeremy L Thompson } 4129e1d4b82SJeremy L Thompson } 4139e1d4b82SJeremy L Thompson } 4149e1d4b82SJeremy L Thompson } 4159e1d4b82SJeremy L Thompson 4169e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 4179e1d4b82SJeremy L Thompson // 3D derivatives transpose 4189e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 419f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 4209e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 4219e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 4229e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 4239e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 4249e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 4259e1d4b82SJeremy L Thompson 426dc7b9553SJeremy L Thompson // Get z contraction values 427802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 428dc7b9553SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 429802d760aSJeremy L Thompson 430dc7b9553SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 431dc7b9553SJeremy L Thompson const CeedScalar dz = chebyshev_x[k]; 432dc7b9553SJeremy L Thompson 433dc7b9553SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 4349e1d4b82SJeremy L Thompson // Clear shared memory 4359e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 4369e1d4b82SJeremy L Thompson __syncthreads(); 437802d760aSJeremy L Thompson // Gradient directions 4389e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) { 4399e1d4b82SJeremy L Thompson // Contract y and z direction 4409e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 4419e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 442dc7b9553SJeremy L Thompson const CeedScalar zz = dim == 2 ? dz : z; 443360be29cSJeremy L Thompson const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0; 444dc7b9553SJeremy L Thompson 4459e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 446360be29cSJeremy L Thompson buffer[i] = chebyshev_x[i] * r_u * zz; 4479e1d4b82SJeremy L Thompson } 4489e1d4b82SJeremy L Thompson // Contract x direction 4499e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 4509e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 4519e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4529e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 453*c6cb50faSJeremy L Thompson const CeedInt ii = (i + data.t_id_y) % Q_1D; 4549e1d4b82SJeremy L Thompson 4559e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 456*c6cb50faSJeremy L Thompson const CeedInt jj = (j + data.t_id_x) % Q_1D; 4579e1d4b82SJeremy L Thompson 4589e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 4599e1d4b82SJeremy L Thompson } 4609e1d4b82SJeremy L Thompson } 4619e1d4b82SJeremy L Thompson } 4629e1d4b82SJeremy L Thompson // Pull from shared to register 4639e1d4b82SJeremy L Thompson __syncthreads(); 4649e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 4659e1d4b82SJeremy L Thompson } 4669e1d4b82SJeremy L Thompson } 4679e1d4b82SJeremy L Thompson } 468