1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e1d4b82SJeremy L Thompson // 49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 59e1d4b82SJeremy L Thompson // 69e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 79e1d4b82SJeremy L Thompson 89e1d4b82SJeremy L Thompson /// @file 99e1d4b82SJeremy L Thompson /// Internal header for HIP shared memory tensor product basis AtPoints templates 109e1d4b82SJeremy L Thompson #include <ceed/types.h> 119e1d4b82SJeremy L Thompson 129e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 139e1d4b82SJeremy L Thompson // Chebyshev values 149e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 159e1d4b82SJeremy L Thompson template <int Q_1D> 169e1d4b82SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 179e1d4b82SJeremy L Thompson chebyshev_x[0] = 1.0; 189e1d4b82SJeremy L Thompson chebyshev_x[1] = 2 * x; 199e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 209e1d4b82SJeremy L Thompson } 219e1d4b82SJeremy L Thompson 229e1d4b82SJeremy L Thompson template <int Q_1D> 239e1d4b82SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 249e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[3]; 259e1d4b82SJeremy L Thompson 269e1d4b82SJeremy L Thompson chebyshev_x[1] = 1.0; 279e1d4b82SJeremy L Thompson chebyshev_x[2] = 2 * x; 289e1d4b82SJeremy L Thompson chebyshev_dx[0] = 0.0; 299e1d4b82SJeremy L Thompson chebyshev_dx[1] = 2.0; 309e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 319e1d4b82SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 329e1d4b82SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 339e1d4b82SJeremy L Thompson } 349e1d4b82SJeremy L Thompson } 359e1d4b82SJeremy L Thompson 369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 379e1d4b82SJeremy L Thompson // 1D 389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 399e1d4b82SJeremy L Thompson 409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 419e1d4b82SJeremy L Thompson // 1D interpolate to points 429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 43f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 449e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 459e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 469e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 479e1d4b82SJeremy L Thompson 489e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 499e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 509e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 519e1d4b82SJeremy L Thompson // Load coefficients 52*d6c19ee8SJeremy L Thompson __syncthreads(); 539e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 549e1d4b82SJeremy L Thompson __syncthreads(); 559e1d4b82SJeremy L Thompson // Contract x direction 569e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 579e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i]; 589e1d4b82SJeremy L Thompson } 599e1d4b82SJeremy L Thompson } 609e1d4b82SJeremy L Thompson } 619e1d4b82SJeremy L Thompson 629e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 639e1d4b82SJeremy L Thompson // 1D interpolate transpose 649e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 65f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 669e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 679e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 689e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 699e1d4b82SJeremy L Thompson 709e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 719e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 729e1d4b82SJeremy L Thompson // Clear shared memory 739e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 749e1d4b82SJeremy L Thompson __syncthreads(); 759e1d4b82SJeremy L Thompson // Contract x direction 769e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 779e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 78a24d84eaSJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 799e1d4b82SJeremy L Thompson } 809e1d4b82SJeremy L Thompson } 819e1d4b82SJeremy L Thompson // Pull from shared to register 829e1d4b82SJeremy L Thompson __syncthreads(); 834eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 849e1d4b82SJeremy L Thompson } 859e1d4b82SJeremy L Thompson } 869e1d4b82SJeremy L Thompson 879e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 889e1d4b82SJeremy L Thompson // 1D derivatives at points 899e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 90f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 919e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 929e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 939e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 949e1d4b82SJeremy L Thompson 959e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 969e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 979e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 98*d6c19ee8SJeremy L Thompson __syncthreads(); 999e1d4b82SJeremy L Thompson // Load coefficients 1009e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 1019e1d4b82SJeremy L Thompson __syncthreads(); 1029e1d4b82SJeremy L Thompson // Contract x direction 1039e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1049e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i]; 1059e1d4b82SJeremy L Thompson } 1069e1d4b82SJeremy L Thompson } 1079e1d4b82SJeremy L Thompson } 1089e1d4b82SJeremy L Thompson 1099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1109e1d4b82SJeremy L Thompson // 1D derivatives transpose 1119e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 112f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 1139e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 1149e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 1159e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1169e1d4b82SJeremy L Thompson 1179e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 1189e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1199e1d4b82SJeremy L Thompson // Clear shared memory 1209e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 1219e1d4b82SJeremy L Thompson __syncthreads(); 1229e1d4b82SJeremy L Thompson // Contract x direction 1239e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 1249e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 125a24d84eaSJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 1269e1d4b82SJeremy L Thompson } 1279e1d4b82SJeremy L Thompson } 1289e1d4b82SJeremy L Thompson // Pull from shared to register 1299e1d4b82SJeremy L Thompson __syncthreads(); 1304eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 1319e1d4b82SJeremy L Thompson } 1329e1d4b82SJeremy L Thompson } 1339e1d4b82SJeremy L Thompson 1349e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1359e1d4b82SJeremy L Thompson // 2D 1369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1379e1d4b82SJeremy L Thompson 1389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1399e1d4b82SJeremy L Thompson // 2D interpolate to points 1409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 141f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 1429e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 1439e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 1449e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 1459e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1469e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 1479e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1489e1d4b82SJeremy L Thompson 1499e1d4b82SJeremy L Thompson // Load coefficients 150*d6c19ee8SJeremy L Thompson __syncthreads(); 1519e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 1529e1d4b82SJeremy L Thompson __syncthreads(); 1539e1d4b82SJeremy L Thompson // Contract x direction 1549e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 1559e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1569e1d4b82SJeremy L Thompson buffer[i] = 0.0; 1579e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 1589e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 1599e1d4b82SJeremy L Thompson } 1609e1d4b82SJeremy L Thompson } 1619e1d4b82SJeremy L Thompson // Contract y direction 1629e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 1639e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1649e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i]; 1659e1d4b82SJeremy L Thompson } 1669e1d4b82SJeremy L Thompson } 1679e1d4b82SJeremy L Thompson } 1689e1d4b82SJeremy L Thompson 1699e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1709e1d4b82SJeremy L Thompson // 2D interpolate transpose 1719e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 172f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 1739e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 1749e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 1759e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1769e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 1779e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1789e1d4b82SJeremy L Thompson 1799e1d4b82SJeremy L Thompson // Clear shared memory 1809e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 1819e1d4b82SJeremy L Thompson __syncthreads(); 1829e1d4b82SJeremy L Thompson // Contract y direction 1839e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 1849e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1859e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp]; 1869e1d4b82SJeremy L Thompson } 1879e1d4b82SJeremy L Thompson // Contract x direction 1889e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 1899e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 1909e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1919e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 192a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 1939e1d4b82SJeremy L Thompson 1949e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 195a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 1969e1d4b82SJeremy L Thompson 1979e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 1989e1d4b82SJeremy L Thompson } 1999e1d4b82SJeremy L Thompson } 2009e1d4b82SJeremy L Thompson } 2019e1d4b82SJeremy L Thompson // Pull from shared to register 2029e1d4b82SJeremy L Thompson __syncthreads(); 2039e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 2049e1d4b82SJeremy L Thompson } 2059e1d4b82SJeremy L Thompson } 2069e1d4b82SJeremy L Thompson 2079e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2089e1d4b82SJeremy L Thompson // 2D derivatives at points 2099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 210f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 2119e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 2129e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 2139e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; 2149e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2159e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2169e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2179e1d4b82SJeremy L Thompson 2189e1d4b82SJeremy L Thompson // Load coefficients 219*d6c19ee8SJeremy L Thompson __syncthreads(); 2209e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 2219e1d4b82SJeremy L Thompson __syncthreads(); 2229e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) { 2239e1d4b82SJeremy L Thompson // Contract x direction 2249e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 2259e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 2269e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2279e1d4b82SJeremy L Thompson buffer[i] = 0.0; 2289e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 2299e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 2309e1d4b82SJeremy L Thompson } 2319e1d4b82SJeremy L Thompson } 2329e1d4b82SJeremy L Thompson // Contract y direction 2339e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 2349e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 2359e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2369e1d4b82SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i]; 2379e1d4b82SJeremy L Thompson } 2389e1d4b82SJeremy L Thompson } 2399e1d4b82SJeremy L Thompson } 2409e1d4b82SJeremy L Thompson } 2419e1d4b82SJeremy L Thompson 2429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2439e1d4b82SJeremy L Thompson // 2D derivatives transpose 2449e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 245f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 2469e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 2479e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 2489e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2499e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2509e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2519e1d4b82SJeremy L Thompson 2529e1d4b82SJeremy L Thompson // Clear shared memory 2539e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 2549e1d4b82SJeremy L Thompson __syncthreads(); 2559e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) { 2569e1d4b82SJeremy L Thompson // Contract y direction 2579e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 2589e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 2599e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2609e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP]; 2619e1d4b82SJeremy L Thompson } 2629e1d4b82SJeremy L Thompson // Contract x direction 2639e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 2649e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 2659e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 2669e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2679e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 268a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 2699e1d4b82SJeremy L Thompson 2709e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 271a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 2729e1d4b82SJeremy L Thompson 2739e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 2749e1d4b82SJeremy L Thompson } 2759e1d4b82SJeremy L Thompson } 2769e1d4b82SJeremy L Thompson } 2779e1d4b82SJeremy L Thompson } 2789e1d4b82SJeremy L Thompson // Pull from shared to register 2799e1d4b82SJeremy L Thompson __syncthreads(); 2809e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 2819e1d4b82SJeremy L Thompson } 2829e1d4b82SJeremy L Thompson } 2839e1d4b82SJeremy L Thompson 2849e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2859e1d4b82SJeremy L Thompson // 3D 2869e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2879e1d4b82SJeremy L Thompson 2889e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2899e1d4b82SJeremy L Thompson // 3D interpolate to points 2909e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 291f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 2929e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 2939e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 2949e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 2959e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2969e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 2979e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2989e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2999e1d4b82SJeremy L Thompson 3009e1d4b82SJeremy L Thompson // Load coefficients 301*d6c19ee8SJeremy L Thompson __syncthreads(); 3029e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 3039e1d4b82SJeremy L Thompson __syncthreads(); 3049e1d4b82SJeremy L Thompson // Contract x direction 3059e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3069e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3079e1d4b82SJeremy L Thompson buffer[i] = 0.0; 3089e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 3099e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 3109e1d4b82SJeremy L Thompson } 3119e1d4b82SJeremy L Thompson } 3129e1d4b82SJeremy L Thompson // Contract y and z direction 3139e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 3149e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 3159e1d4b82SJeremy L Thompson 3169e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 3179e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3189e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i] * z; 3199e1d4b82SJeremy L Thompson } 3209e1d4b82SJeremy L Thompson } 3219e1d4b82SJeremy L Thompson } 3229e1d4b82SJeremy L Thompson } 3239e1d4b82SJeremy L Thompson 3249e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 3259e1d4b82SJeremy L Thompson // 3D interpolate transpose 3269e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 327f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 3289e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 3299e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 3309e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 3319e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 3329e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 3339e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 3349e1d4b82SJeremy L Thompson 3359e1d4b82SJeremy L Thompson // Clear shared memory 3369e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 3379e1d4b82SJeremy L Thompson __syncthreads(); 3389e1d4b82SJeremy L Thompson // Contract y and z direction 3399e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 3409e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 3419e1d4b82SJeremy L Thompson 3429e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 3439e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3449e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp] * z; 3459e1d4b82SJeremy L Thompson } 3469e1d4b82SJeremy L Thompson // Contract x direction 3479e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3489e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 3499e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3509e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 351a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 3529e1d4b82SJeremy L Thompson 3539e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 354a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 3559e1d4b82SJeremy L Thompson 3569e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 3579e1d4b82SJeremy L Thompson } 3589e1d4b82SJeremy L Thompson } 3599e1d4b82SJeremy L Thompson } 3609e1d4b82SJeremy L Thompson // Pull from shared to register 3619e1d4b82SJeremy L Thompson __syncthreads(); 3629e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 3639e1d4b82SJeremy L Thompson } 3649e1d4b82SJeremy L Thompson } 3659e1d4b82SJeremy L Thompson } 3669e1d4b82SJeremy L Thompson 3679e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 3689e1d4b82SJeremy L Thompson // 3D derivatives at points 3699e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 370f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 3719e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 3729e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 3739e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; 3749e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 3759e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 3769e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 3779e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 3789e1d4b82SJeremy L Thompson 3799e1d4b82SJeremy L Thompson // Load coefficients 380*d6c19ee8SJeremy L Thompson __syncthreads(); 3819e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 3829e1d4b82SJeremy L Thompson __syncthreads(); 3839e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) { 3849e1d4b82SJeremy L Thompson // Contract x direction 3859e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 3869e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3879e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3889e1d4b82SJeremy L Thompson buffer[i] = 0.0; 3899e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 3909e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 3919e1d4b82SJeremy L Thompson } 3929e1d4b82SJeremy L Thompson } 3939e1d4b82SJeremy L Thompson // Contract y and z direction 3949e1d4b82SJeremy L Thompson if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 3959e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 3969e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 3979e1d4b82SJeremy L Thompson 3989e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 3999e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 4009e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4019e1d4b82SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z; 4029e1d4b82SJeremy L Thompson } 4039e1d4b82SJeremy L Thompson } 4049e1d4b82SJeremy L Thompson } 4059e1d4b82SJeremy L Thompson } 4069e1d4b82SJeremy L Thompson } 4079e1d4b82SJeremy L Thompson 4089e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 4099e1d4b82SJeremy L Thompson // 3D derivatives transpose 4109e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 411f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 4129e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 4139e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 4149e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 4159e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 4169e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 4179e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 4189e1d4b82SJeremy L Thompson 4199e1d4b82SJeremy L Thompson // Clear shared memory 4209e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 4219e1d4b82SJeremy L Thompson __syncthreads(); 4229e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) { 4239e1d4b82SJeremy L Thompson // Contract y and z direction 4249e1d4b82SJeremy L Thompson if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 4259e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 4269e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 4279e1d4b82SJeremy L Thompson 4289e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 4299e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 4309e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4319e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z; 4329e1d4b82SJeremy L Thompson } 4339e1d4b82SJeremy L Thompson // Contract x direction 4349e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 4359e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 4369e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 4379e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4389e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 439a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 4409e1d4b82SJeremy L Thompson 4419e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 442a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 4439e1d4b82SJeremy L Thompson 4449e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 4459e1d4b82SJeremy L Thompson } 4469e1d4b82SJeremy L Thompson } 4479e1d4b82SJeremy L Thompson } 4489e1d4b82SJeremy L Thompson } 4499e1d4b82SJeremy L Thompson // Pull from shared to register 4509e1d4b82SJeremy L Thompson __syncthreads(); 4519e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 4529e1d4b82SJeremy L Thompson } 4539e1d4b82SJeremy L Thompson } 4549e1d4b82SJeremy L Thompson } 455