1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e1d4b82SJeremy L Thompson // 49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 59e1d4b82SJeremy L Thompson // 69e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 79e1d4b82SJeremy L Thompson 89e1d4b82SJeremy L Thompson /// @file 99e1d4b82SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis AtPoints templates 109e1d4b82SJeremy L Thompson #include <ceed/types.h> 119e1d4b82SJeremy L Thompson 129e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 139e1d4b82SJeremy L Thompson // Chebyshev values 149e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 159e1d4b82SJeremy L Thompson template <int Q_1D> 169e1d4b82SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 179e1d4b82SJeremy L Thompson chebyshev_x[0] = 1.0; 189e1d4b82SJeremy L Thompson chebyshev_x[1] = 2 * x; 199e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 209e1d4b82SJeremy L Thompson } 219e1d4b82SJeremy L Thompson 229e1d4b82SJeremy L Thompson template <int Q_1D> 239e1d4b82SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 249e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[3]; 259e1d4b82SJeremy L Thompson 269e1d4b82SJeremy L Thompson chebyshev_x[1] = 1.0; 279e1d4b82SJeremy L Thompson chebyshev_x[2] = 2 * x; 289e1d4b82SJeremy L Thompson chebyshev_dx[0] = 0.0; 299e1d4b82SJeremy L Thompson chebyshev_dx[1] = 2.0; 309e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 319e1d4b82SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 329e1d4b82SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 339e1d4b82SJeremy L Thompson } 349e1d4b82SJeremy L Thompson } 359e1d4b82SJeremy L Thompson 369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 379e1d4b82SJeremy L Thompson // 1D 389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 399e1d4b82SJeremy L Thompson 409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 419e1d4b82SJeremy L Thompson // 1D interpolate to points 429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 43f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 449e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 459e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 469e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 479e1d4b82SJeremy L Thompson 489e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 499e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 509e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 519e1d4b82SJeremy L Thompson // Load coefficients 529e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 539e1d4b82SJeremy L Thompson __syncthreads(); 549e1d4b82SJeremy L Thompson // Contract x direction 559e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 569e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i]; 579e1d4b82SJeremy L Thompson } 589e1d4b82SJeremy L Thompson } 599e1d4b82SJeremy L Thompson } 609e1d4b82SJeremy L Thompson 619e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 629e1d4b82SJeremy L Thompson // 1D interpolate transpose 639e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 64f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 659e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 669e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 679e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 689e1d4b82SJeremy L Thompson 699e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 709e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 719e1d4b82SJeremy L Thompson // Clear shared memory 729e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 739e1d4b82SJeremy L Thompson __syncthreads(); 749e1d4b82SJeremy L Thompson // Contract x direction 759e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 769e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 77a24d84eaSJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 789e1d4b82SJeremy L Thompson } 799e1d4b82SJeremy L Thompson } 809e1d4b82SJeremy L Thompson // Pull from shared to register 819e1d4b82SJeremy L Thompson __syncthreads(); 824eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 839e1d4b82SJeremy L Thompson } 849e1d4b82SJeremy L Thompson } 859e1d4b82SJeremy L Thompson 869e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 879e1d4b82SJeremy L Thompson // 1D derivatives at points 889e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 89f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 909e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 919e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 929e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 939e1d4b82SJeremy L Thompson 949e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 959e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 969e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 979e1d4b82SJeremy L Thompson // Load coefficients 98d6c19ee8SJeremy L Thompson __syncthreads(); 999e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 1009e1d4b82SJeremy L Thompson __syncthreads(); 1019e1d4b82SJeremy L Thompson // Contract x direction 1029e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1039e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i]; 1049e1d4b82SJeremy L Thompson } 1059e1d4b82SJeremy L Thompson } 1069e1d4b82SJeremy L Thompson } 1079e1d4b82SJeremy L Thompson 1089e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1099e1d4b82SJeremy L Thompson // 1D derivatives transpose 1109e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 111f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 1129e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 1139e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 1149e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1159e1d4b82SJeremy L Thompson 1169e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 1179e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1189e1d4b82SJeremy L Thompson // Clear shared memory 1199e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 1209e1d4b82SJeremy L Thompson __syncthreads(); 1219e1d4b82SJeremy L Thompson // Contract x direction 1229e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 1239e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 124a24d84eaSJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 1259e1d4b82SJeremy L Thompson } 1269e1d4b82SJeremy L Thompson } 1279e1d4b82SJeremy L Thompson // Pull from shared to register 1289e1d4b82SJeremy L Thompson __syncthreads(); 1294eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 1309e1d4b82SJeremy L Thompson } 1319e1d4b82SJeremy L Thompson } 1329e1d4b82SJeremy L Thompson 1339e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1349e1d4b82SJeremy L Thompson // 2D 1359e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1369e1d4b82SJeremy L Thompson 1379e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1389e1d4b82SJeremy L Thompson // 2D interpolate to points 1399e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 140f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 1419e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 1429e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 1439e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 1449e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1459e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 1469e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1479e1d4b82SJeremy L Thompson 1489e1d4b82SJeremy L Thompson // Load coefficients 149d6c19ee8SJeremy L Thompson __syncthreads(); 1509e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 1519e1d4b82SJeremy L Thompson __syncthreads(); 1529e1d4b82SJeremy L Thompson // Contract x direction 1539e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 1549e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1559e1d4b82SJeremy L Thompson buffer[i] = 0.0; 1569e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 1579e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 1589e1d4b82SJeremy L Thompson } 1599e1d4b82SJeremy L Thompson } 1609e1d4b82SJeremy L Thompson // Contract y direction 1619e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 1629e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1639e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i]; 1649e1d4b82SJeremy L Thompson } 1659e1d4b82SJeremy L Thompson } 1669e1d4b82SJeremy L Thompson } 1679e1d4b82SJeremy L Thompson 1689e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1699e1d4b82SJeremy L Thompson // 2D interpolate transpose 1709e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 171f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 1729e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 1739e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 1749e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1759e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 1769e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1779e1d4b82SJeremy L Thompson 1789e1d4b82SJeremy L Thompson // Clear shared memory 1799e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 1809e1d4b82SJeremy L Thompson __syncthreads(); 1819e1d4b82SJeremy L Thompson // Contract y direction 1829e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 1839e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1849e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp]; 1859e1d4b82SJeremy L Thompson } 1869e1d4b82SJeremy L Thompson // Contract x direction 1879e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 1889e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 1899e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1909e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 191a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 1929e1d4b82SJeremy L Thompson 1939e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 194a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 1959e1d4b82SJeremy L Thompson 1969e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 1979e1d4b82SJeremy L Thompson } 1989e1d4b82SJeremy L Thompson } 1999e1d4b82SJeremy L Thompson } 2009e1d4b82SJeremy L Thompson // Pull from shared to register 2019e1d4b82SJeremy L Thompson __syncthreads(); 2029e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 2039e1d4b82SJeremy L Thompson } 2049e1d4b82SJeremy L Thompson } 2059e1d4b82SJeremy L Thompson 2069e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2079e1d4b82SJeremy L Thompson // 2D derivatives at points 2089e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 209f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 2109e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 2119e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 2129e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; 2139e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2149e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2159e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2169e1d4b82SJeremy L Thompson 2179e1d4b82SJeremy L Thompson // Load coefficients 218d6c19ee8SJeremy L Thompson __syncthreads(); 2199e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 2209e1d4b82SJeremy L Thompson __syncthreads(); 2219e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) { 2229e1d4b82SJeremy L Thompson // Contract x direction 2239e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 2249e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 2259e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2269e1d4b82SJeremy L Thompson buffer[i] = 0.0; 2279e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 2289e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 2299e1d4b82SJeremy L Thompson } 2309e1d4b82SJeremy L Thompson } 2319e1d4b82SJeremy L Thompson // Contract y direction 2329e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 2339e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 2349e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2359e1d4b82SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i]; 2369e1d4b82SJeremy L Thompson } 2379e1d4b82SJeremy L Thompson } 2389e1d4b82SJeremy L Thompson } 2399e1d4b82SJeremy L Thompson } 2409e1d4b82SJeremy L Thompson 2419e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2429e1d4b82SJeremy L Thompson // 2D derivatives transpose 2439e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 244f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 2459e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 2469e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 2479e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2489e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2499e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2509e1d4b82SJeremy L Thompson 2519e1d4b82SJeremy L Thompson // Clear shared memory 2529e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 2539e1d4b82SJeremy L Thompson __syncthreads(); 2549e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) { 2559e1d4b82SJeremy L Thompson // Contract y direction 2569e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 2579e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 2589e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2599e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP]; 2609e1d4b82SJeremy L Thompson } 2619e1d4b82SJeremy L Thompson // Contract x direction 2629e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 2639e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 2649e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 2659e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2669e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 267a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 2689e1d4b82SJeremy L Thompson 2699e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 270a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 2719e1d4b82SJeremy L Thompson 2729e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 2739e1d4b82SJeremy L Thompson } 2749e1d4b82SJeremy L Thompson } 2759e1d4b82SJeremy L Thompson } 2769e1d4b82SJeremy L Thompson } 2779e1d4b82SJeremy L Thompson // Pull from shared to register 2789e1d4b82SJeremy L Thompson __syncthreads(); 2799e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 2809e1d4b82SJeremy L Thompson } 2819e1d4b82SJeremy L Thompson } 2829e1d4b82SJeremy L Thompson 2839e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2849e1d4b82SJeremy L Thompson // 3D 2859e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2869e1d4b82SJeremy L Thompson 2879e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2889e1d4b82SJeremy L Thompson // 3D interpolate to points 2899e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 290f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 2919e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 2929e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 2939e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 2949e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 2959e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2969e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2979e1d4b82SJeremy L Thompson 298*802d760aSJeremy L Thompson // Get z contraction value 299*802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 300*802d760aSJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 301*802d760aSJeremy L Thompson 302*802d760aSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 3039e1d4b82SJeremy L Thompson // Load coefficients 304d6c19ee8SJeremy L Thompson __syncthreads(); 3059e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 3069e1d4b82SJeremy L Thompson __syncthreads(); 3079e1d4b82SJeremy L Thompson // Contract x direction 3089e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3099e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3109e1d4b82SJeremy L Thompson buffer[i] = 0.0; 3119e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 3129e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 3139e1d4b82SJeremy L Thompson } 3149e1d4b82SJeremy L Thompson } 3159e1d4b82SJeremy L Thompson // Contract y and z direction 3169e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 3179e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3189e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i] * z; 3199e1d4b82SJeremy L Thompson } 3209e1d4b82SJeremy L Thompson } 3219e1d4b82SJeremy L Thompson } 3229e1d4b82SJeremy L Thompson } 3239e1d4b82SJeremy L Thompson 3249e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 3259e1d4b82SJeremy L Thompson // 3D interpolate transpose 3269e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 327f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 3289e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 3299e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 3309e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 3319e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 3329e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 3339e1d4b82SJeremy L Thompson 334*802d760aSJeremy L Thompson // Get z contraction value 335*802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 336*802d760aSJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 337*802d760aSJeremy L Thompson 338*802d760aSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 3399e1d4b82SJeremy L Thompson // Clear shared memory 3409e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 3419e1d4b82SJeremy L Thompson __syncthreads(); 3429e1d4b82SJeremy L Thompson // Contract y and z direction 3439e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 3449e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3459e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp] * z; 3469e1d4b82SJeremy L Thompson } 3479e1d4b82SJeremy L Thompson // Contract x direction 3489e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3499e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 3509e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3519e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 352a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 3539e1d4b82SJeremy L Thompson 3549e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 355a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 3569e1d4b82SJeremy L Thompson 3579e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 3589e1d4b82SJeremy L Thompson } 3599e1d4b82SJeremy L Thompson } 3609e1d4b82SJeremy L Thompson } 3619e1d4b82SJeremy L Thompson // Pull from shared to register 3629e1d4b82SJeremy L Thompson __syncthreads(); 3639e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 3649e1d4b82SJeremy L Thompson } 3659e1d4b82SJeremy L Thompson } 3669e1d4b82SJeremy L Thompson } 3679e1d4b82SJeremy L Thompson 3689e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 3699e1d4b82SJeremy L Thompson // 3D derivatives at points 3709e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 371f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 3729e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 3739e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 3749e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; 3759e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 3769e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 3779e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 3789e1d4b82SJeremy L Thompson 379*802d760aSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 380*802d760aSJeremy L Thompson // Get z contraction value 381*802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 382*802d760aSJeremy L Thompson CeedScalar z = chebyshev_x[k]; 383*802d760aSJeremy L Thompson 3849e1d4b82SJeremy L Thompson // Load coefficients 385d6c19ee8SJeremy L Thompson __syncthreads(); 3869e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 3879e1d4b82SJeremy L Thompson __syncthreads(); 388*802d760aSJeremy L Thompson // Gradient directions 3899e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) { 390*802d760aSJeremy L Thompson // Update z value for final pass 391*802d760aSJeremy L Thompson if (dim == 2) { 392*802d760aSJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 393*802d760aSJeremy L Thompson z = chebyshev_x[k]; 394*802d760aSJeremy L Thompson } 3959e1d4b82SJeremy L Thompson // Contract x direction 3969e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 3979e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3989e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3999e1d4b82SJeremy L Thompson buffer[i] = 0.0; 4009e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 4019e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 4029e1d4b82SJeremy L Thompson } 4039e1d4b82SJeremy L Thompson } 4049e1d4b82SJeremy L Thompson // Contract y and z direction 4059e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 4069e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 4079e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4089e1d4b82SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z; 4099e1d4b82SJeremy L Thompson } 4109e1d4b82SJeremy L Thompson } 4119e1d4b82SJeremy L Thompson } 4129e1d4b82SJeremy L Thompson } 4139e1d4b82SJeremy L Thompson } 4149e1d4b82SJeremy L Thompson 4159e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 4169e1d4b82SJeremy L Thompson // 3D derivatives transpose 4179e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 418f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 4199e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 4209e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 4219e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 4229e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 4239e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 4249e1d4b82SJeremy L Thompson 425*802d760aSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 426*802d760aSJeremy L Thompson // Get z contraction value 427*802d760aSJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 428*802d760aSJeremy L Thompson CeedScalar z = chebyshev_x[k]; 429*802d760aSJeremy L Thompson 4309e1d4b82SJeremy L Thompson // Clear shared memory 4319e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 4329e1d4b82SJeremy L Thompson __syncthreads(); 433*802d760aSJeremy L Thompson // Gradient directions 4349e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) { 435*802d760aSJeremy L Thompson // Update z value for final pass 436*802d760aSJeremy L Thompson if (dim == 2) { 437*802d760aSJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 438*802d760aSJeremy L Thompson z = chebyshev_x[k]; 439*802d760aSJeremy L Thompson } 4409e1d4b82SJeremy L Thompson // Contract y and z direction 4419e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 4429e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 4439e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4449e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z; 4459e1d4b82SJeremy L Thompson } 4469e1d4b82SJeremy L Thompson // Contract x direction 4479e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 4489e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 4499e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 4509e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4519e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 452a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 4539e1d4b82SJeremy L Thompson 4549e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 455a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 4569e1d4b82SJeremy L Thompson 4579e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 4589e1d4b82SJeremy L Thompson } 4599e1d4b82SJeremy L Thompson } 4609e1d4b82SJeremy L Thompson } 4619e1d4b82SJeremy L Thompson } 4629e1d4b82SJeremy L Thompson // Pull from shared to register 4639e1d4b82SJeremy L Thompson __syncthreads(); 4649e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 4659e1d4b82SJeremy L Thompson } 4669e1d4b82SJeremy L Thompson } 4679e1d4b82SJeremy L Thompson } 468