19e1d4b82SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e1d4b82SJeremy L Thompson // 49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 59e1d4b82SJeremy L Thompson // 69e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 79e1d4b82SJeremy L Thompson 89e1d4b82SJeremy L Thompson /// @file 99e1d4b82SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis AtPoints templates 109e1d4b82SJeremy L Thompson #include <ceed/types.h> 119e1d4b82SJeremy L Thompson 129e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 139e1d4b82SJeremy L Thompson // Chebyshev values 149e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 159e1d4b82SJeremy L Thompson template <int Q_1D> 169e1d4b82SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 179e1d4b82SJeremy L Thompson chebyshev_x[0] = 1.0; 189e1d4b82SJeremy L Thompson chebyshev_x[1] = 2 * x; 199e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 209e1d4b82SJeremy L Thompson } 219e1d4b82SJeremy L Thompson 229e1d4b82SJeremy L Thompson template <int Q_1D> 239e1d4b82SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 249e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[3]; 259e1d4b82SJeremy L Thompson 269e1d4b82SJeremy L Thompson chebyshev_x[1] = 1.0; 279e1d4b82SJeremy L Thompson chebyshev_x[2] = 2 * x; 289e1d4b82SJeremy L Thompson chebyshev_dx[0] = 0.0; 299e1d4b82SJeremy L Thompson chebyshev_dx[1] = 2.0; 309e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 319e1d4b82SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 329e1d4b82SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 339e1d4b82SJeremy L Thompson } 349e1d4b82SJeremy L Thompson } 359e1d4b82SJeremy L Thompson 369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 379e1d4b82SJeremy L Thompson // 1D 389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 399e1d4b82SJeremy L Thompson 409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 419e1d4b82SJeremy L Thompson // 1D interpolate to points 429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 439e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 449e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 459e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 469e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 479e1d4b82SJeremy L Thompson 489e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 499e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 509e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 519e1d4b82SJeremy L Thompson // Load coefficients 529e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 539e1d4b82SJeremy L Thompson __syncthreads(); 549e1d4b82SJeremy L Thompson // Contract x direction 559e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 569e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i]; 579e1d4b82SJeremy L Thompson } 589e1d4b82SJeremy L Thompson } 599e1d4b82SJeremy L Thompson } 609e1d4b82SJeremy L Thompson 619e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 629e1d4b82SJeremy L Thompson // 1D interpolate transpose 639e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 649e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 659e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 669e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 679e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 689e1d4b82SJeremy L Thompson 699e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 709e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 719e1d4b82SJeremy L Thompson // Clear shared memory 729e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 739e1d4b82SJeremy L Thompson __syncthreads(); 749e1d4b82SJeremy L Thompson // Contract x direction 759e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 769e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 77*a24d84eaSJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 789e1d4b82SJeremy L Thompson } 799e1d4b82SJeremy L Thompson } 809e1d4b82SJeremy L Thompson // Pull from shared to register 819e1d4b82SJeremy L Thompson __syncthreads(); 824eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 839e1d4b82SJeremy L Thompson } 849e1d4b82SJeremy L Thompson } 859e1d4b82SJeremy L Thompson 869e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 879e1d4b82SJeremy L Thompson // 1D derivatives at points 889e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 899e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 909e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 919e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 929e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 939e1d4b82SJeremy L Thompson 949e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 959e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 969e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 979e1d4b82SJeremy L Thompson // Load coefficients 989e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 999e1d4b82SJeremy L Thompson __syncthreads(); 1009e1d4b82SJeremy L Thompson // Contract x direction 1019e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1029e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i]; 1039e1d4b82SJeremy L Thompson } 1049e1d4b82SJeremy L Thompson } 1059e1d4b82SJeremy L Thompson } 1069e1d4b82SJeremy L Thompson 1079e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1089e1d4b82SJeremy L Thompson // 1D derivatives transpose 1099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1109e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 1119e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 1129e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 1139e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1149e1d4b82SJeremy L Thompson 1159e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 1169e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1179e1d4b82SJeremy L Thompson // Clear shared memory 1189e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 1199e1d4b82SJeremy L Thompson __syncthreads(); 1209e1d4b82SJeremy L Thompson // Contract x direction 1219e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 1229e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 123*a24d84eaSJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 1249e1d4b82SJeremy L Thompson } 1259e1d4b82SJeremy L Thompson } 1269e1d4b82SJeremy L Thompson // Pull from shared to register 1279e1d4b82SJeremy L Thompson __syncthreads(); 1284eda27c2SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 1299e1d4b82SJeremy L Thompson } 1309e1d4b82SJeremy L Thompson } 1319e1d4b82SJeremy L Thompson 1329e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1339e1d4b82SJeremy L Thompson // 2D 1349e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1359e1d4b82SJeremy L Thompson 1369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1379e1d4b82SJeremy L Thompson // 2D interpolate to points 1389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1399e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 1409e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 1419e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 1429e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 1439e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1449e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 1459e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1469e1d4b82SJeremy L Thompson 1479e1d4b82SJeremy L Thompson // Load coefficients 1489e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 1499e1d4b82SJeremy L Thompson __syncthreads(); 1509e1d4b82SJeremy L Thompson // Contract x direction 1519e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 1529e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1539e1d4b82SJeremy L Thompson buffer[i] = 0.0; 1549e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 1559e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 1569e1d4b82SJeremy L Thompson } 1579e1d4b82SJeremy L Thompson } 1589e1d4b82SJeremy L Thompson // Contract y direction 1599e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 1609e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1619e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i]; 1629e1d4b82SJeremy L Thompson } 1639e1d4b82SJeremy L Thompson } 1649e1d4b82SJeremy L Thompson } 1659e1d4b82SJeremy L Thompson 1669e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1679e1d4b82SJeremy L Thompson // 2D interpolate transpose 1689e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1699e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 1709e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 1719e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 1729e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1739e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 1749e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 1759e1d4b82SJeremy L Thompson 1769e1d4b82SJeremy L Thompson // Clear shared memory 1779e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 1789e1d4b82SJeremy L Thompson __syncthreads(); 1799e1d4b82SJeremy L Thompson // Contract y direction 1809e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 1819e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1829e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp]; 1839e1d4b82SJeremy L Thompson } 1849e1d4b82SJeremy L Thompson // Contract x direction 1859e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 1869e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 1879e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 1889e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 189*a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 1909e1d4b82SJeremy L Thompson 1919e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 192*a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 1939e1d4b82SJeremy L Thompson 1949e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 1959e1d4b82SJeremy L Thompson } 1969e1d4b82SJeremy L Thompson } 1979e1d4b82SJeremy L Thompson } 1989e1d4b82SJeremy L Thompson // Pull from shared to register 1999e1d4b82SJeremy L Thompson __syncthreads(); 2009e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 2019e1d4b82SJeremy L Thompson } 2029e1d4b82SJeremy L Thompson } 2039e1d4b82SJeremy L Thompson 2049e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2059e1d4b82SJeremy L Thompson // 2D derivatives at points 2069e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2079e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 2089e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 2099e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 2109e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; 2119e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2129e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2139e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2149e1d4b82SJeremy L Thompson 2159e1d4b82SJeremy L Thompson // Load coefficients 2169e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 2179e1d4b82SJeremy L Thompson __syncthreads(); 2189e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) { 2199e1d4b82SJeremy L Thompson // Contract x direction 2209e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 2219e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 2229e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2239e1d4b82SJeremy L Thompson buffer[i] = 0.0; 2249e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 2259e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 2269e1d4b82SJeremy L Thompson } 2279e1d4b82SJeremy L Thompson } 2289e1d4b82SJeremy L Thompson // Contract y direction 2299e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 2309e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 2319e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2329e1d4b82SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i]; 2339e1d4b82SJeremy L Thompson } 2349e1d4b82SJeremy L Thompson } 2359e1d4b82SJeremy L Thompson } 2369e1d4b82SJeremy L Thompson } 2379e1d4b82SJeremy L Thompson 2389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2399e1d4b82SJeremy L Thompson // 2D derivatives transpose 2409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2419e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 2429e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 2439e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 2449e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2459e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2469e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2479e1d4b82SJeremy L Thompson 2489e1d4b82SJeremy L Thompson // Clear shared memory 2499e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 2509e1d4b82SJeremy L Thompson __syncthreads(); 2519e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) { 2529e1d4b82SJeremy L Thompson // Contract y direction 2539e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 2549e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 2559e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2569e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP]; 2579e1d4b82SJeremy L Thompson } 2589e1d4b82SJeremy L Thompson // Contract x direction 2599e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 2609e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 2619e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 2629e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2639e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 264*a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 2659e1d4b82SJeremy L Thompson 2669e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 267*a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 2689e1d4b82SJeremy L Thompson 2699e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 2709e1d4b82SJeremy L Thompson } 2719e1d4b82SJeremy L Thompson } 2729e1d4b82SJeremy L Thompson } 2739e1d4b82SJeremy L Thompson } 2749e1d4b82SJeremy L Thompson // Pull from shared to register 2759e1d4b82SJeremy L Thompson __syncthreads(); 2769e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 2779e1d4b82SJeremy L Thompson } 2789e1d4b82SJeremy L Thompson } 2799e1d4b82SJeremy L Thompson 2809e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2819e1d4b82SJeremy L Thompson // 3D 2829e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2839e1d4b82SJeremy L Thompson 2849e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2859e1d4b82SJeremy L Thompson // 3D interpolate to points 2869e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2879e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 2889e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 2899e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 2909e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 2919e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2929e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 2939e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 2949e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 2959e1d4b82SJeremy L Thompson 2969e1d4b82SJeremy L Thompson // Load coefficients 2979e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 2989e1d4b82SJeremy L Thompson __syncthreads(); 2999e1d4b82SJeremy L Thompson // Contract x direction 3009e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3019e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3029e1d4b82SJeremy L Thompson buffer[i] = 0.0; 3039e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 3049e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 3059e1d4b82SJeremy L Thompson } 3069e1d4b82SJeremy L Thompson } 3079e1d4b82SJeremy L Thompson // Contract y and z direction 3089e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 3099e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 3109e1d4b82SJeremy L Thompson 3119e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 3129e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3139e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i] * z; 3149e1d4b82SJeremy L Thompson } 3159e1d4b82SJeremy L Thompson } 3169e1d4b82SJeremy L Thompson } 3179e1d4b82SJeremy L Thompson } 3189e1d4b82SJeremy L Thompson 3199e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 3209e1d4b82SJeremy L Thompson // 3D interpolate transpose 3219e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 3229e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 3239e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 3249e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 3259e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 3269e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 3279e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 3289e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 3299e1d4b82SJeremy L Thompson 3309e1d4b82SJeremy L Thompson // Clear shared memory 3319e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 3329e1d4b82SJeremy L Thompson __syncthreads(); 3339e1d4b82SJeremy L Thompson // Contract y and z direction 3349e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 3359e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 3369e1d4b82SJeremy L Thompson 3379e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 3389e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3399e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp] * z; 3409e1d4b82SJeremy L Thompson } 3419e1d4b82SJeremy L Thompson // Contract x direction 3429e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3439e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 3449e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3459e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 346*a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 3479e1d4b82SJeremy L Thompson 3489e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 349*a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 3509e1d4b82SJeremy L Thompson 3519e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 3529e1d4b82SJeremy L Thompson } 3539e1d4b82SJeremy L Thompson } 3549e1d4b82SJeremy L Thompson } 3559e1d4b82SJeremy L Thompson // Pull from shared to register 3569e1d4b82SJeremy L Thompson __syncthreads(); 3579e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 3589e1d4b82SJeremy L Thompson } 3599e1d4b82SJeremy L Thompson } 3609e1d4b82SJeremy L Thompson } 3619e1d4b82SJeremy L Thompson 3629e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 3639e1d4b82SJeremy L Thompson // 3D derivatives at points 3649e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 3659e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 3669e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 3679e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 3689e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; 3699e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 3709e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 3719e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 3729e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 3739e1d4b82SJeremy L Thompson 3749e1d4b82SJeremy L Thompson // Load coefficients 3759e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 3769e1d4b82SJeremy L Thompson __syncthreads(); 3779e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) { 3789e1d4b82SJeremy L Thompson // Contract x direction 3799e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 3809e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 3819e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3829e1d4b82SJeremy L Thompson buffer[i] = 0.0; 3839e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 3849e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 3859e1d4b82SJeremy L Thompson } 3869e1d4b82SJeremy L Thompson } 3879e1d4b82SJeremy L Thompson // Contract y and z direction 3889e1d4b82SJeremy L Thompson if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 3899e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 3909e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 3919e1d4b82SJeremy L Thompson 3929e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 3939e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 3949e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3959e1d4b82SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z; 3969e1d4b82SJeremy L Thompson } 3979e1d4b82SJeremy L Thompson } 3989e1d4b82SJeremy L Thompson } 3999e1d4b82SJeremy L Thompson } 4009e1d4b82SJeremy L Thompson } 4019e1d4b82SJeremy L Thompson 4029e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 4039e1d4b82SJeremy L Thompson // 3D derivatives transpose 4049e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 4059e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 4069e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 4079e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 4089e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 4099e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 4109e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 4119e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 4129e1d4b82SJeremy L Thompson 4139e1d4b82SJeremy L Thompson // Clear shared memory 4149e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 4159e1d4b82SJeremy L Thompson __syncthreads(); 4169e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) { 4179e1d4b82SJeremy L Thompson // Contract y and z direction 4189e1d4b82SJeremy L Thompson if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 4199e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 4209e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 4219e1d4b82SJeremy L Thompson 4229e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 4239e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 4249e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4259e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z; 4269e1d4b82SJeremy L Thompson } 4279e1d4b82SJeremy L Thompson // Contract x direction 4289e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 4299e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 4309e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 4319e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4329e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 433*a24d84eaSJeremy L Thompson const CeedInt ii = (i + data.t_id_x) % Q_1D; 4349e1d4b82SJeremy L Thompson 4359e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 436*a24d84eaSJeremy L Thompson const CeedInt jj = (j + data.t_id_y) % Q_1D; 4379e1d4b82SJeremy L Thompson 4389e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 4399e1d4b82SJeremy L Thompson } 4409e1d4b82SJeremy L Thompson } 4419e1d4b82SJeremy L Thompson } 4429e1d4b82SJeremy L Thompson } 4439e1d4b82SJeremy L Thompson // Pull from shared to register 4449e1d4b82SJeremy L Thompson __syncthreads(); 4459e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 4469e1d4b82SJeremy L Thompson } 4479e1d4b82SJeremy L Thompson } 4489e1d4b82SJeremy L Thompson } 449