xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h (revision 802d760a181830887ae39e389e664dcc61030cbc)
1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e1d4b82SJeremy L Thompson //
49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
59e1d4b82SJeremy L Thompson //
69e1d4b82SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
79e1d4b82SJeremy L Thompson 
89e1d4b82SJeremy L Thompson /// @file
99e1d4b82SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis AtPoints templates
109e1d4b82SJeremy L Thompson #include <ceed/types.h>
119e1d4b82SJeremy L Thompson 
129e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
139e1d4b82SJeremy L Thompson // Chebyshev values
149e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
159e1d4b82SJeremy L Thompson template <int Q_1D>
169e1d4b82SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
179e1d4b82SJeremy L Thompson   chebyshev_x[0] = 1.0;
189e1d4b82SJeremy L Thompson   chebyshev_x[1] = 2 * x;
199e1d4b82SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
209e1d4b82SJeremy L Thompson }
219e1d4b82SJeremy L Thompson 
229e1d4b82SJeremy L Thompson template <int Q_1D>
239e1d4b82SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
249e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[3];
259e1d4b82SJeremy L Thompson 
269e1d4b82SJeremy L Thompson   chebyshev_x[1]  = 1.0;
279e1d4b82SJeremy L Thompson   chebyshev_x[2]  = 2 * x;
289e1d4b82SJeremy L Thompson   chebyshev_dx[0] = 0.0;
299e1d4b82SJeremy L Thompson   chebyshev_dx[1] = 2.0;
309e1d4b82SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) {
319e1d4b82SJeremy L Thompson     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
329e1d4b82SJeremy L Thompson     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
339e1d4b82SJeremy L Thompson   }
349e1d4b82SJeremy L Thompson }
359e1d4b82SJeremy L Thompson 
369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
379e1d4b82SJeremy L Thompson // 1D
389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
399e1d4b82SJeremy L Thompson 
409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
419e1d4b82SJeremy L Thompson // 1D interpolate to points
429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
43f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
449e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
459e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
469e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
479e1d4b82SJeremy L Thompson 
489e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
499e1d4b82SJeremy L Thompson   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
509e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
519e1d4b82SJeremy L Thompson     // Load coefficients
529e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
539e1d4b82SJeremy L Thompson     __syncthreads();
549e1d4b82SJeremy L Thompson     // Contract x direction
559e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
569e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * data.slice[i];
579e1d4b82SJeremy L Thompson     }
589e1d4b82SJeremy L Thompson   }
599e1d4b82SJeremy L Thompson }
609e1d4b82SJeremy L Thompson 
619e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
629e1d4b82SJeremy L Thompson // 1D interpolate transpose
639e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
64f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
659e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
669e1d4b82SJeremy L Thompson                                                  CeedScalar *__restrict__ r_C) {
679e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
689e1d4b82SJeremy L Thompson 
699e1d4b82SJeremy L Thompson   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
709e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
719e1d4b82SJeremy L Thompson     // Clear shared memory
729e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
739e1d4b82SJeremy L Thompson     __syncthreads();
749e1d4b82SJeremy L Thompson     // Contract x direction
759e1d4b82SJeremy L Thompson     if (p < NUM_POINTS) {
769e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
77a24d84eaSJeremy L Thompson         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
789e1d4b82SJeremy L Thompson       }
799e1d4b82SJeremy L Thompson     }
809e1d4b82SJeremy L Thompson     // Pull from shared to register
819e1d4b82SJeremy L Thompson     __syncthreads();
824eda27c2SJeremy L Thompson     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
839e1d4b82SJeremy L Thompson   }
849e1d4b82SJeremy L Thompson }
859e1d4b82SJeremy L Thompson 
869e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
879e1d4b82SJeremy L Thompson // 1D derivatives at points
889e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
89f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
909e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
919e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
929e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
939e1d4b82SJeremy L Thompson 
949e1d4b82SJeremy L Thompson   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
959e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
969e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
979e1d4b82SJeremy L Thompson     // Load coefficients
98d6c19ee8SJeremy L Thompson     __syncthreads();
999e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
1009e1d4b82SJeremy L Thompson     __syncthreads();
1019e1d4b82SJeremy L Thompson     // Contract x direction
1029e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1039e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * data.slice[i];
1049e1d4b82SJeremy L Thompson     }
1059e1d4b82SJeremy L Thompson   }
1069e1d4b82SJeremy L Thompson }
1079e1d4b82SJeremy L Thompson 
1089e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1099e1d4b82SJeremy L Thompson // 1D derivatives transpose
1109e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
111f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
1129e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
1139e1d4b82SJeremy L Thompson                                                CeedScalar *__restrict__ r_C) {
1149e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
1159e1d4b82SJeremy L Thompson 
1169e1d4b82SJeremy L Thompson   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
1179e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1189e1d4b82SJeremy L Thompson     // Clear shared memory
1199e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
1209e1d4b82SJeremy L Thompson     __syncthreads();
1219e1d4b82SJeremy L Thompson     // Contract x direction
1229e1d4b82SJeremy L Thompson     if (p < NUM_POINTS) {
1239e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
124a24d84eaSJeremy L Thompson         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
1259e1d4b82SJeremy L Thompson       }
1269e1d4b82SJeremy L Thompson     }
1279e1d4b82SJeremy L Thompson     // Pull from shared to register
1289e1d4b82SJeremy L Thompson     __syncthreads();
1294eda27c2SJeremy L Thompson     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
1309e1d4b82SJeremy L Thompson   }
1319e1d4b82SJeremy L Thompson }
1329e1d4b82SJeremy L Thompson 
1339e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1349e1d4b82SJeremy L Thompson // 2D
1359e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1369e1d4b82SJeremy L Thompson 
1379e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1389e1d4b82SJeremy L Thompson // 2D interpolate to points
1399e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
140f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
1419e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
1429e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
1439e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
1449e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1459e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
1469e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
1479e1d4b82SJeremy L Thompson 
1489e1d4b82SJeremy L Thompson     // Load coefficients
149d6c19ee8SJeremy L Thompson     __syncthreads();
1509e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
1519e1d4b82SJeremy L Thompson     __syncthreads();
1529e1d4b82SJeremy L Thompson     // Contract x direction
1539e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
1549e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1559e1d4b82SJeremy L Thompson       buffer[i] = 0.0;
1569e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < Q_1D; j++) {
1579e1d4b82SJeremy L Thompson         buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
1589e1d4b82SJeremy L Thompson       }
1599e1d4b82SJeremy L Thompson     }
1609e1d4b82SJeremy L Thompson     // Contract y direction
1619e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
1629e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1639e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * buffer[i];
1649e1d4b82SJeremy L Thompson     }
1659e1d4b82SJeremy L Thompson   }
1669e1d4b82SJeremy L Thompson }
1679e1d4b82SJeremy L Thompson 
1689e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1699e1d4b82SJeremy L Thompson // 2D interpolate transpose
1709e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
171f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
1729e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
1739e1d4b82SJeremy L Thompson                                                  CeedScalar *__restrict__ r_C) {
1749e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1759e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
1769e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
1779e1d4b82SJeremy L Thompson 
1789e1d4b82SJeremy L Thompson     // Clear shared memory
1799e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
1809e1d4b82SJeremy L Thompson     __syncthreads();
1819e1d4b82SJeremy L Thompson     // Contract y direction
1829e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
1839e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1849e1d4b82SJeremy L Thompson       buffer[i] = chebyshev_x[i] * r_U[comp];
1859e1d4b82SJeremy L Thompson     }
1869e1d4b82SJeremy L Thompson     // Contract x direction
1879e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
1889e1d4b82SJeremy L Thompson     if (p < NUM_POINTS) {
1899e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
1909e1d4b82SJeremy L Thompson         // Note: shifting to avoid atomic adds
191a24d84eaSJeremy L Thompson         const CeedInt ii = (i + data.t_id_x) % Q_1D;
1929e1d4b82SJeremy L Thompson 
1939e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
194a24d84eaSJeremy L Thompson           const CeedInt jj = (j + data.t_id_y) % Q_1D;
1959e1d4b82SJeremy L Thompson 
1969e1d4b82SJeremy L Thompson           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
1979e1d4b82SJeremy L Thompson         }
1989e1d4b82SJeremy L Thompson       }
1999e1d4b82SJeremy L Thompson     }
2009e1d4b82SJeremy L Thompson     // Pull from shared to register
2019e1d4b82SJeremy L Thompson     __syncthreads();
2029e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
2039e1d4b82SJeremy L Thompson   }
2049e1d4b82SJeremy L Thompson }
2059e1d4b82SJeremy L Thompson 
2069e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2079e1d4b82SJeremy L Thompson // 2D derivatives at points
2089e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
209f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
2109e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
2119e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
2129e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
2139e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2149e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
2159e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
2169e1d4b82SJeremy L Thompson 
2179e1d4b82SJeremy L Thompson     // Load coefficients
218d6c19ee8SJeremy L Thompson     __syncthreads();
2199e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
2209e1d4b82SJeremy L Thompson     __syncthreads();
2219e1d4b82SJeremy L Thompson     for (CeedInt dim = 0; dim < 2; dim++) {
2229e1d4b82SJeremy L Thompson       // Contract x direction
2239e1d4b82SJeremy L Thompson       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
2249e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
2259e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
2269e1d4b82SJeremy L Thompson         buffer[i] = 0.0;
2279e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
2289e1d4b82SJeremy L Thompson           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
2299e1d4b82SJeremy L Thompson         }
2309e1d4b82SJeremy L Thompson       }
2319e1d4b82SJeremy L Thompson       // Contract y direction
2329e1d4b82SJeremy L Thompson       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
2339e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
2349e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
2359e1d4b82SJeremy L Thompson         r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
2369e1d4b82SJeremy L Thompson       }
2379e1d4b82SJeremy L Thompson     }
2389e1d4b82SJeremy L Thompson   }
2399e1d4b82SJeremy L Thompson }
2409e1d4b82SJeremy L Thompson 
2419e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2429e1d4b82SJeremy L Thompson // 2D derivatives transpose
2439e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
244f29bd075SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
2459e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
2469e1d4b82SJeremy L Thompson                                                CeedScalar *__restrict__ r_C) {
2479e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2489e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
2499e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
2509e1d4b82SJeremy L Thompson 
2519e1d4b82SJeremy L Thompson     // Clear shared memory
2529e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
2539e1d4b82SJeremy L Thompson     __syncthreads();
2549e1d4b82SJeremy L Thompson     for (CeedInt dim = 0; dim < 2; dim++) {
2559e1d4b82SJeremy L Thompson       // Contract y direction
2569e1d4b82SJeremy L Thompson       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
2579e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
2589e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
2599e1d4b82SJeremy L Thompson         buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
2609e1d4b82SJeremy L Thompson       }
2619e1d4b82SJeremy L Thompson       // Contract x direction
2629e1d4b82SJeremy L Thompson       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
2639e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
2649e1d4b82SJeremy L Thompson       if (p < NUM_POINTS) {
2659e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
2669e1d4b82SJeremy L Thompson           // Note: shifting to avoid atomic adds
267a24d84eaSJeremy L Thompson           const CeedInt ii = (i + data.t_id_x) % Q_1D;
2689e1d4b82SJeremy L Thompson 
2699e1d4b82SJeremy L Thompson           for (CeedInt j = 0; j < Q_1D; j++) {
270a24d84eaSJeremy L Thompson             const CeedInt jj = (j + data.t_id_y) % Q_1D;
2719e1d4b82SJeremy L Thompson 
2729e1d4b82SJeremy L Thompson             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
2739e1d4b82SJeremy L Thompson           }
2749e1d4b82SJeremy L Thompson         }
2759e1d4b82SJeremy L Thompson       }
2769e1d4b82SJeremy L Thompson     }
2779e1d4b82SJeremy L Thompson     // Pull from shared to register
2789e1d4b82SJeremy L Thompson     __syncthreads();
2799e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
2809e1d4b82SJeremy L Thompson   }
2819e1d4b82SJeremy L Thompson }
2829e1d4b82SJeremy L Thompson 
2839e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2849e1d4b82SJeremy L Thompson // 3D
2859e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2869e1d4b82SJeremy L Thompson 
2879e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2889e1d4b82SJeremy L Thompson // 3D interpolate to points
2899e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
290f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
2919e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
2929e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
2939e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
2949e1d4b82SJeremy L Thompson   for (CeedInt k = 0; k < Q_1D; k++) {
2959e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
2969e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
2979e1d4b82SJeremy L Thompson 
298*802d760aSJeremy L Thompson     // Get z contraction value
299*802d760aSJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
300*802d760aSJeremy L Thompson     const CeedScalar z = chebyshev_x[k];
301*802d760aSJeremy L Thompson 
302*802d760aSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3039e1d4b82SJeremy L Thompson       // Load coefficients
304d6c19ee8SJeremy L Thompson       __syncthreads();
3059e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
3069e1d4b82SJeremy L Thompson       __syncthreads();
3079e1d4b82SJeremy L Thompson       // Contract x direction
3089e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3099e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
3109e1d4b82SJeremy L Thompson         buffer[i] = 0.0;
3119e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
3129e1d4b82SJeremy L Thompson           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
3139e1d4b82SJeremy L Thompson         }
3149e1d4b82SJeremy L Thompson       }
3159e1d4b82SJeremy L Thompson       // Contract y and z direction
3169e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
3179e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
3189e1d4b82SJeremy L Thompson         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
3199e1d4b82SJeremy L Thompson       }
3209e1d4b82SJeremy L Thompson     }
3219e1d4b82SJeremy L Thompson   }
3229e1d4b82SJeremy L Thompson }
3239e1d4b82SJeremy L Thompson 
3249e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3259e1d4b82SJeremy L Thompson // 3D interpolate transpose
3269e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
327f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
3289e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
3299e1d4b82SJeremy L Thompson                                                  CeedScalar *__restrict__ r_C) {
3309e1d4b82SJeremy L Thompson   for (CeedInt k = 0; k < Q_1D; k++) {
3319e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
3329e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
3339e1d4b82SJeremy L Thompson 
334*802d760aSJeremy L Thompson     // Get z contraction value
335*802d760aSJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
336*802d760aSJeremy L Thompson     const CeedScalar z = chebyshev_x[k];
337*802d760aSJeremy L Thompson 
338*802d760aSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3399e1d4b82SJeremy L Thompson       // Clear shared memory
3409e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
3419e1d4b82SJeremy L Thompson       __syncthreads();
3429e1d4b82SJeremy L Thompson       // Contract y and z direction
3439e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
3449e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
3459e1d4b82SJeremy L Thompson         buffer[i] = chebyshev_x[i] * r_U[comp] * z;
3469e1d4b82SJeremy L Thompson       }
3479e1d4b82SJeremy L Thompson       // Contract x direction
3489e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3499e1d4b82SJeremy L Thompson       if (p < NUM_POINTS) {
3509e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
3519e1d4b82SJeremy L Thompson           // Note: shifting to avoid atomic adds
352a24d84eaSJeremy L Thompson           const CeedInt ii = (i + data.t_id_x) % Q_1D;
3539e1d4b82SJeremy L Thompson 
3549e1d4b82SJeremy L Thompson           for (CeedInt j = 0; j < Q_1D; j++) {
355a24d84eaSJeremy L Thompson             const CeedInt jj = (j + data.t_id_y) % Q_1D;
3569e1d4b82SJeremy L Thompson 
3579e1d4b82SJeremy L Thompson             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
3589e1d4b82SJeremy L Thompson           }
3599e1d4b82SJeremy L Thompson         }
3609e1d4b82SJeremy L Thompson       }
3619e1d4b82SJeremy L Thompson       // Pull from shared to register
3629e1d4b82SJeremy L Thompson       __syncthreads();
3639e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
3649e1d4b82SJeremy L Thompson     }
3659e1d4b82SJeremy L Thompson   }
3669e1d4b82SJeremy L Thompson }
3679e1d4b82SJeremy L Thompson 
3689e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3699e1d4b82SJeremy L Thompson // 3D derivatives at points
3709e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
371f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
3729e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
3739e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
3749e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
3759e1d4b82SJeremy L Thompson   for (CeedInt k = 0; k < Q_1D; k++) {
3769e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
3779e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
3789e1d4b82SJeremy L Thompson 
379*802d760aSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
380*802d760aSJeremy L Thompson       // Get z contraction value
381*802d760aSJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
382*802d760aSJeremy L Thompson       CeedScalar z = chebyshev_x[k];
383*802d760aSJeremy L Thompson 
3849e1d4b82SJeremy L Thompson       // Load coefficients
385d6c19ee8SJeremy L Thompson       __syncthreads();
3869e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
3879e1d4b82SJeremy L Thompson       __syncthreads();
388*802d760aSJeremy L Thompson       // Gradient directions
3899e1d4b82SJeremy L Thompson       for (CeedInt dim = 0; dim < 3; dim++) {
390*802d760aSJeremy L Thompson         // Update z value for final pass
391*802d760aSJeremy L Thompson         if (dim == 2) {
392*802d760aSJeremy L Thompson           ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
393*802d760aSJeremy L Thompson           z = chebyshev_x[k];
394*802d760aSJeremy L Thompson         }
3959e1d4b82SJeremy L Thompson         // Contract x direction
3969e1d4b82SJeremy L Thompson         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
3979e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3989e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
3999e1d4b82SJeremy L Thompson           buffer[i] = 0.0;
4009e1d4b82SJeremy L Thompson           for (CeedInt j = 0; j < Q_1D; j++) {
4019e1d4b82SJeremy L Thompson             buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
4029e1d4b82SJeremy L Thompson           }
4039e1d4b82SJeremy L Thompson         }
4049e1d4b82SJeremy L Thompson         // Contract y and z direction
4059e1d4b82SJeremy L Thompson         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
4069e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
4079e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
4089e1d4b82SJeremy L Thompson           r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
4099e1d4b82SJeremy L Thompson         }
4109e1d4b82SJeremy L Thompson       }
4119e1d4b82SJeremy L Thompson     }
4129e1d4b82SJeremy L Thompson   }
4139e1d4b82SJeremy L Thompson }
4149e1d4b82SJeremy L Thompson 
4159e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
4169e1d4b82SJeremy L Thompson // 3D derivatives transpose
4179e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
418f725b54bSJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
4199e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
4209e1d4b82SJeremy L Thompson                                                CeedScalar *__restrict__ r_C) {
4219e1d4b82SJeremy L Thompson   for (CeedInt k = 0; k < Q_1D; k++) {
4229e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
4239e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
4249e1d4b82SJeremy L Thompson 
425*802d760aSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
426*802d760aSJeremy L Thompson       // Get z contraction value
427*802d760aSJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
428*802d760aSJeremy L Thompson       CeedScalar z = chebyshev_x[k];
429*802d760aSJeremy L Thompson 
4309e1d4b82SJeremy L Thompson       // Clear shared memory
4319e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
4329e1d4b82SJeremy L Thompson       __syncthreads();
433*802d760aSJeremy L Thompson       // Gradient directions
4349e1d4b82SJeremy L Thompson       for (CeedInt dim = 0; dim < 3; dim++) {
435*802d760aSJeremy L Thompson         // Update z value for final pass
436*802d760aSJeremy L Thompson         if (dim == 2) {
437*802d760aSJeremy L Thompson           ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
438*802d760aSJeremy L Thompson           z = chebyshev_x[k];
439*802d760aSJeremy L Thompson         }
4409e1d4b82SJeremy L Thompson         // Contract y and z direction
4419e1d4b82SJeremy L Thompson         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
4429e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
4439e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
4449e1d4b82SJeremy L Thompson           buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
4459e1d4b82SJeremy L Thompson         }
4469e1d4b82SJeremy L Thompson         // Contract x direction
4479e1d4b82SJeremy L Thompson         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
4489e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
4499e1d4b82SJeremy L Thompson         if (p < NUM_POINTS) {
4509e1d4b82SJeremy L Thompson           for (CeedInt i = 0; i < Q_1D; i++) {
4519e1d4b82SJeremy L Thompson             // Note: shifting to avoid atomic adds
452a24d84eaSJeremy L Thompson             const CeedInt ii = (i + data.t_id_x) % Q_1D;
4539e1d4b82SJeremy L Thompson 
4549e1d4b82SJeremy L Thompson             for (CeedInt j = 0; j < Q_1D; j++) {
455a24d84eaSJeremy L Thompson               const CeedInt jj = (j + data.t_id_y) % Q_1D;
4569e1d4b82SJeremy L Thompson 
4579e1d4b82SJeremy L Thompson               atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
4589e1d4b82SJeremy L Thompson             }
4599e1d4b82SJeremy L Thompson           }
4609e1d4b82SJeremy L Thompson         }
4619e1d4b82SJeremy L Thompson       }
4629e1d4b82SJeremy L Thompson       // Pull from shared to register
4639e1d4b82SJeremy L Thompson       __syncthreads();
4649e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
4659e1d4b82SJeremy L Thompson     }
4669e1d4b82SJeremy L Thompson   }
4679e1d4b82SJeremy L Thompson }
468