xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h (revision a24d84eaf50532bd6ddb3309c91171c35669c827)
19e1d4b82SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e1d4b82SJeremy L Thompson //
49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
59e1d4b82SJeremy L Thompson //
69e1d4b82SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
79e1d4b82SJeremy L Thompson 
89e1d4b82SJeremy L Thompson /// @file
99e1d4b82SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis AtPoints templates
109e1d4b82SJeremy L Thompson #include <ceed/types.h>
119e1d4b82SJeremy L Thompson 
129e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
139e1d4b82SJeremy L Thompson // Chebyshev values
149e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
159e1d4b82SJeremy L Thompson template <int Q_1D>
169e1d4b82SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
179e1d4b82SJeremy L Thompson   chebyshev_x[0] = 1.0;
189e1d4b82SJeremy L Thompson   chebyshev_x[1] = 2 * x;
199e1d4b82SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
209e1d4b82SJeremy L Thompson }
219e1d4b82SJeremy L Thompson 
229e1d4b82SJeremy L Thompson template <int Q_1D>
239e1d4b82SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
249e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[3];
259e1d4b82SJeremy L Thompson 
269e1d4b82SJeremy L Thompson   chebyshev_x[1]  = 1.0;
279e1d4b82SJeremy L Thompson   chebyshev_x[2]  = 2 * x;
289e1d4b82SJeremy L Thompson   chebyshev_dx[0] = 0.0;
299e1d4b82SJeremy L Thompson   chebyshev_dx[1] = 2.0;
309e1d4b82SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) {
319e1d4b82SJeremy L Thompson     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
329e1d4b82SJeremy L Thompson     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
339e1d4b82SJeremy L Thompson   }
349e1d4b82SJeremy L Thompson }
359e1d4b82SJeremy L Thompson 
369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
379e1d4b82SJeremy L Thompson // 1D
389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
399e1d4b82SJeremy L Thompson 
409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
419e1d4b82SJeremy L Thompson // 1D interpolate to points
429e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
439e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
449e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
459e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
469e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
479e1d4b82SJeremy L Thompson 
489e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
499e1d4b82SJeremy L Thompson   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
509e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
519e1d4b82SJeremy L Thompson     // Load coefficients
529e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
539e1d4b82SJeremy L Thompson     __syncthreads();
549e1d4b82SJeremy L Thompson     // Contract x direction
559e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
569e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * data.slice[i];
579e1d4b82SJeremy L Thompson     }
589e1d4b82SJeremy L Thompson   }
599e1d4b82SJeremy L Thompson }
609e1d4b82SJeremy L Thompson 
619e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
629e1d4b82SJeremy L Thompson // 1D interpolate transpose
639e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
649e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
659e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
669e1d4b82SJeremy L Thompson                                                  CeedScalar *__restrict__ r_C) {
679e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
689e1d4b82SJeremy L Thompson 
699e1d4b82SJeremy L Thompson   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
709e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
719e1d4b82SJeremy L Thompson     // Clear shared memory
729e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
739e1d4b82SJeremy L Thompson     __syncthreads();
749e1d4b82SJeremy L Thompson     // Contract x direction
759e1d4b82SJeremy L Thompson     if (p < NUM_POINTS) {
769e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
77*a24d84eaSJeremy L Thompson         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
789e1d4b82SJeremy L Thompson       }
799e1d4b82SJeremy L Thompson     }
809e1d4b82SJeremy L Thompson     // Pull from shared to register
819e1d4b82SJeremy L Thompson     __syncthreads();
824eda27c2SJeremy L Thompson     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
839e1d4b82SJeremy L Thompson   }
849e1d4b82SJeremy L Thompson }
859e1d4b82SJeremy L Thompson 
869e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
879e1d4b82SJeremy L Thompson // 1D derivatives at points
889e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
899e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
909e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
919e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
929e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
939e1d4b82SJeremy L Thompson 
949e1d4b82SJeremy L Thompson   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
959e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
969e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
979e1d4b82SJeremy L Thompson     // Load coefficients
989e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
999e1d4b82SJeremy L Thompson     __syncthreads();
1009e1d4b82SJeremy L Thompson     // Contract x direction
1019e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1029e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * data.slice[i];
1039e1d4b82SJeremy L Thompson     }
1049e1d4b82SJeremy L Thompson   }
1059e1d4b82SJeremy L Thompson }
1069e1d4b82SJeremy L Thompson 
1079e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1089e1d4b82SJeremy L Thompson // 1D derivatives transpose
1099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1109e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
1119e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
1129e1d4b82SJeremy L Thompson                                                CeedScalar *__restrict__ r_C) {
1139e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
1149e1d4b82SJeremy L Thompson 
1159e1d4b82SJeremy L Thompson   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
1169e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1179e1d4b82SJeremy L Thompson     // Clear shared memory
1189e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
1199e1d4b82SJeremy L Thompson     __syncthreads();
1209e1d4b82SJeremy L Thompson     // Contract x direction
1219e1d4b82SJeremy L Thompson     if (p < NUM_POINTS) {
1229e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
123*a24d84eaSJeremy L Thompson         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
1249e1d4b82SJeremy L Thompson       }
1259e1d4b82SJeremy L Thompson     }
1269e1d4b82SJeremy L Thompson     // Pull from shared to register
1279e1d4b82SJeremy L Thompson     __syncthreads();
1284eda27c2SJeremy L Thompson     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
1299e1d4b82SJeremy L Thompson   }
1309e1d4b82SJeremy L Thompson }
1319e1d4b82SJeremy L Thompson 
1329e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1339e1d4b82SJeremy L Thompson // 2D
1349e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1359e1d4b82SJeremy L Thompson 
1369e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1379e1d4b82SJeremy L Thompson // 2D interpolate to points
1389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1399e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
1409e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
1419e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
1429e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
1439e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1449e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
1459e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
1469e1d4b82SJeremy L Thompson 
1479e1d4b82SJeremy L Thompson     // Load coefficients
1489e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
1499e1d4b82SJeremy L Thompson     __syncthreads();
1509e1d4b82SJeremy L Thompson     // Contract x direction
1519e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
1529e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1539e1d4b82SJeremy L Thompson       buffer[i] = 0.0;
1549e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < Q_1D; j++) {
1559e1d4b82SJeremy L Thompson         buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
1569e1d4b82SJeremy L Thompson       }
1579e1d4b82SJeremy L Thompson     }
1589e1d4b82SJeremy L Thompson     // Contract y direction
1599e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
1609e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1619e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * buffer[i];
1629e1d4b82SJeremy L Thompson     }
1639e1d4b82SJeremy L Thompson   }
1649e1d4b82SJeremy L Thompson }
1659e1d4b82SJeremy L Thompson 
1669e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1679e1d4b82SJeremy L Thompson // 2D interpolate transpose
1689e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
1699e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
1709e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
1719e1d4b82SJeremy L Thompson                                                  CeedScalar *__restrict__ r_C) {
1729e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1739e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
1749e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
1759e1d4b82SJeremy L Thompson 
1769e1d4b82SJeremy L Thompson     // Clear shared memory
1779e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
1789e1d4b82SJeremy L Thompson     __syncthreads();
1799e1d4b82SJeremy L Thompson     // Contract y direction
1809e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
1819e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
1829e1d4b82SJeremy L Thompson       buffer[i] = chebyshev_x[i] * r_U[comp];
1839e1d4b82SJeremy L Thompson     }
1849e1d4b82SJeremy L Thompson     // Contract x direction
1859e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
1869e1d4b82SJeremy L Thompson     if (p < NUM_POINTS) {
1879e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
1889e1d4b82SJeremy L Thompson         // Note: shifting to avoid atomic adds
189*a24d84eaSJeremy L Thompson         const CeedInt ii = (i + data.t_id_x) % Q_1D;
1909e1d4b82SJeremy L Thompson 
1919e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
192*a24d84eaSJeremy L Thompson           const CeedInt jj = (j + data.t_id_y) % Q_1D;
1939e1d4b82SJeremy L Thompson 
1949e1d4b82SJeremy L Thompson           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
1959e1d4b82SJeremy L Thompson         }
1969e1d4b82SJeremy L Thompson       }
1979e1d4b82SJeremy L Thompson     }
1989e1d4b82SJeremy L Thompson     // Pull from shared to register
1999e1d4b82SJeremy L Thompson     __syncthreads();
2009e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
2019e1d4b82SJeremy L Thompson   }
2029e1d4b82SJeremy L Thompson }
2039e1d4b82SJeremy L Thompson 
2049e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2059e1d4b82SJeremy L Thompson // 2D derivatives at points
2069e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2079e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
2089e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
2099e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
2109e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
2119e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2129e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
2139e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
2149e1d4b82SJeremy L Thompson 
2159e1d4b82SJeremy L Thompson     // Load coefficients
2169e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
2179e1d4b82SJeremy L Thompson     __syncthreads();
2189e1d4b82SJeremy L Thompson     for (CeedInt dim = 0; dim < 2; dim++) {
2199e1d4b82SJeremy L Thompson       // Contract x direction
2209e1d4b82SJeremy L Thompson       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
2219e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
2229e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
2239e1d4b82SJeremy L Thompson         buffer[i] = 0.0;
2249e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
2259e1d4b82SJeremy L Thompson           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
2269e1d4b82SJeremy L Thompson         }
2279e1d4b82SJeremy L Thompson       }
2289e1d4b82SJeremy L Thompson       // Contract y direction
2299e1d4b82SJeremy L Thompson       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
2309e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
2319e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
2329e1d4b82SJeremy L Thompson         r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
2339e1d4b82SJeremy L Thompson       }
2349e1d4b82SJeremy L Thompson     }
2359e1d4b82SJeremy L Thompson   }
2369e1d4b82SJeremy L Thompson }
2379e1d4b82SJeremy L Thompson 
2389e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2399e1d4b82SJeremy L Thompson // 2D derivatives transpose
2409e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2419e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
2429e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
2439e1d4b82SJeremy L Thompson                                                CeedScalar *__restrict__ r_C) {
2449e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2459e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
2469e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
2479e1d4b82SJeremy L Thompson 
2489e1d4b82SJeremy L Thompson     // Clear shared memory
2499e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
2509e1d4b82SJeremy L Thompson     __syncthreads();
2519e1d4b82SJeremy L Thompson     for (CeedInt dim = 0; dim < 2; dim++) {
2529e1d4b82SJeremy L Thompson       // Contract y direction
2539e1d4b82SJeremy L Thompson       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
2549e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
2559e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
2569e1d4b82SJeremy L Thompson         buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
2579e1d4b82SJeremy L Thompson       }
2589e1d4b82SJeremy L Thompson       // Contract x direction
2599e1d4b82SJeremy L Thompson       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
2609e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
2619e1d4b82SJeremy L Thompson       if (p < NUM_POINTS) {
2629e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
2639e1d4b82SJeremy L Thompson           // Note: shifting to avoid atomic adds
264*a24d84eaSJeremy L Thompson           const CeedInt ii = (i + data.t_id_x) % Q_1D;
2659e1d4b82SJeremy L Thompson 
2669e1d4b82SJeremy L Thompson           for (CeedInt j = 0; j < Q_1D; j++) {
267*a24d84eaSJeremy L Thompson             const CeedInt jj = (j + data.t_id_y) % Q_1D;
2689e1d4b82SJeremy L Thompson 
2699e1d4b82SJeremy L Thompson             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
2709e1d4b82SJeremy L Thompson           }
2719e1d4b82SJeremy L Thompson         }
2729e1d4b82SJeremy L Thompson       }
2739e1d4b82SJeremy L Thompson     }
2749e1d4b82SJeremy L Thompson     // Pull from shared to register
2759e1d4b82SJeremy L Thompson     __syncthreads();
2769e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
2779e1d4b82SJeremy L Thompson   }
2789e1d4b82SJeremy L Thompson }
2799e1d4b82SJeremy L Thompson 
2809e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2819e1d4b82SJeremy L Thompson // 3D
2829e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2839e1d4b82SJeremy L Thompson 
2849e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2859e1d4b82SJeremy L Thompson // 3D interpolate to points
2869e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
2879e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
2889e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
2899e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
2909e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
2919e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2929e1d4b82SJeremy L Thompson     for (CeedInt k = 0; k < Q_1D; k++) {
2939e1d4b82SJeremy L Thompson       CeedScalar buffer[Q_1D];
2949e1d4b82SJeremy L Thompson       CeedScalar chebyshev_x[Q_1D];
2959e1d4b82SJeremy L Thompson 
2969e1d4b82SJeremy L Thompson       // Load coefficients
2979e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
2989e1d4b82SJeremy L Thompson       __syncthreads();
2999e1d4b82SJeremy L Thompson       // Contract x direction
3009e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3019e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
3029e1d4b82SJeremy L Thompson         buffer[i] = 0.0;
3039e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
3049e1d4b82SJeremy L Thompson           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
3059e1d4b82SJeremy L Thompson         }
3069e1d4b82SJeremy L Thompson       }
3079e1d4b82SJeremy L Thompson       // Contract y and z direction
3089e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
3099e1d4b82SJeremy L Thompson       const CeedScalar z = chebyshev_x[k];
3109e1d4b82SJeremy L Thompson 
3119e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
3129e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
3139e1d4b82SJeremy L Thompson         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
3149e1d4b82SJeremy L Thompson       }
3159e1d4b82SJeremy L Thompson     }
3169e1d4b82SJeremy L Thompson   }
3179e1d4b82SJeremy L Thompson }
3189e1d4b82SJeremy L Thompson 
3199e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3209e1d4b82SJeremy L Thompson // 3D interpolate transpose
3219e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3229e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
3239e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
3249e1d4b82SJeremy L Thompson                                                  CeedScalar *__restrict__ r_C) {
3259e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3269e1d4b82SJeremy L Thompson     for (CeedInt k = 0; k < Q_1D; k++) {
3279e1d4b82SJeremy L Thompson       CeedScalar buffer[Q_1D];
3289e1d4b82SJeremy L Thompson       CeedScalar chebyshev_x[Q_1D];
3299e1d4b82SJeremy L Thompson 
3309e1d4b82SJeremy L Thompson       // Clear shared memory
3319e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
3329e1d4b82SJeremy L Thompson       __syncthreads();
3339e1d4b82SJeremy L Thompson       // Contract y and z direction
3349e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
3359e1d4b82SJeremy L Thompson       const CeedScalar z = chebyshev_x[k];
3369e1d4b82SJeremy L Thompson 
3379e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
3389e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
3399e1d4b82SJeremy L Thompson         buffer[i] = chebyshev_x[i] * r_U[comp] * z;
3409e1d4b82SJeremy L Thompson       }
3419e1d4b82SJeremy L Thompson       // Contract x direction
3429e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3439e1d4b82SJeremy L Thompson       if (p < NUM_POINTS) {
3449e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
3459e1d4b82SJeremy L Thompson           // Note: shifting to avoid atomic adds
346*a24d84eaSJeremy L Thompson           const CeedInt ii = (i + data.t_id_x) % Q_1D;
3479e1d4b82SJeremy L Thompson 
3489e1d4b82SJeremy L Thompson           for (CeedInt j = 0; j < Q_1D; j++) {
349*a24d84eaSJeremy L Thompson             const CeedInt jj = (j + data.t_id_y) % Q_1D;
3509e1d4b82SJeremy L Thompson 
3519e1d4b82SJeremy L Thompson             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
3529e1d4b82SJeremy L Thompson           }
3539e1d4b82SJeremy L Thompson         }
3549e1d4b82SJeremy L Thompson       }
3559e1d4b82SJeremy L Thompson       // Pull from shared to register
3569e1d4b82SJeremy L Thompson       __syncthreads();
3579e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
3589e1d4b82SJeremy L Thompson     }
3599e1d4b82SJeremy L Thompson   }
3609e1d4b82SJeremy L Thompson }
3619e1d4b82SJeremy L Thompson 
3629e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3639e1d4b82SJeremy L Thompson // 3D derivatives at points
3649e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
3659e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
3669e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
3679e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
3689e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
3699e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
3709e1d4b82SJeremy L Thompson     for (CeedInt k = 0; k < Q_1D; k++) {
3719e1d4b82SJeremy L Thompson       CeedScalar buffer[Q_1D];
3729e1d4b82SJeremy L Thompson       CeedScalar chebyshev_x[Q_1D];
3739e1d4b82SJeremy L Thompson 
3749e1d4b82SJeremy L Thompson       // Load coefficients
3759e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
3769e1d4b82SJeremy L Thompson       __syncthreads();
3779e1d4b82SJeremy L Thompson       for (CeedInt dim = 0; dim < 3; dim++) {
3789e1d4b82SJeremy L Thompson         // Contract x direction
3799e1d4b82SJeremy L Thompson         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
3809e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
3819e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
3829e1d4b82SJeremy L Thompson           buffer[i] = 0.0;
3839e1d4b82SJeremy L Thompson           for (CeedInt j = 0; j < Q_1D; j++) {
3849e1d4b82SJeremy L Thompson             buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
3859e1d4b82SJeremy L Thompson           }
3869e1d4b82SJeremy L Thompson         }
3879e1d4b82SJeremy L Thompson         // Contract y and z direction
3889e1d4b82SJeremy L Thompson         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
3899e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
3909e1d4b82SJeremy L Thompson         const CeedScalar z = chebyshev_x[k];
3919e1d4b82SJeremy L Thompson 
3929e1d4b82SJeremy L Thompson         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
3939e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
3949e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
3959e1d4b82SJeremy L Thompson           r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
3969e1d4b82SJeremy L Thompson         }
3979e1d4b82SJeremy L Thompson       }
3989e1d4b82SJeremy L Thompson     }
3999e1d4b82SJeremy L Thompson   }
4009e1d4b82SJeremy L Thompson }
4019e1d4b82SJeremy L Thompson 
4029e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
4039e1d4b82SJeremy L Thompson // 3D derivatives transpose
4049e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
4059e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
4069e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
4079e1d4b82SJeremy L Thompson                                                CeedScalar *__restrict__ r_C) {
4089e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4099e1d4b82SJeremy L Thompson     for (CeedInt k = 0; k < Q_1D; k++) {
4109e1d4b82SJeremy L Thompson       CeedScalar buffer[Q_1D];
4119e1d4b82SJeremy L Thompson       CeedScalar chebyshev_x[Q_1D];
4129e1d4b82SJeremy L Thompson 
4139e1d4b82SJeremy L Thompson       // Clear shared memory
4149e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
4159e1d4b82SJeremy L Thompson       __syncthreads();
4169e1d4b82SJeremy L Thompson       for (CeedInt dim = 0; dim < 3; dim++) {
4179e1d4b82SJeremy L Thompson         // Contract y and z direction
4189e1d4b82SJeremy L Thompson         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
4199e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
4209e1d4b82SJeremy L Thompson         const CeedScalar z = chebyshev_x[k];
4219e1d4b82SJeremy L Thompson 
4229e1d4b82SJeremy L Thompson         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
4239e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
4249e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
4259e1d4b82SJeremy L Thompson           buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
4269e1d4b82SJeremy L Thompson         }
4279e1d4b82SJeremy L Thompson         // Contract x direction
4289e1d4b82SJeremy L Thompson         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
4299e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
4309e1d4b82SJeremy L Thompson         if (p < NUM_POINTS) {
4319e1d4b82SJeremy L Thompson           for (CeedInt i = 0; i < Q_1D; i++) {
4329e1d4b82SJeremy L Thompson             // Note: shifting to avoid atomic adds
433*a24d84eaSJeremy L Thompson             const CeedInt ii = (i + data.t_id_x) % Q_1D;
4349e1d4b82SJeremy L Thompson 
4359e1d4b82SJeremy L Thompson             for (CeedInt j = 0; j < Q_1D; j++) {
436*a24d84eaSJeremy L Thompson               const CeedInt jj = (j + data.t_id_y) % Q_1D;
4379e1d4b82SJeremy L Thompson 
4389e1d4b82SJeremy L Thompson               atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
4399e1d4b82SJeremy L Thompson             }
4409e1d4b82SJeremy L Thompson           }
4419e1d4b82SJeremy L Thompson         }
4429e1d4b82SJeremy L Thompson       }
4439e1d4b82SJeremy L Thompson       // Pull from shared to register
4449e1d4b82SJeremy L Thompson       __syncthreads();
4459e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
4469e1d4b82SJeremy L Thompson     }
4479e1d4b82SJeremy L Thompson   }
4489e1d4b82SJeremy L Thompson }
449