xref: /libCEED/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h (revision c0b5abf0f23b15c4f0ada76f8abe9f8d2b6fa247)
11c21e869SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
21c21e869SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
31c21e869SJeremy L Thompson //
41c21e869SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
51c21e869SJeremy L Thompson //
61c21e869SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
71c21e869SJeremy L Thompson 
81c21e869SJeremy L Thompson /// @file
91c21e869SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation
101c21e869SJeremy L Thompson 
11*c0b5abf0SJeremy L Thompson #include <ceed/types.h>
121c21e869SJeremy L Thompson 
131c21e869SJeremy L Thompson //------------------------------------------------------------------------------
141c21e869SJeremy L Thompson // Chebyshev values
151c21e869SJeremy L Thompson //------------------------------------------------------------------------------
161c21e869SJeremy L Thompson template <int Q_1D>
171c21e869SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
181c21e869SJeremy L Thompson   chebyshev_x[0] = 1.0;
191c21e869SJeremy L Thompson   chebyshev_x[1] = 2 * x;
201c21e869SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
211c21e869SJeremy L Thompson }
221c21e869SJeremy L Thompson 
231c21e869SJeremy L Thompson template <int Q_1D>
241c21e869SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
251c21e869SJeremy L Thompson   CeedScalar chebyshev_x[3];
261c21e869SJeremy L Thompson 
271c21e869SJeremy L Thompson   chebyshev_x[1]  = 1.0;
281c21e869SJeremy L Thompson   chebyshev_x[2]  = 2 * x;
291c21e869SJeremy L Thompson   chebyshev_dx[0] = 0.0;
301c21e869SJeremy L Thompson   chebyshev_dx[1] = 2.0;
311c21e869SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) {
3280c135a8SJeremy L Thompson     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
3380c135a8SJeremy L Thompson     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
341c21e869SJeremy L Thompson   }
351c21e869SJeremy L Thompson }
361c21e869SJeremy L Thompson 
371c21e869SJeremy L Thompson //------------------------------------------------------------------------------
381c21e869SJeremy L Thompson // Tensor Basis Kernels AtPoints
391c21e869SJeremy L Thompson //------------------------------------------------------------------------------
401c21e869SJeremy L Thompson 
411c21e869SJeremy L Thompson //------------------------------------------------------------------------------
421c21e869SJeremy L Thompson // Interp
431c21e869SJeremy L Thompson //------------------------------------------------------------------------------
441c21e869SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
45111870feSJeremy L Thompson                                           const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
46111870feSJeremy L Thompson                                           const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
471c21e869SJeremy L Thompson   const CeedInt i = threadIdx.x;
481c21e869SJeremy L Thompson 
49f7c9815fSJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
501c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
511c21e869SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
521c21e869SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
531c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
541c21e869SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
551c21e869SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
561c21e869SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
571c21e869SJeremy L Thompson   }
581c21e869SJeremy L Thompson 
591c21e869SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
601c21e869SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
611c21e869SJeremy L Thompson   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
621c21e869SJeremy L Thompson   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
631c21e869SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
641c21e869SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
651c21e869SJeremy L Thompson   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
661c21e869SJeremy L Thompson 
671c21e869SJeremy L Thompson   // Apply basis element by element
681c21e869SJeremy L Thompson   if (is_transpose) {
691c21e869SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
701c21e869SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
71db2becc9SJeremy L Thompson         const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
72db2becc9SJeremy L Thompson         CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
731c21e869SJeremy L Thompson         CeedInt           pre   = 1;
741c21e869SJeremy L Thompson         CeedInt           post  = 1;
751c21e869SJeremy L Thompson 
761c21e869SJeremy L Thompson         // Clear Chebyshev coeffs
771c21e869SJeremy L Thompson         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
781c21e869SJeremy L Thompson           s_chebyshev_coeffs[k] = 0.0;
791c21e869SJeremy L Thompson         }
801c21e869SJeremy L Thompson 
811c21e869SJeremy L Thompson         // Map from point
822d10e82cSJeremy L Thompson         __syncthreads();
832d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
84111870feSJeremy L Thompson           if (p >= points_per_elem[elem]) continue;
851c21e869SJeremy L Thompson           pre  = 1;
861c21e869SJeremy L Thompson           post = 1;
871c21e869SJeremy L Thompson           for (CeedInt d = 0; d < BASIS_DIM; d++) {
881c21e869SJeremy L Thompson             // Update buffers used
891c21e869SJeremy L Thompson             pre /= 1;
90db2becc9SJeremy L Thompson             const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
911c21e869SJeremy L Thompson             CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
921c21e869SJeremy L Thompson 
931c21e869SJeremy L Thompson             // Build Chebyshev polynomial values
941c21e869SJeremy L Thompson             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
951c21e869SJeremy L Thompson 
961c21e869SJeremy L Thompson             // Contract along middle index
971c21e869SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
981c21e869SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
991c21e869SJeremy L Thompson                 if (d == BASIS_DIM - 1) {
100ad8059fcSJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
1011c21e869SJeremy L Thompson                 } else {
1021c21e869SJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
1031c21e869SJeremy L Thompson                 }
1041c21e869SJeremy L Thompson               }
1051c21e869SJeremy L Thompson             }
1061c21e869SJeremy L Thompson             post *= Q;
1071c21e869SJeremy L Thompson           }
1081c21e869SJeremy L Thompson         }
1091c21e869SJeremy L Thompson 
1101c21e869SJeremy L Thompson         // Map from coefficients
1111c21e869SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
1121c21e869SJeremy L Thompson         post = 1;
1131c21e869SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
1141c21e869SJeremy L Thompson           __syncthreads();
1151c21e869SJeremy L Thompson           // Update buffers used
1161c21e869SJeremy L Thompson           pre /= Q;
1171c21e869SJeremy L Thompson           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
1181c21e869SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
1191c21e869SJeremy L Thompson           const CeedInt     writeLen = pre * post * P;
1201c21e869SJeremy L Thompson 
1211c21e869SJeremy L Thompson           // Contract along middle index
1221c21e869SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
1231c21e869SJeremy L Thompson             const CeedInt c   = k % post;
1241c21e869SJeremy L Thompson             const CeedInt j   = (k / post) % P;
1251c21e869SJeremy L Thompson             const CeedInt a   = k / (post * P);
1261c21e869SJeremy L Thompson             CeedScalar    v_k = 0;
1271c21e869SJeremy L Thompson 
1281c21e869SJeremy L Thompson             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
129db2becc9SJeremy L Thompson             if (d == BASIS_DIM - 1) out[k] += v_k;
130db2becc9SJeremy L Thompson             else out[k] = v_k;
1311c21e869SJeremy L Thompson           }
1321c21e869SJeremy L Thompson           post *= P;
1331c21e869SJeremy L Thompson         }
1341c21e869SJeremy L Thompson       }
1351c21e869SJeremy L Thompson     }
1361c21e869SJeremy L Thompson   } else {
1371c21e869SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
1381c21e869SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
139db2becc9SJeremy L Thompson         const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
140db2becc9SJeremy L Thompson         CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
1411c21e869SJeremy L Thompson         CeedInt           pre   = u_size;
1421c21e869SJeremy L Thompson         CeedInt           post  = 1;
1431c21e869SJeremy L Thompson 
1441c21e869SJeremy L Thompson         // Map to coefficients
1451c21e869SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
1461c21e869SJeremy L Thompson           __syncthreads();
1471c21e869SJeremy L Thompson           // Update buffers used
1481c21e869SJeremy L Thompson           pre /= P;
1491c21e869SJeremy L Thompson           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
1501c21e869SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
1511c21e869SJeremy L Thompson           const CeedInt     writeLen = pre * post * Q;
1521c21e869SJeremy L Thompson 
1531c21e869SJeremy L Thompson           // Contract along middle index
1541c21e869SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
1551c21e869SJeremy L Thompson             const CeedInt c   = k % post;
1561c21e869SJeremy L Thompson             const CeedInt j   = (k / post) % Q;
1571c21e869SJeremy L Thompson             const CeedInt a   = k / (post * Q);
1581c21e869SJeremy L Thompson             CeedScalar    v_k = 0;
1591c21e869SJeremy L Thompson 
1601c21e869SJeremy L Thompson             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
1611c21e869SJeremy L Thompson             out[k] = v_k;
1621c21e869SJeremy L Thompson           }
1631c21e869SJeremy L Thompson           post *= Q;
1641c21e869SJeremy L Thompson         }
1651c21e869SJeremy L Thompson 
1661c21e869SJeremy L Thompson         // Map to point
1671c21e869SJeremy L Thompson         __syncthreads();
1682d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
1691c21e869SJeremy L Thompson           pre  = BASIS_NUM_QPTS;
1701c21e869SJeremy L Thompson           post = 1;
1711c21e869SJeremy L Thompson           for (CeedInt d = 0; d < BASIS_DIM; d++) {
1721c21e869SJeremy L Thompson             // Update buffers used
1731c21e869SJeremy L Thompson             pre /= Q;
1741c21e869SJeremy L Thompson             const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
175db2becc9SJeremy L Thompson             CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
1761c21e869SJeremy L Thompson 
1771c21e869SJeremy L Thompson             // Build Chebyshev polynomial values
1781c21e869SJeremy L Thompson             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
1791c21e869SJeremy L Thompson 
1801c21e869SJeremy L Thompson             // Contract along middle index
1811c21e869SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
1821c21e869SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
1831c21e869SJeremy L Thompson                 CeedScalar v_k = 0;
1841c21e869SJeremy L Thompson 
1851c21e869SJeremy L Thompson                 for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
1861c21e869SJeremy L Thompson                 out[a * post + c] = v_k;
1871c21e869SJeremy L Thompson               }
1881c21e869SJeremy L Thompson             }
1891c21e869SJeremy L Thompson             post *= 1;
1901c21e869SJeremy L Thompson           }
1911c21e869SJeremy L Thompson         }
1921c21e869SJeremy L Thompson       }
1931c21e869SJeremy L Thompson     }
1941c21e869SJeremy L Thompson   }
1951c21e869SJeremy L Thompson }
1961c21e869SJeremy L Thompson 
1971c21e869SJeremy L Thompson //------------------------------------------------------------------------------
1981c21e869SJeremy L Thompson // Grad
1991c21e869SJeremy L Thompson //------------------------------------------------------------------------------
2001c21e869SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
201111870feSJeremy L Thompson                                         const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
202111870feSJeremy L Thompson                                         const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
2031c21e869SJeremy L Thompson   const CeedInt i = threadIdx.x;
2041c21e869SJeremy L Thompson 
205f7c9815fSJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
2061c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
2071c21e869SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
2081c21e869SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
2091c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
2101c21e869SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
2111c21e869SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
2121c21e869SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
2131c21e869SJeremy L Thompson   }
2141c21e869SJeremy L Thompson 
2151c21e869SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
2161c21e869SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
2171c21e869SJeremy L Thompson   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
2181c21e869SJeremy L Thompson   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
2191c21e869SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
2201c21e869SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
2211c21e869SJeremy L Thompson   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
2221c21e869SJeremy L Thompson   const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
2231c21e869SJeremy L Thompson   const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
2241c21e869SJeremy L Thompson 
2251c21e869SJeremy L Thompson   // Apply basis element by element
2261c21e869SJeremy L Thompson   if (is_transpose) {
2271c21e869SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
2281c21e869SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
229db2becc9SJeremy L Thompson         CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
2301c21e869SJeremy L Thompson         CeedInt     pre   = 1;
2311c21e869SJeremy L Thompson         CeedInt     post  = 1;
2321c21e869SJeremy L Thompson 
2331c21e869SJeremy L Thompson         // Clear Chebyshev coeffs
2341c21e869SJeremy L Thompson         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
2351c21e869SJeremy L Thompson           s_chebyshev_coeffs[k] = 0.0;
2361c21e869SJeremy L Thompson         }
2371c21e869SJeremy L Thompson 
2381c21e869SJeremy L Thompson         // Map from point
2392d10e82cSJeremy L Thompson         __syncthreads();
2402d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
241111870feSJeremy L Thompson           if (p >= points_per_elem[elem]) continue;
2421c21e869SJeremy L Thompson           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
243db2becc9SJeremy L Thompson             const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
2441c21e869SJeremy L Thompson 
2451c21e869SJeremy L Thompson             pre  = 1;
2461c21e869SJeremy L Thompson             post = 1;
2471c21e869SJeremy L Thompson             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
2481c21e869SJeremy L Thompson               // Update buffers used
2491c21e869SJeremy L Thompson               pre /= 1;
250db2becc9SJeremy L Thompson               const CeedScalar *in  = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
2511c21e869SJeremy L Thompson               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
2521c21e869SJeremy L Thompson 
2531c21e869SJeremy L Thompson               // Build Chebyshev polynomial values
2541c21e869SJeremy L Thompson               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
2551c21e869SJeremy L Thompson               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
2561c21e869SJeremy L Thompson 
2571c21e869SJeremy L Thompson               // Contract along middle index
2581c21e869SJeremy L Thompson               for (CeedInt a = 0; a < pre; a++) {
2591c21e869SJeremy L Thompson                 for (CeedInt c = 0; c < post; c++) {
2601c21e869SJeremy L Thompson                   if (dim_2 == BASIS_DIM - 1) {
261ad8059fcSJeremy L Thompson                     for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
2621c21e869SJeremy L Thompson                   } else {
2631c21e869SJeremy L Thompson                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
2641c21e869SJeremy L Thompson                   }
2651c21e869SJeremy L Thompson                 }
2661c21e869SJeremy L Thompson               }
2671c21e869SJeremy L Thompson               post *= Q;
2681c21e869SJeremy L Thompson             }
2691c21e869SJeremy L Thompson           }
2701c21e869SJeremy L Thompson         }
2711c21e869SJeremy L Thompson 
2721c21e869SJeremy L Thompson         // Map from coefficients
2731c21e869SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
2741c21e869SJeremy L Thompson         post = 1;
2751c21e869SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
2761c21e869SJeremy L Thompson           __syncthreads();
2771c21e869SJeremy L Thompson           // Update buffers used
2781c21e869SJeremy L Thompson           pre /= Q;
2791c21e869SJeremy L Thompson           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
2801c21e869SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
2811c21e869SJeremy L Thompson           const CeedInt     writeLen = pre * post * P;
2821c21e869SJeremy L Thompson 
2831c21e869SJeremy L Thompson           // Contract along middle index
2841c21e869SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
2851c21e869SJeremy L Thompson             const CeedInt c   = k % post;
2861c21e869SJeremy L Thompson             const CeedInt j   = (k / post) % P;
2871c21e869SJeremy L Thompson             const CeedInt a   = k / (post * P);
2881c21e869SJeremy L Thompson             CeedScalar    v_k = 0;
2891c21e869SJeremy L Thompson 
2901c21e869SJeremy L Thompson             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
291db2becc9SJeremy L Thompson             if (d == BASIS_DIM - 1) out[k] += v_k;
292db2becc9SJeremy L Thompson             else out[k] = v_k;
2931c21e869SJeremy L Thompson           }
2941c21e869SJeremy L Thompson           post *= P;
2951c21e869SJeremy L Thompson         }
2961c21e869SJeremy L Thompson       }
2971c21e869SJeremy L Thompson     }
2981c21e869SJeremy L Thompson   } else {
2991c21e869SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
3001c21e869SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
301db2becc9SJeremy L Thompson         const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
3021c21e869SJeremy L Thompson         CeedInt           pre   = u_size;
3031c21e869SJeremy L Thompson         CeedInt           post  = 1;
3041c21e869SJeremy L Thompson 
3051c21e869SJeremy L Thompson         // Map to coefficients
3061c21e869SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
3071c21e869SJeremy L Thompson           __syncthreads();
3081c21e869SJeremy L Thompson           // Update buffers used
3091c21e869SJeremy L Thompson           pre /= P;
3101c21e869SJeremy L Thompson           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
3111c21e869SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
3121c21e869SJeremy L Thompson           const CeedInt     writeLen = pre * post * Q;
3131c21e869SJeremy L Thompson 
3141c21e869SJeremy L Thompson           // Contract along middle index
3151c21e869SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
3161c21e869SJeremy L Thompson             const CeedInt c   = k % post;
3171c21e869SJeremy L Thompson             const CeedInt j   = (k / post) % Q;
3181c21e869SJeremy L Thompson             const CeedInt a   = k / (post * Q);
3191c21e869SJeremy L Thompson             CeedScalar    v_k = 0;
3201c21e869SJeremy L Thompson 
3211c21e869SJeremy L Thompson             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
3221c21e869SJeremy L Thompson             out[k] = v_k;
3231c21e869SJeremy L Thompson           }
3241c21e869SJeremy L Thompson           post *= Q;
3251c21e869SJeremy L Thompson         }
3261c21e869SJeremy L Thompson 
3271c21e869SJeremy L Thompson         // Map to point
3281c21e869SJeremy L Thompson         __syncthreads();
3292d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
3301c21e869SJeremy L Thompson           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
331db2becc9SJeremy L Thompson             CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
3321c21e869SJeremy L Thompson 
3331c21e869SJeremy L Thompson             pre  = BASIS_NUM_QPTS;
3341c21e869SJeremy L Thompson             post = 1;
3351c21e869SJeremy L Thompson             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
3361c21e869SJeremy L Thompson               // Update buffers used
3371c21e869SJeremy L Thompson               pre /= Q;
3381c21e869SJeremy L Thompson               const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
339db2becc9SJeremy L Thompson               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);
3401c21e869SJeremy L Thompson 
3411c21e869SJeremy L Thompson               // Build Chebyshev polynomial values
3421c21e869SJeremy L Thompson               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
3431c21e869SJeremy L Thompson               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
3441c21e869SJeremy L Thompson 
3451c21e869SJeremy L Thompson               // Contract along middle index
3461c21e869SJeremy L Thompson               for (CeedInt a = 0; a < pre; a++) {
3471c21e869SJeremy L Thompson                 for (CeedInt c = 0; c < post; c++) {
3481c21e869SJeremy L Thompson                   CeedScalar v_k = 0;
3491c21e869SJeremy L Thompson 
3501c21e869SJeremy L Thompson                   for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
3511c21e869SJeremy L Thompson                   out[a * post + c] = v_k;
3521c21e869SJeremy L Thompson                 }
3531c21e869SJeremy L Thompson               }
3541c21e869SJeremy L Thompson               post *= 1;
3551c21e869SJeremy L Thompson             }
3561c21e869SJeremy L Thompson           }
3571c21e869SJeremy L Thompson         }
3581c21e869SJeremy L Thompson       }
3591c21e869SJeremy L Thompson     }
3601c21e869SJeremy L Thompson   }
3611c21e869SJeremy L Thompson }
362