xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h (revision f7c9815f6bdd2aac16e9c7ed574a2a61404669c3)
134d14614SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
234d14614SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
334d14614SJeremy L Thompson //
434d14614SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
534d14614SJeremy L Thompson //
634d14614SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
734d14614SJeremy L Thompson 
834d14614SJeremy L Thompson /// @file
934d14614SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation
1034d14614SJeremy L Thompson 
1134d14614SJeremy L Thompson #include <ceed.h>
1234d14614SJeremy L Thompson 
1334d14614SJeremy L Thompson //------------------------------------------------------------------------------
1434d14614SJeremy L Thompson // Chebyshev values
1534d14614SJeremy L Thompson //------------------------------------------------------------------------------
1634d14614SJeremy L Thompson template <int Q_1D>
1734d14614SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
1834d14614SJeremy L Thompson   chebyshev_x[0] = 1.0;
1934d14614SJeremy L Thompson   chebyshev_x[1] = 2 * x;
2034d14614SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
2134d14614SJeremy L Thompson }
2234d14614SJeremy L Thompson 
2334d14614SJeremy L Thompson template <int Q_1D>
2434d14614SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
2534d14614SJeremy L Thompson   CeedScalar chebyshev_x[3];
2634d14614SJeremy L Thompson 
2734d14614SJeremy L Thompson   chebyshev_x[1]  = 1.0;
2834d14614SJeremy L Thompson   chebyshev_x[2]  = 2 * x;
2934d14614SJeremy L Thompson   chebyshev_dx[0] = 0.0;
3034d14614SJeremy L Thompson   chebyshev_dx[1] = 2.0;
3134d14614SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) {
3234d14614SJeremy L Thompson     chebyshev_x[0]  = chebyshev_x[1];
3334d14614SJeremy L Thompson     chebyshev_x[1]  = chebyshev_x[2];
3434d14614SJeremy L Thompson     chebyshev_x[2]  = 2 * x * chebyshev_x[1] - chebyshev_x[0];
3534d14614SJeremy L Thompson     chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[1] - chebyshev_dx[i - 2];
3634d14614SJeremy L Thompson   }
3734d14614SJeremy L Thompson }
3834d14614SJeremy L Thompson 
3934d14614SJeremy L Thompson //------------------------------------------------------------------------------
4034d14614SJeremy L Thompson // Tensor Basis Kernels AtPoints
4134d14614SJeremy L Thompson //------------------------------------------------------------------------------
4234d14614SJeremy L Thompson 
4334d14614SJeremy L Thompson //------------------------------------------------------------------------------
4434d14614SJeremy L Thompson // Interp
4534d14614SJeremy L Thompson //------------------------------------------------------------------------------
4634d14614SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
4734d14614SJeremy L Thompson                                           const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
4834d14614SJeremy L Thompson   const CeedInt i = threadIdx.x;
4934d14614SJeremy L Thompson 
50*f7c9815fSJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
5134d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
5234d14614SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
5334d14614SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
5434d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
5534d14614SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
5634d14614SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
5734d14614SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
5834d14614SJeremy L Thompson   }
5934d14614SJeremy L Thompson 
6034d14614SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
6134d14614SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
6234d14614SJeremy L Thompson   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
6334d14614SJeremy L Thompson   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
6434d14614SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
6534d14614SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
6634d14614SJeremy L Thompson   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
6734d14614SJeremy L Thompson 
6834d14614SJeremy L Thompson   // Apply basis element by element
6934d14614SJeremy L Thompson   if (is_transpose) {
7034d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
7134d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
7234d14614SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
7334d14614SJeremy L Thompson         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
7434d14614SJeremy L Thompson         CeedInt           pre   = 1;
7534d14614SJeremy L Thompson         CeedInt           post  = 1;
7634d14614SJeremy L Thompson 
7734d14614SJeremy L Thompson         // Clear Chebyshev coeffs
7834d14614SJeremy L Thompson         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
7934d14614SJeremy L Thompson           s_chebyshev_coeffs[k] = 0.0;
8034d14614SJeremy L Thompson         }
8134d14614SJeremy L Thompson 
8234d14614SJeremy L Thompson         // Map from point
832d10e82cSJeremy L Thompson         __syncthreads();
842d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
8534d14614SJeremy L Thompson           pre  = 1;
8634d14614SJeremy L Thompson           post = 1;
8734d14614SJeremy L Thompson           for (CeedInt d = 0; d < BASIS_DIM; d++) {
8834d14614SJeremy L Thompson             // Update buffers used
8934d14614SJeremy L Thompson             pre /= 1;
9034d14614SJeremy L Thompson             const CeedScalar *in  = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1);
9134d14614SJeremy L Thompson             CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
9234d14614SJeremy L Thompson 
9334d14614SJeremy L Thompson             // Build Chebyshev polynomial values
9434d14614SJeremy L Thompson             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
9534d14614SJeremy L Thompson 
9634d14614SJeremy L Thompson             // Contract along middle index
9734d14614SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
9834d14614SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
9934d14614SJeremy L Thompson                 if (d == BASIS_DIM - 1) {
1002d10e82cSJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]);
10134d14614SJeremy L Thompson                 } else {
10234d14614SJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
10334d14614SJeremy L Thompson                 }
10434d14614SJeremy L Thompson               }
10534d14614SJeremy L Thompson             }
10634d14614SJeremy L Thompson             post *= Q;
10734d14614SJeremy L Thompson           }
10834d14614SJeremy L Thompson         }
10934d14614SJeremy L Thompson 
11034d14614SJeremy L Thompson         // Map from coefficients
11134d14614SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
11234d14614SJeremy L Thompson         post = 1;
11334d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
11434d14614SJeremy L Thompson           __syncthreads();
11534d14614SJeremy L Thompson           // Update buffers used
11634d14614SJeremy L Thompson           pre /= Q;
11734d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
11834d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
11934d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * P;
12034d14614SJeremy L Thompson 
12134d14614SJeremy L Thompson           // Contract along middle index
12234d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
12334d14614SJeremy L Thompson             const CeedInt c   = k % post;
12434d14614SJeremy L Thompson             const CeedInt j   = (k / post) % P;
12534d14614SJeremy L Thompson             const CeedInt a   = k / (post * P);
12634d14614SJeremy L Thompson             CeedScalar    v_k = 0;
12734d14614SJeremy L Thompson 
12834d14614SJeremy L Thompson             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
12934d14614SJeremy L Thompson             out[k] = v_k;
13034d14614SJeremy L Thompson           }
13134d14614SJeremy L Thompson           post *= P;
13234d14614SJeremy L Thompson         }
13334d14614SJeremy L Thompson       }
13434d14614SJeremy L Thompson     }
13534d14614SJeremy L Thompson   } else {
13634d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
13734d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
13834d14614SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
13934d14614SJeremy L Thompson         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
14034d14614SJeremy L Thompson         CeedInt           pre   = u_size;
14134d14614SJeremy L Thompson         CeedInt           post  = 1;
14234d14614SJeremy L Thompson 
14334d14614SJeremy L Thompson         // Map to coefficients
14434d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
14534d14614SJeremy L Thompson           __syncthreads();
14634d14614SJeremy L Thompson           // Update buffers used
14734d14614SJeremy L Thompson           pre /= P;
14834d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
14934d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
15034d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * Q;
15134d14614SJeremy L Thompson 
15234d14614SJeremy L Thompson           // Contract along middle index
15334d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
15434d14614SJeremy L Thompson             const CeedInt c   = k % post;
15534d14614SJeremy L Thompson             const CeedInt j   = (k / post) % Q;
15634d14614SJeremy L Thompson             const CeedInt a   = k / (post * Q);
15734d14614SJeremy L Thompson             CeedScalar    v_k = 0;
15834d14614SJeremy L Thompson 
15934d14614SJeremy L Thompson             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
16034d14614SJeremy L Thompson             out[k] = v_k;
16134d14614SJeremy L Thompson           }
16234d14614SJeremy L Thompson           post *= Q;
16334d14614SJeremy L Thompson         }
16434d14614SJeremy L Thompson 
16534d14614SJeremy L Thompson         // Map to point
16634d14614SJeremy L Thompson         __syncthreads();
1672d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
16834d14614SJeremy L Thompson           pre  = BASIS_NUM_QPTS;
16934d14614SJeremy L Thompson           post = 1;
17034d14614SJeremy L Thompson           for (CeedInt d = 0; d < BASIS_DIM; d++) {
17134d14614SJeremy L Thompson             // Update buffers used
17234d14614SJeremy L Thompson             pre /= Q;
17334d14614SJeremy L Thompson             const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
17434d14614SJeremy L Thompson             CeedScalar       *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2);
17534d14614SJeremy L Thompson 
17634d14614SJeremy L Thompson             // Build Chebyshev polynomial values
17734d14614SJeremy L Thompson             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
17834d14614SJeremy L Thompson 
17934d14614SJeremy L Thompson             // Contract along middle index
18034d14614SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
18134d14614SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
18234d14614SJeremy L Thompson                 CeedScalar v_k = 0;
18334d14614SJeremy L Thompson 
18434d14614SJeremy L Thompson                 for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
18534d14614SJeremy L Thompson                 out[a * post + c] = v_k;
18634d14614SJeremy L Thompson               }
18734d14614SJeremy L Thompson             }
18834d14614SJeremy L Thompson             post *= 1;
18934d14614SJeremy L Thompson           }
19034d14614SJeremy L Thompson         }
19134d14614SJeremy L Thompson       }
19234d14614SJeremy L Thompson     }
19334d14614SJeremy L Thompson   }
19434d14614SJeremy L Thompson }
19534d14614SJeremy L Thompson 
19634d14614SJeremy L Thompson //------------------------------------------------------------------------------
19734d14614SJeremy L Thompson // Grad
19834d14614SJeremy L Thompson //------------------------------------------------------------------------------
19934d14614SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
20034d14614SJeremy L Thompson                                         const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
20134d14614SJeremy L Thompson   const CeedInt i = threadIdx.x;
20234d14614SJeremy L Thompson 
203*f7c9815fSJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
20434d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
20534d14614SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
20634d14614SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
20734d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
20834d14614SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
20934d14614SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
21034d14614SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
21134d14614SJeremy L Thompson   }
21234d14614SJeremy L Thompson 
21334d14614SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
21434d14614SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
21534d14614SJeremy L Thompson   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
21634d14614SJeremy L Thompson   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
21734d14614SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
21834d14614SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
21934d14614SJeremy L Thompson   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
22034d14614SJeremy L Thompson   const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
22134d14614SJeremy L Thompson   const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
22234d14614SJeremy L Thompson 
22334d14614SJeremy L Thompson   // Apply basis element by element
22434d14614SJeremy L Thompson   if (is_transpose) {
22534d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
22634d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
22734d14614SJeremy L Thompson         CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
22834d14614SJeremy L Thompson         CeedInt     pre   = 1;
22934d14614SJeremy L Thompson         CeedInt     post  = 1;
23034d14614SJeremy L Thompson 
23134d14614SJeremy L Thompson         // Clear Chebyshev coeffs
23234d14614SJeremy L Thompson         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
23334d14614SJeremy L Thompson           s_chebyshev_coeffs[k] = 0.0;
23434d14614SJeremy L Thompson         }
23534d14614SJeremy L Thompson 
23634d14614SJeremy L Thompson         // Map from point
2372d10e82cSJeremy L Thompson         __syncthreads();
2382d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
23934d14614SJeremy L Thompson           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
24034d14614SJeremy L Thompson             const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
24134d14614SJeremy L Thompson 
24234d14614SJeremy L Thompson             pre  = 1;
24334d14614SJeremy L Thompson             post = 1;
24434d14614SJeremy L Thompson             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
24534d14614SJeremy L Thompson               // Update buffers used
24634d14614SJeremy L Thompson               pre /= 1;
24734d14614SJeremy L Thompson               const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
24834d14614SJeremy L Thompson               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
24934d14614SJeremy L Thompson 
25034d14614SJeremy L Thompson               // Build Chebyshev polynomial values
25134d14614SJeremy L Thompson               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
25234d14614SJeremy L Thompson               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
25334d14614SJeremy L Thompson 
25434d14614SJeremy L Thompson               // Contract along middle index
25534d14614SJeremy L Thompson               for (CeedInt a = 0; a < pre; a++) {
25634d14614SJeremy L Thompson                 for (CeedInt c = 0; c < post; c++) {
25734d14614SJeremy L Thompson                   if (dim_2 == BASIS_DIM - 1) {
2582d10e82cSJeremy L Thompson                     for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]);
25934d14614SJeremy L Thompson                   } else {
26034d14614SJeremy L Thompson                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
26134d14614SJeremy L Thompson                   }
26234d14614SJeremy L Thompson                 }
26334d14614SJeremy L Thompson               }
26434d14614SJeremy L Thompson               post *= Q;
26534d14614SJeremy L Thompson             }
26634d14614SJeremy L Thompson           }
26734d14614SJeremy L Thompson         }
26834d14614SJeremy L Thompson 
26934d14614SJeremy L Thompson         // Map from coefficients
27034d14614SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
27134d14614SJeremy L Thompson         post = 1;
27234d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
27334d14614SJeremy L Thompson           __syncthreads();
27434d14614SJeremy L Thompson           // Update buffers used
27534d14614SJeremy L Thompson           pre /= Q;
27634d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
27734d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
27834d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * P;
27934d14614SJeremy L Thompson 
28034d14614SJeremy L Thompson           // Contract along middle index
28134d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
28234d14614SJeremy L Thompson             const CeedInt c   = k % post;
28334d14614SJeremy L Thompson             const CeedInt j   = (k / post) % P;
28434d14614SJeremy L Thompson             const CeedInt a   = k / (post * P);
28534d14614SJeremy L Thompson             CeedScalar    v_k = 0;
28634d14614SJeremy L Thompson 
28734d14614SJeremy L Thompson             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
28834d14614SJeremy L Thompson             out[k] = v_k;
28934d14614SJeremy L Thompson           }
29034d14614SJeremy L Thompson           post *= P;
29134d14614SJeremy L Thompson         }
29234d14614SJeremy L Thompson       }
29334d14614SJeremy L Thompson     }
29434d14614SJeremy L Thompson   } else {
29534d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
29634d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
29734d14614SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
29834d14614SJeremy L Thompson         CeedInt           pre   = u_size;
29934d14614SJeremy L Thompson         CeedInt           post  = 1;
30034d14614SJeremy L Thompson 
30134d14614SJeremy L Thompson         // Map to coefficients
30234d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
30334d14614SJeremy L Thompson           __syncthreads();
30434d14614SJeremy L Thompson           // Update buffers used
30534d14614SJeremy L Thompson           pre /= P;
30634d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
30734d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
30834d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * Q;
30934d14614SJeremy L Thompson 
31034d14614SJeremy L Thompson           // Contract along middle index
31134d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
31234d14614SJeremy L Thompson             const CeedInt c   = k % post;
31334d14614SJeremy L Thompson             const CeedInt j   = (k / post) % Q;
31434d14614SJeremy L Thompson             const CeedInt a   = k / (post * Q);
31534d14614SJeremy L Thompson             CeedScalar    v_k = 0;
31634d14614SJeremy L Thompson 
31734d14614SJeremy L Thompson             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
31834d14614SJeremy L Thompson             out[k] = v_k;
31934d14614SJeremy L Thompson           }
32034d14614SJeremy L Thompson           post *= Q;
32134d14614SJeremy L Thompson         }
32234d14614SJeremy L Thompson 
32334d14614SJeremy L Thompson         // Map to point
32434d14614SJeremy L Thompson         __syncthreads();
3252d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
32634d14614SJeremy L Thompson           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
32734d14614SJeremy L Thompson             CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
32834d14614SJeremy L Thompson 
32934d14614SJeremy L Thompson             pre  = BASIS_NUM_QPTS;
33034d14614SJeremy L Thompson             post = 1;
33134d14614SJeremy L Thompson             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
33234d14614SJeremy L Thompson               // Update buffers used
33334d14614SJeremy L Thompson               pre /= Q;
33434d14614SJeremy L Thompson               const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
33534d14614SJeremy L Thompson               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
33634d14614SJeremy L Thompson 
33734d14614SJeremy L Thompson               // Build Chebyshev polynomial values
33834d14614SJeremy L Thompson               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
33934d14614SJeremy L Thompson               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
34034d14614SJeremy L Thompson 
34134d14614SJeremy L Thompson               // Contract along middle index
34234d14614SJeremy L Thompson               for (CeedInt a = 0; a < pre; a++) {
34334d14614SJeremy L Thompson                 for (CeedInt c = 0; c < post; c++) {
34434d14614SJeremy L Thompson                   CeedScalar v_k = 0;
34534d14614SJeremy L Thompson 
34634d14614SJeremy L Thompson                   for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
34734d14614SJeremy L Thompson                   out[a * post + c] = v_k;
34834d14614SJeremy L Thompson                 }
34934d14614SJeremy L Thompson               }
35034d14614SJeremy L Thompson               post *= 1;
35134d14614SJeremy L Thompson             }
35234d14614SJeremy L Thompson           }
35334d14614SJeremy L Thompson         }
35434d14614SJeremy L Thompson       }
35534d14614SJeremy L Thompson     }
35634d14614SJeremy L Thompson   }
35734d14614SJeremy L Thompson }
358