xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h (revision 80c135a87dd608e39d180e7bb5c260aa9fcc10a1)
134d14614SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
234d14614SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
334d14614SJeremy L Thompson //
434d14614SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
534d14614SJeremy L Thompson //
634d14614SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
734d14614SJeremy L Thompson 
834d14614SJeremy L Thompson /// @file
934d14614SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation
1034d14614SJeremy L Thompson 
1134d14614SJeremy L Thompson #include <ceed.h>
1234d14614SJeremy L Thompson 
1334d14614SJeremy L Thompson //------------------------------------------------------------------------------
1434d14614SJeremy L Thompson // Chebyshev values
1534d14614SJeremy L Thompson //------------------------------------------------------------------------------
1634d14614SJeremy L Thompson template <int Q_1D>
1734d14614SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
1834d14614SJeremy L Thompson   chebyshev_x[0] = 1.0;
1934d14614SJeremy L Thompson   chebyshev_x[1] = 2 * x;
2034d14614SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
2134d14614SJeremy L Thompson }
2234d14614SJeremy L Thompson 
2334d14614SJeremy L Thompson template <int Q_1D>
2434d14614SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
2534d14614SJeremy L Thompson   CeedScalar chebyshev_x[3];
2634d14614SJeremy L Thompson 
2734d14614SJeremy L Thompson   chebyshev_x[1]  = 1.0;
2834d14614SJeremy L Thompson   chebyshev_x[2]  = 2 * x;
2934d14614SJeremy L Thompson   chebyshev_dx[0] = 0.0;
3034d14614SJeremy L Thompson   chebyshev_dx[1] = 2.0;
3134d14614SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) {
32*80c135a8SJeremy L Thompson     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
33*80c135a8SJeremy L Thompson     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
3434d14614SJeremy L Thompson   }
3534d14614SJeremy L Thompson }
3634d14614SJeremy L Thompson 
3734d14614SJeremy L Thompson //------------------------------------------------------------------------------
3834d14614SJeremy L Thompson // Tensor Basis Kernels AtPoints
3934d14614SJeremy L Thompson //------------------------------------------------------------------------------
4034d14614SJeremy L Thompson 
4134d14614SJeremy L Thompson //------------------------------------------------------------------------------
4234d14614SJeremy L Thompson // Interp
4334d14614SJeremy L Thompson //------------------------------------------------------------------------------
4434d14614SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
4534d14614SJeremy L Thompson                                           const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
4634d14614SJeremy L Thompson   const CeedInt i = threadIdx.x;
4734d14614SJeremy L Thompson 
48f7c9815fSJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
4934d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
5034d14614SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
5134d14614SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
5234d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
5334d14614SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
5434d14614SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
5534d14614SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
5634d14614SJeremy L Thompson   }
5734d14614SJeremy L Thompson 
5834d14614SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
5934d14614SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
6034d14614SJeremy L Thompson   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
6134d14614SJeremy L Thompson   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
6234d14614SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
6334d14614SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
6434d14614SJeremy L Thompson   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
6534d14614SJeremy L Thompson 
6634d14614SJeremy L Thompson   // Apply basis element by element
6734d14614SJeremy L Thompson   if (is_transpose) {
6834d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
6934d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
7034d14614SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
7134d14614SJeremy L Thompson         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
7234d14614SJeremy L Thompson         CeedInt           pre   = 1;
7334d14614SJeremy L Thompson         CeedInt           post  = 1;
7434d14614SJeremy L Thompson 
7534d14614SJeremy L Thompson         // Clear Chebyshev coeffs
7634d14614SJeremy L Thompson         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
7734d14614SJeremy L Thompson           s_chebyshev_coeffs[k] = 0.0;
7834d14614SJeremy L Thompson         }
7934d14614SJeremy L Thompson 
8034d14614SJeremy L Thompson         // Map from point
812d10e82cSJeremy L Thompson         __syncthreads();
822d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
8334d14614SJeremy L Thompson           pre  = 1;
8434d14614SJeremy L Thompson           post = 1;
8534d14614SJeremy L Thompson           for (CeedInt d = 0; d < BASIS_DIM; d++) {
8634d14614SJeremy L Thompson             // Update buffers used
8734d14614SJeremy L Thompson             pre /= 1;
8834d14614SJeremy L Thompson             const CeedScalar *in  = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1);
8934d14614SJeremy L Thompson             CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
9034d14614SJeremy L Thompson 
9134d14614SJeremy L Thompson             // Build Chebyshev polynomial values
9234d14614SJeremy L Thompson             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
9334d14614SJeremy L Thompson 
9434d14614SJeremy L Thompson             // Contract along middle index
9534d14614SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
9634d14614SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
9734d14614SJeremy L Thompson                 if (d == BASIS_DIM - 1) {
98ad8059fcSJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
9934d14614SJeremy L Thompson                 } else {
10034d14614SJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
10134d14614SJeremy L Thompson                 }
10234d14614SJeremy L Thompson               }
10334d14614SJeremy L Thompson             }
10434d14614SJeremy L Thompson             post *= Q;
10534d14614SJeremy L Thompson           }
10634d14614SJeremy L Thompson         }
10734d14614SJeremy L Thompson 
10834d14614SJeremy L Thompson         // Map from coefficients
10934d14614SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
11034d14614SJeremy L Thompson         post = 1;
11134d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
11234d14614SJeremy L Thompson           __syncthreads();
11334d14614SJeremy L Thompson           // Update buffers used
11434d14614SJeremy L Thompson           pre /= Q;
11534d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
11634d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
11734d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * P;
11834d14614SJeremy L Thompson 
11934d14614SJeremy L Thompson           // Contract along middle index
12034d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
12134d14614SJeremy L Thompson             const CeedInt c   = k % post;
12234d14614SJeremy L Thompson             const CeedInt j   = (k / post) % P;
12334d14614SJeremy L Thompson             const CeedInt a   = k / (post * P);
12434d14614SJeremy L Thompson             CeedScalar    v_k = 0;
12534d14614SJeremy L Thompson 
12634d14614SJeremy L Thompson             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
12734d14614SJeremy L Thompson             out[k] = v_k;
12834d14614SJeremy L Thompson           }
12934d14614SJeremy L Thompson           post *= P;
13034d14614SJeremy L Thompson         }
13134d14614SJeremy L Thompson       }
13234d14614SJeremy L Thompson     }
13334d14614SJeremy L Thompson   } else {
13434d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
13534d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
13634d14614SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
13734d14614SJeremy L Thompson         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
13834d14614SJeremy L Thompson         CeedInt           pre   = u_size;
13934d14614SJeremy L Thompson         CeedInt           post  = 1;
14034d14614SJeremy L Thompson 
14134d14614SJeremy L Thompson         // Map to coefficients
14234d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
14334d14614SJeremy L Thompson           __syncthreads();
14434d14614SJeremy L Thompson           // Update buffers used
14534d14614SJeremy L Thompson           pre /= P;
14634d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
14734d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
14834d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * Q;
14934d14614SJeremy L Thompson 
15034d14614SJeremy L Thompson           // Contract along middle index
15134d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
15234d14614SJeremy L Thompson             const CeedInt c   = k % post;
15334d14614SJeremy L Thompson             const CeedInt j   = (k / post) % Q;
15434d14614SJeremy L Thompson             const CeedInt a   = k / (post * Q);
15534d14614SJeremy L Thompson             CeedScalar    v_k = 0;
15634d14614SJeremy L Thompson 
15734d14614SJeremy L Thompson             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
15834d14614SJeremy L Thompson             out[k] = v_k;
15934d14614SJeremy L Thompson           }
16034d14614SJeremy L Thompson           post *= Q;
16134d14614SJeremy L Thompson         }
16234d14614SJeremy L Thompson 
16334d14614SJeremy L Thompson         // Map to point
16434d14614SJeremy L Thompson         __syncthreads();
1652d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
16634d14614SJeremy L Thompson           pre  = BASIS_NUM_QPTS;
16734d14614SJeremy L Thompson           post = 1;
16834d14614SJeremy L Thompson           for (CeedInt d = 0; d < BASIS_DIM; d++) {
16934d14614SJeremy L Thompson             // Update buffers used
17034d14614SJeremy L Thompson             pre /= Q;
17134d14614SJeremy L Thompson             const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
17234d14614SJeremy L Thompson             CeedScalar       *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2);
17334d14614SJeremy L Thompson 
17434d14614SJeremy L Thompson             // Build Chebyshev polynomial values
17534d14614SJeremy L Thompson             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
17634d14614SJeremy L Thompson 
17734d14614SJeremy L Thompson             // Contract along middle index
17834d14614SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
17934d14614SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
18034d14614SJeremy L Thompson                 CeedScalar v_k = 0;
18134d14614SJeremy L Thompson 
18234d14614SJeremy L Thompson                 for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
18334d14614SJeremy L Thompson                 out[a * post + c] = v_k;
18434d14614SJeremy L Thompson               }
18534d14614SJeremy L Thompson             }
18634d14614SJeremy L Thompson             post *= 1;
18734d14614SJeremy L Thompson           }
18834d14614SJeremy L Thompson         }
18934d14614SJeremy L Thompson       }
19034d14614SJeremy L Thompson     }
19134d14614SJeremy L Thompson   }
19234d14614SJeremy L Thompson }
19334d14614SJeremy L Thompson 
19434d14614SJeremy L Thompson //------------------------------------------------------------------------------
19534d14614SJeremy L Thompson // Grad
19634d14614SJeremy L Thompson //------------------------------------------------------------------------------
19734d14614SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
19834d14614SJeremy L Thompson                                         const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
19934d14614SJeremy L Thompson   const CeedInt i = threadIdx.x;
20034d14614SJeremy L Thompson 
201f7c9815fSJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
20234d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
20334d14614SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
20434d14614SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
20534d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
20634d14614SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
20734d14614SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
20834d14614SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
20934d14614SJeremy L Thompson   }
21034d14614SJeremy L Thompson 
21134d14614SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
21234d14614SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
21334d14614SJeremy L Thompson   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
21434d14614SJeremy L Thompson   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
21534d14614SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
21634d14614SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
21734d14614SJeremy L Thompson   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
21834d14614SJeremy L Thompson   const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
21934d14614SJeremy L Thompson   const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
22034d14614SJeremy L Thompson 
22134d14614SJeremy L Thompson   // Apply basis element by element
22234d14614SJeremy L Thompson   if (is_transpose) {
22334d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
22434d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
22534d14614SJeremy L Thompson         CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
22634d14614SJeremy L Thompson         CeedInt     pre   = 1;
22734d14614SJeremy L Thompson         CeedInt     post  = 1;
22834d14614SJeremy L Thompson 
22934d14614SJeremy L Thompson         // Clear Chebyshev coeffs
23034d14614SJeremy L Thompson         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
23134d14614SJeremy L Thompson           s_chebyshev_coeffs[k] = 0.0;
23234d14614SJeremy L Thompson         }
23334d14614SJeremy L Thompson 
23434d14614SJeremy L Thompson         // Map from point
2352d10e82cSJeremy L Thompson         __syncthreads();
2362d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
23734d14614SJeremy L Thompson           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
23834d14614SJeremy L Thompson             const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
23934d14614SJeremy L Thompson 
24034d14614SJeremy L Thompson             pre  = 1;
24134d14614SJeremy L Thompson             post = 1;
24234d14614SJeremy L Thompson             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
24334d14614SJeremy L Thompson               // Update buffers used
24434d14614SJeremy L Thompson               pre /= 1;
24534d14614SJeremy L Thompson               const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
24634d14614SJeremy L Thompson               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
24734d14614SJeremy L Thompson 
24834d14614SJeremy L Thompson               // Build Chebyshev polynomial values
24934d14614SJeremy L Thompson               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
25034d14614SJeremy L Thompson               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
25134d14614SJeremy L Thompson 
25234d14614SJeremy L Thompson               // Contract along middle index
25334d14614SJeremy L Thompson               for (CeedInt a = 0; a < pre; a++) {
25434d14614SJeremy L Thompson                 for (CeedInt c = 0; c < post; c++) {
25534d14614SJeremy L Thompson                   if (dim_2 == BASIS_DIM - 1) {
256ad8059fcSJeremy L Thompson                     for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
25734d14614SJeremy L Thompson                   } else {
25834d14614SJeremy L Thompson                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
25934d14614SJeremy L Thompson                   }
26034d14614SJeremy L Thompson                 }
26134d14614SJeremy L Thompson               }
26234d14614SJeremy L Thompson               post *= Q;
26334d14614SJeremy L Thompson             }
26434d14614SJeremy L Thompson           }
26534d14614SJeremy L Thompson         }
26634d14614SJeremy L Thompson 
26734d14614SJeremy L Thompson         // Map from coefficients
26834d14614SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
26934d14614SJeremy L Thompson         post = 1;
27034d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
27134d14614SJeremy L Thompson           __syncthreads();
27234d14614SJeremy L Thompson           // Update buffers used
27334d14614SJeremy L Thompson           pre /= Q;
27434d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
27534d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
27634d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * P;
27734d14614SJeremy L Thompson 
27834d14614SJeremy L Thompson           // Contract along middle index
27934d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
28034d14614SJeremy L Thompson             const CeedInt c   = k % post;
28134d14614SJeremy L Thompson             const CeedInt j   = (k / post) % P;
28234d14614SJeremy L Thompson             const CeedInt a   = k / (post * P);
28334d14614SJeremy L Thompson             CeedScalar    v_k = 0;
28434d14614SJeremy L Thompson 
28534d14614SJeremy L Thompson             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
28634d14614SJeremy L Thompson             out[k] = v_k;
28734d14614SJeremy L Thompson           }
28834d14614SJeremy L Thompson           post *= P;
28934d14614SJeremy L Thompson         }
29034d14614SJeremy L Thompson       }
29134d14614SJeremy L Thompson     }
29234d14614SJeremy L Thompson   } else {
29334d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
29434d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
29534d14614SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
29634d14614SJeremy L Thompson         CeedInt           pre   = u_size;
29734d14614SJeremy L Thompson         CeedInt           post  = 1;
29834d14614SJeremy L Thompson 
29934d14614SJeremy L Thompson         // Map to coefficients
30034d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
30134d14614SJeremy L Thompson           __syncthreads();
30234d14614SJeremy L Thompson           // Update buffers used
30334d14614SJeremy L Thompson           pre /= P;
30434d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
30534d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
30634d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * Q;
30734d14614SJeremy L Thompson 
30834d14614SJeremy L Thompson           // Contract along middle index
30934d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
31034d14614SJeremy L Thompson             const CeedInt c   = k % post;
31134d14614SJeremy L Thompson             const CeedInt j   = (k / post) % Q;
31234d14614SJeremy L Thompson             const CeedInt a   = k / (post * Q);
31334d14614SJeremy L Thompson             CeedScalar    v_k = 0;
31434d14614SJeremy L Thompson 
31534d14614SJeremy L Thompson             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
31634d14614SJeremy L Thompson             out[k] = v_k;
31734d14614SJeremy L Thompson           }
31834d14614SJeremy L Thompson           post *= Q;
31934d14614SJeremy L Thompson         }
32034d14614SJeremy L Thompson 
32134d14614SJeremy L Thompson         // Map to point
32234d14614SJeremy L Thompson         __syncthreads();
3232d10e82cSJeremy L Thompson         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
32434d14614SJeremy L Thompson           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
32534d14614SJeremy L Thompson             CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
32634d14614SJeremy L Thompson 
32734d14614SJeremy L Thompson             pre  = BASIS_NUM_QPTS;
32834d14614SJeremy L Thompson             post = 1;
32934d14614SJeremy L Thompson             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
33034d14614SJeremy L Thompson               // Update buffers used
33134d14614SJeremy L Thompson               pre /= Q;
33234d14614SJeremy L Thompson               const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
33334d14614SJeremy L Thompson               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
33434d14614SJeremy L Thompson 
33534d14614SJeremy L Thompson               // Build Chebyshev polynomial values
33634d14614SJeremy L Thompson               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
33734d14614SJeremy L Thompson               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
33834d14614SJeremy L Thompson 
33934d14614SJeremy L Thompson               // Contract along middle index
34034d14614SJeremy L Thompson               for (CeedInt a = 0; a < pre; a++) {
34134d14614SJeremy L Thompson                 for (CeedInt c = 0; c < post; c++) {
34234d14614SJeremy L Thompson                   CeedScalar v_k = 0;
34334d14614SJeremy L Thompson 
34434d14614SJeremy L Thompson                   for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
34534d14614SJeremy L Thompson                   out[a * post + c] = v_k;
34634d14614SJeremy L Thompson                 }
34734d14614SJeremy L Thompson               }
34834d14614SJeremy L Thompson               post *= 1;
34934d14614SJeremy L Thompson             }
35034d14614SJeremy L Thompson           }
35134d14614SJeremy L Thompson         }
35234d14614SJeremy L Thompson       }
35334d14614SJeremy L Thompson     }
35434d14614SJeremy L Thompson   }
35534d14614SJeremy L Thompson }
356