xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h (revision 81ae61599cc0e14ccce523d66e38bf75c6f7903c)
134d14614SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
234d14614SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
334d14614SJeremy L Thompson //
434d14614SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
534d14614SJeremy L Thompson //
634d14614SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
734d14614SJeremy L Thompson 
834d14614SJeremy L Thompson /// @file
934d14614SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
1134d14614SJeremy L Thompson 
1234d14614SJeremy L Thompson //------------------------------------------------------------------------------
1334d14614SJeremy L Thompson // Chebyshev values
1434d14614SJeremy L Thompson //------------------------------------------------------------------------------
1534d14614SJeremy L Thompson template <int Q_1D>
1634d14614SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
1734d14614SJeremy L Thompson   chebyshev_x[0] = 1.0;
1834d14614SJeremy L Thompson   chebyshev_x[1] = 2 * x;
1934d14614SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
2034d14614SJeremy L Thompson }
2134d14614SJeremy L Thompson 
2234d14614SJeremy L Thompson template <int Q_1D>
2334d14614SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
2434d14614SJeremy L Thompson   CeedScalar chebyshev_x[3];
2534d14614SJeremy L Thompson 
2634d14614SJeremy L Thompson   chebyshev_x[1]  = 1.0;
2734d14614SJeremy L Thompson   chebyshev_x[2]  = 2 * x;
2834d14614SJeremy L Thompson   chebyshev_dx[0] = 0.0;
2934d14614SJeremy L Thompson   chebyshev_dx[1] = 2.0;
3034d14614SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) {
3180c135a8SJeremy L Thompson     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
3280c135a8SJeremy L Thompson     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
3334d14614SJeremy L Thompson   }
3434d14614SJeremy L Thompson }
3534d14614SJeremy L Thompson 
3634d14614SJeremy L Thompson //------------------------------------------------------------------------------
3734d14614SJeremy L Thompson // Tensor Basis Kernels AtPoints
3834d14614SJeremy L Thompson //------------------------------------------------------------------------------
3934d14614SJeremy L Thompson 
4034d14614SJeremy L Thompson //------------------------------------------------------------------------------
4134d14614SJeremy L Thompson // Interp
4234d14614SJeremy L Thompson //------------------------------------------------------------------------------
43*81ae6159SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
44111870feSJeremy L Thompson                                           const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
45111870feSJeremy L Thompson                                           const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
4634d14614SJeremy L Thompson   const CeedInt i = threadIdx.x;
4734d14614SJeremy L Thompson 
48f7c9815fSJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
4934d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
5034d14614SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
5134d14614SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
5234d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
5334d14614SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
5434d14614SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
5534d14614SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
5634d14614SJeremy L Thompson   }
5734d14614SJeremy L Thompson 
5834d14614SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
5934d14614SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
60*81ae6159SJeremy L Thompson   const CeedInt u_stride      = BASIS_NUM_NODES;
61*81ae6159SJeremy L Thompson   const CeedInt v_stride      = BASIS_NUM_PTS;
62*81ae6159SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
63*81ae6159SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
64*81ae6159SJeremy L Thompson   const CeedInt u_size        = BASIS_NUM_NODES;
6534d14614SJeremy L Thompson 
6634d14614SJeremy L Thompson   // Apply basis element by element
67*81ae6159SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
68*81ae6159SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
69*81ae6159SJeremy L Thompson       const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
70*81ae6159SJeremy L Thompson       CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
71*81ae6159SJeremy L Thompson       CeedInt           pre   = u_size;
72*81ae6159SJeremy L Thompson       CeedInt           post  = 1;
73*81ae6159SJeremy L Thompson 
74*81ae6159SJeremy L Thompson       // Map to coefficients
75*81ae6159SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
76*81ae6159SJeremy L Thompson         __syncthreads();
77*81ae6159SJeremy L Thompson         // Update buffers used
78*81ae6159SJeremy L Thompson         pre /= P;
79*81ae6159SJeremy L Thompson         const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
80*81ae6159SJeremy L Thompson         CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
81*81ae6159SJeremy L Thompson         const CeedInt     writeLen = pre * post * Q;
82*81ae6159SJeremy L Thompson 
83*81ae6159SJeremy L Thompson         // Contract along middle index
84*81ae6159SJeremy L Thompson         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
85*81ae6159SJeremy L Thompson           const CeedInt c   = k % post;
86*81ae6159SJeremy L Thompson           const CeedInt j   = (k / post) % Q;
87*81ae6159SJeremy L Thompson           const CeedInt a   = k / (post * Q);
88*81ae6159SJeremy L Thompson           CeedScalar    v_k = 0;
89*81ae6159SJeremy L Thompson 
90*81ae6159SJeremy L Thompson           for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
91*81ae6159SJeremy L Thompson           out[k] = v_k;
92*81ae6159SJeremy L Thompson         }
93*81ae6159SJeremy L Thompson         post *= Q;
94*81ae6159SJeremy L Thompson       }
95*81ae6159SJeremy L Thompson 
96*81ae6159SJeremy L Thompson       // Map to point
97*81ae6159SJeremy L Thompson       __syncthreads();
98*81ae6159SJeremy L Thompson       for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
99*81ae6159SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
100*81ae6159SJeremy L Thompson         post = 1;
101*81ae6159SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
102*81ae6159SJeremy L Thompson           // Update buffers used
103*81ae6159SJeremy L Thompson           pre /= Q;
104*81ae6159SJeremy L Thompson           const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
105*81ae6159SJeremy L Thompson           CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
106*81ae6159SJeremy L Thompson 
107*81ae6159SJeremy L Thompson           // Build Chebyshev polynomial values
108*81ae6159SJeremy L Thompson           ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
109*81ae6159SJeremy L Thompson 
110*81ae6159SJeremy L Thompson           // Contract along middle index
111*81ae6159SJeremy L Thompson           for (CeedInt a = 0; a < pre; a++) {
112*81ae6159SJeremy L Thompson             for (CeedInt c = 0; c < post; c++) {
113*81ae6159SJeremy L Thompson               CeedScalar v_k = 0;
114*81ae6159SJeremy L Thompson 
115*81ae6159SJeremy L Thompson               for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
116*81ae6159SJeremy L Thompson               out[a * post + c] = v_k;
117*81ae6159SJeremy L Thompson             }
118*81ae6159SJeremy L Thompson           }
119*81ae6159SJeremy L Thompson           post *= 1;
120*81ae6159SJeremy L Thompson         }
121*81ae6159SJeremy L Thompson       }
122*81ae6159SJeremy L Thompson     }
123*81ae6159SJeremy L Thompson   }
124*81ae6159SJeremy L Thompson }
125*81ae6159SJeremy L Thompson 
126*81ae6159SJeremy L Thompson extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
127*81ae6159SJeremy L Thompson                                                    const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
128*81ae6159SJeremy L Thompson                                                    const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
129*81ae6159SJeremy L Thompson   const CeedInt i = threadIdx.x;
130*81ae6159SJeremy L Thompson 
131*81ae6159SJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
132*81ae6159SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
133*81ae6159SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
134*81ae6159SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
135*81ae6159SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
136*81ae6159SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
137*81ae6159SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
138*81ae6159SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
139*81ae6159SJeremy L Thompson   }
140*81ae6159SJeremy L Thompson 
141*81ae6159SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
142*81ae6159SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
143*81ae6159SJeremy L Thompson   const CeedInt u_stride      = BASIS_NUM_PTS;
144*81ae6159SJeremy L Thompson   const CeedInt v_stride      = BASIS_NUM_NODES;
145*81ae6159SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
146*81ae6159SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
147*81ae6159SJeremy L Thompson   const CeedInt u_size        = BASIS_NUM_PTS;
148*81ae6159SJeremy L Thompson 
149*81ae6159SJeremy L Thompson   // Apply basis element by element
15034d14614SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
15134d14614SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
152db2becc9SJeremy L Thompson       const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
153db2becc9SJeremy L Thompson       CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
15434d14614SJeremy L Thompson       CeedInt           pre   = 1;
15534d14614SJeremy L Thompson       CeedInt           post  = 1;
15634d14614SJeremy L Thompson 
15734d14614SJeremy L Thompson       // Clear Chebyshev coeffs
15834d14614SJeremy L Thompson       for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
15934d14614SJeremy L Thompson         s_chebyshev_coeffs[k] = 0.0;
16034d14614SJeremy L Thompson       }
16134d14614SJeremy L Thompson 
16234d14614SJeremy L Thompson       // Map from point
1632d10e82cSJeremy L Thompson       __syncthreads();
1642d10e82cSJeremy L Thompson       for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
165111870feSJeremy L Thompson         if (p >= points_per_elem[elem]) continue;
16634d14614SJeremy L Thompson         pre  = 1;
16734d14614SJeremy L Thompson         post = 1;
16834d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
16934d14614SJeremy L Thompson           // Update buffers used
17034d14614SJeremy L Thompson           pre /= 1;
171db2becc9SJeremy L Thompson           const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
17234d14614SJeremy L Thompson           CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
17334d14614SJeremy L Thompson 
17434d14614SJeremy L Thompson           // Build Chebyshev polynomial values
17534d14614SJeremy L Thompson           ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
17634d14614SJeremy L Thompson 
17734d14614SJeremy L Thompson           // Contract along middle index
17834d14614SJeremy L Thompson           for (CeedInt a = 0; a < pre; a++) {
17934d14614SJeremy L Thompson             for (CeedInt c = 0; c < post; c++) {
18034d14614SJeremy L Thompson               if (d == BASIS_DIM - 1) {
181ad8059fcSJeremy L Thompson                 for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
18234d14614SJeremy L Thompson               } else {
18334d14614SJeremy L Thompson                 for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
18434d14614SJeremy L Thompson               }
18534d14614SJeremy L Thompson             }
18634d14614SJeremy L Thompson           }
18734d14614SJeremy L Thompson           post *= Q;
18834d14614SJeremy L Thompson         }
18934d14614SJeremy L Thompson       }
19034d14614SJeremy L Thompson 
19134d14614SJeremy L Thompson       // Map from coefficients
19234d14614SJeremy L Thompson       pre  = BASIS_NUM_QPTS;
19334d14614SJeremy L Thompson       post = 1;
19434d14614SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
19534d14614SJeremy L Thompson         __syncthreads();
19634d14614SJeremy L Thompson         // Update buffers used
19734d14614SJeremy L Thompson         pre /= Q;
19834d14614SJeremy L Thompson         const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
19934d14614SJeremy L Thompson         CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
20034d14614SJeremy L Thompson         const CeedInt     writeLen = pre * post * P;
20134d14614SJeremy L Thompson 
20234d14614SJeremy L Thompson         // Contract along middle index
20334d14614SJeremy L Thompson         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
20434d14614SJeremy L Thompson           const CeedInt c   = k % post;
20534d14614SJeremy L Thompson           const CeedInt j   = (k / post) % P;
20634d14614SJeremy L Thompson           const CeedInt a   = k / (post * P);
20734d14614SJeremy L Thompson           CeedScalar    v_k = 0;
20834d14614SJeremy L Thompson 
20934d14614SJeremy L Thompson           for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
210db2becc9SJeremy L Thompson           if (d == BASIS_DIM - 1) out[k] += v_k;
211db2becc9SJeremy L Thompson           else out[k] = v_k;
21234d14614SJeremy L Thompson         }
21334d14614SJeremy L Thompson         post *= P;
21434d14614SJeremy L Thompson       }
21534d14614SJeremy L Thompson     }
21634d14614SJeremy L Thompson   }
217*81ae6159SJeremy L Thompson }
218*81ae6159SJeremy L Thompson 
219*81ae6159SJeremy L Thompson //------------------------------------------------------------------------------
220*81ae6159SJeremy L Thompson // Grad
221*81ae6159SJeremy L Thompson //------------------------------------------------------------------------------
222*81ae6159SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
223*81ae6159SJeremy L Thompson                                         const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
224*81ae6159SJeremy L Thompson                                         const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
225*81ae6159SJeremy L Thompson   const CeedInt i = threadIdx.x;
226*81ae6159SJeremy L Thompson 
227*81ae6159SJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
228*81ae6159SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
229*81ae6159SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
230*81ae6159SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
231*81ae6159SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
232*81ae6159SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
233*81ae6159SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
234*81ae6159SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
235*81ae6159SJeremy L Thompson   }
236*81ae6159SJeremy L Thompson 
237*81ae6159SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
238*81ae6159SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
239*81ae6159SJeremy L Thompson   const CeedInt u_stride      = BASIS_NUM_NODES;
240*81ae6159SJeremy L Thompson   const CeedInt v_stride      = BASIS_NUM_PTS;
241*81ae6159SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
242*81ae6159SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
243*81ae6159SJeremy L Thompson   const CeedInt u_size        = BASIS_NUM_NODES;
244*81ae6159SJeremy L Thompson   const CeedInt u_dim_stride  = 0;
245*81ae6159SJeremy L Thompson   const CeedInt v_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
246*81ae6159SJeremy L Thompson 
247*81ae6159SJeremy L Thompson   // Apply basis element by element
24834d14614SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
24934d14614SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
250db2becc9SJeremy L Thompson       const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
25134d14614SJeremy L Thompson       CeedInt           pre   = u_size;
25234d14614SJeremy L Thompson       CeedInt           post  = 1;
25334d14614SJeremy L Thompson 
25434d14614SJeremy L Thompson       // Map to coefficients
25534d14614SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
25634d14614SJeremy L Thompson         __syncthreads();
25734d14614SJeremy L Thompson         // Update buffers used
25834d14614SJeremy L Thompson         pre /= P;
25934d14614SJeremy L Thompson         const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
26034d14614SJeremy L Thompson         CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
26134d14614SJeremy L Thompson         const CeedInt     writeLen = pre * post * Q;
26234d14614SJeremy L Thompson 
26334d14614SJeremy L Thompson         // Contract along middle index
26434d14614SJeremy L Thompson         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
26534d14614SJeremy L Thompson           const CeedInt c   = k % post;
26634d14614SJeremy L Thompson           const CeedInt j   = (k / post) % Q;
26734d14614SJeremy L Thompson           const CeedInt a   = k / (post * Q);
26834d14614SJeremy L Thompson           CeedScalar    v_k = 0;
26934d14614SJeremy L Thompson 
27034d14614SJeremy L Thompson           for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
27134d14614SJeremy L Thompson           out[k] = v_k;
27234d14614SJeremy L Thompson         }
27334d14614SJeremy L Thompson         post *= Q;
27434d14614SJeremy L Thompson       }
27534d14614SJeremy L Thompson 
27634d14614SJeremy L Thompson       // Map to point
27734d14614SJeremy L Thompson       __syncthreads();
2782d10e82cSJeremy L Thompson       for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
279*81ae6159SJeremy L Thompson         for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
280*81ae6159SJeremy L Thompson           CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
281*81ae6159SJeremy L Thompson 
28234d14614SJeremy L Thompson           pre  = BASIS_NUM_QPTS;
28334d14614SJeremy L Thompson           post = 1;
284*81ae6159SJeremy L Thompson           for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
28534d14614SJeremy L Thompson             // Update buffers used
28634d14614SJeremy L Thompson             pre /= Q;
287*81ae6159SJeremy L Thompson             const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
288*81ae6159SJeremy L Thompson             CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
28934d14614SJeremy L Thompson 
29034d14614SJeremy L Thompson             // Build Chebyshev polynomial values
291*81ae6159SJeremy L Thompson             if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
292*81ae6159SJeremy L Thompson             else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
29334d14614SJeremy L Thompson 
29434d14614SJeremy L Thompson             // Contract along middle index
29534d14614SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
29634d14614SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
29734d14614SJeremy L Thompson                 CeedScalar v_k = 0;
29834d14614SJeremy L Thompson 
29934d14614SJeremy L Thompson                 for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
30034d14614SJeremy L Thompson                 out[a * post + c] = v_k;
30134d14614SJeremy L Thompson               }
30234d14614SJeremy L Thompson             }
30334d14614SJeremy L Thompson             post *= 1;
30434d14614SJeremy L Thompson           }
30534d14614SJeremy L Thompson         }
30634d14614SJeremy L Thompson       }
30734d14614SJeremy L Thompson     }
30834d14614SJeremy L Thompson   }
30934d14614SJeremy L Thompson }
31034d14614SJeremy L Thompson 
311*81ae6159SJeremy L Thompson extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
312111870feSJeremy L Thompson                                                  const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
313111870feSJeremy L Thompson                                                  const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
31434d14614SJeremy L Thompson   const CeedInt i = threadIdx.x;
31534d14614SJeremy L Thompson 
316f7c9815fSJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
31734d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
31834d14614SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
31934d14614SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
32034d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
32134d14614SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
32234d14614SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
32334d14614SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
32434d14614SJeremy L Thompson   }
32534d14614SJeremy L Thompson 
32634d14614SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
32734d14614SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
328*81ae6159SJeremy L Thompson   const CeedInt u_stride      = BASIS_NUM_PTS;
329*81ae6159SJeremy L Thompson   const CeedInt v_stride      = BASIS_NUM_NODES;
330*81ae6159SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
331*81ae6159SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
332*81ae6159SJeremy L Thompson   const CeedInt u_size        = BASIS_NUM_PTS;
333*81ae6159SJeremy L Thompson   const CeedInt u_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
334*81ae6159SJeremy L Thompson   const CeedInt v_dim_stride  = 0;
33534d14614SJeremy L Thompson 
33634d14614SJeremy L Thompson   // Apply basis element by element
33734d14614SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
33834d14614SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
339db2becc9SJeremy L Thompson       CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
34034d14614SJeremy L Thompson       CeedInt     pre   = 1;
34134d14614SJeremy L Thompson       CeedInt     post  = 1;
34234d14614SJeremy L Thompson 
34334d14614SJeremy L Thompson       // Clear Chebyshev coeffs
34434d14614SJeremy L Thompson       for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
34534d14614SJeremy L Thompson         s_chebyshev_coeffs[k] = 0.0;
34634d14614SJeremy L Thompson       }
34734d14614SJeremy L Thompson 
34834d14614SJeremy L Thompson       // Map from point
3492d10e82cSJeremy L Thompson       __syncthreads();
3502d10e82cSJeremy L Thompson       for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
351111870feSJeremy L Thompson         if (p >= points_per_elem[elem]) continue;
35234d14614SJeremy L Thompson         for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
353db2becc9SJeremy L Thompson           const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
35434d14614SJeremy L Thompson 
35534d14614SJeremy L Thompson           pre  = 1;
35634d14614SJeremy L Thompson           post = 1;
35734d14614SJeremy L Thompson           for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
35834d14614SJeremy L Thompson             // Update buffers used
35934d14614SJeremy L Thompson             pre /= 1;
36034d14614SJeremy L Thompson             const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
36134d14614SJeremy L Thompson             CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
36234d14614SJeremy L Thompson 
36334d14614SJeremy L Thompson             // Build Chebyshev polynomial values
36434d14614SJeremy L Thompson             if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
36534d14614SJeremy L Thompson             else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
36634d14614SJeremy L Thompson 
36734d14614SJeremy L Thompson             // Contract along middle index
36834d14614SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
36934d14614SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
37034d14614SJeremy L Thompson                 if (dim_2 == BASIS_DIM - 1) {
371ad8059fcSJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
37234d14614SJeremy L Thompson                 } else {
37334d14614SJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
37434d14614SJeremy L Thompson                 }
37534d14614SJeremy L Thompson               }
37634d14614SJeremy L Thompson             }
37734d14614SJeremy L Thompson             post *= Q;
37834d14614SJeremy L Thompson           }
37934d14614SJeremy L Thompson         }
38034d14614SJeremy L Thompson       }
38134d14614SJeremy L Thompson 
38234d14614SJeremy L Thompson       // Map from coefficients
38334d14614SJeremy L Thompson       pre  = BASIS_NUM_QPTS;
38434d14614SJeremy L Thompson       post = 1;
38534d14614SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
38634d14614SJeremy L Thompson         __syncthreads();
38734d14614SJeremy L Thompson         // Update buffers used
38834d14614SJeremy L Thompson         pre /= Q;
38934d14614SJeremy L Thompson         const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
39034d14614SJeremy L Thompson         CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
39134d14614SJeremy L Thompson         const CeedInt     writeLen = pre * post * P;
39234d14614SJeremy L Thompson 
39334d14614SJeremy L Thompson         // Contract along middle index
39434d14614SJeremy L Thompson         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
39534d14614SJeremy L Thompson           const CeedInt c   = k % post;
39634d14614SJeremy L Thompson           const CeedInt j   = (k / post) % P;
39734d14614SJeremy L Thompson           const CeedInt a   = k / (post * P);
39834d14614SJeremy L Thompson           CeedScalar    v_k = 0;
39934d14614SJeremy L Thompson 
40034d14614SJeremy L Thompson           for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
401db2becc9SJeremy L Thompson           if (d == BASIS_DIM - 1) out[k] += v_k;
402db2becc9SJeremy L Thompson           else out[k] = v_k;
40334d14614SJeremy L Thompson         }
40434d14614SJeremy L Thompson         post *= P;
40534d14614SJeremy L Thompson       }
40634d14614SJeremy L Thompson     }
40734d14614SJeremy L Thompson   }
40834d14614SJeremy L Thompson }
409