xref: /libCEED/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h (revision 1c21e86942aba8686e330a1715ed048a6983247b)
1*1c21e869SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2*1c21e869SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*1c21e869SJeremy L Thompson //
4*1c21e869SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5*1c21e869SJeremy L Thompson //
6*1c21e869SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7*1c21e869SJeremy L Thompson 
8*1c21e869SJeremy L Thompson /// @file
9*1c21e869SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation
10*1c21e869SJeremy L Thompson 
11*1c21e869SJeremy L Thompson #include <ceed.h>
12*1c21e869SJeremy L Thompson 
13*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------
14*1c21e869SJeremy L Thompson // Chebyshev values
15*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------
16*1c21e869SJeremy L Thompson template <int Q_1D>
17*1c21e869SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
18*1c21e869SJeremy L Thompson   chebyshev_x[0] = 1.0;
19*1c21e869SJeremy L Thompson   chebyshev_x[1] = 2 * x;
20*1c21e869SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
21*1c21e869SJeremy L Thompson }
22*1c21e869SJeremy L Thompson 
23*1c21e869SJeremy L Thompson template <int Q_1D>
24*1c21e869SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
25*1c21e869SJeremy L Thompson   CeedScalar chebyshev_x[3];
26*1c21e869SJeremy L Thompson 
27*1c21e869SJeremy L Thompson   chebyshev_x[1]  = 1.0;
28*1c21e869SJeremy L Thompson   chebyshev_x[2]  = 2 * x;
29*1c21e869SJeremy L Thompson   chebyshev_dx[0] = 0.0;
30*1c21e869SJeremy L Thompson   chebyshev_dx[1] = 2.0;
31*1c21e869SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) {
32*1c21e869SJeremy L Thompson     chebyshev_x[0]  = chebyshev_x[1];
33*1c21e869SJeremy L Thompson     chebyshev_x[1]  = chebyshev_x[2];
34*1c21e869SJeremy L Thompson     chebyshev_x[2]  = 2 * x * chebyshev_x[1] - chebyshev_x[0];
35*1c21e869SJeremy L Thompson     chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[1] - chebyshev_dx[i - 2];
36*1c21e869SJeremy L Thompson   }
37*1c21e869SJeremy L Thompson }
38*1c21e869SJeremy L Thompson 
39*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------
40*1c21e869SJeremy L Thompson // Tensor Basis Kernels AtPoints
41*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------
42*1c21e869SJeremy L Thompson 
43*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------
44*1c21e869SJeremy L Thompson // Interp
45*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------
46*1c21e869SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
47*1c21e869SJeremy L Thompson                                           const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
48*1c21e869SJeremy L Thompson   const CeedInt i = threadIdx.x;
49*1c21e869SJeremy L Thompson 
50*1c21e869SJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
51*1c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
52*1c21e869SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
53*1c21e869SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
54*1c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
55*1c21e869SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
56*1c21e869SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
57*1c21e869SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
58*1c21e869SJeremy L Thompson   }
59*1c21e869SJeremy L Thompson 
60*1c21e869SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
61*1c21e869SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
62*1c21e869SJeremy L Thompson   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
63*1c21e869SJeremy L Thompson   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
64*1c21e869SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
65*1c21e869SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
66*1c21e869SJeremy L Thompson   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
67*1c21e869SJeremy L Thompson 
68*1c21e869SJeremy L Thompson   // Apply basis element by element
69*1c21e869SJeremy L Thompson   if (is_transpose) {
70*1c21e869SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
71*1c21e869SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
72*1c21e869SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
73*1c21e869SJeremy L Thompson         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
74*1c21e869SJeremy L Thompson         CeedInt           pre   = 1;
75*1c21e869SJeremy L Thompson         CeedInt           post  = 1;
76*1c21e869SJeremy L Thompson 
77*1c21e869SJeremy L Thompson         // Clear Chebyshev coeffs
78*1c21e869SJeremy L Thompson         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
79*1c21e869SJeremy L Thompson           s_chebyshev_coeffs[k] = 0.0;
80*1c21e869SJeremy L Thompson         }
81*1c21e869SJeremy L Thompson 
82*1c21e869SJeremy L Thompson         // Map from point
83*1c21e869SJeremy L Thompson         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
84*1c21e869SJeremy L Thompson           pre  = 1;
85*1c21e869SJeremy L Thompson           post = 1;
86*1c21e869SJeremy L Thompson           for (CeedInt d = 0; d < BASIS_DIM; d++) {
87*1c21e869SJeremy L Thompson             // Update buffers used
88*1c21e869SJeremy L Thompson             pre /= 1;
89*1c21e869SJeremy L Thompson             const CeedScalar *in  = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1);
90*1c21e869SJeremy L Thompson             CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
91*1c21e869SJeremy L Thompson 
92*1c21e869SJeremy L Thompson             // Build Chebyshev polynomial values
93*1c21e869SJeremy L Thompson             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
94*1c21e869SJeremy L Thompson 
95*1c21e869SJeremy L Thompson             // Contract along middle index
96*1c21e869SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
97*1c21e869SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
98*1c21e869SJeremy L Thompson                 if (d == BASIS_DIM - 1) {
99*1c21e869SJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
100*1c21e869SJeremy L Thompson                 } else {
101*1c21e869SJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
102*1c21e869SJeremy L Thompson                 }
103*1c21e869SJeremy L Thompson               }
104*1c21e869SJeremy L Thompson             }
105*1c21e869SJeremy L Thompson             post *= Q;
106*1c21e869SJeremy L Thompson           }
107*1c21e869SJeremy L Thompson         }
108*1c21e869SJeremy L Thompson 
109*1c21e869SJeremy L Thompson         // Map from coefficients
110*1c21e869SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
111*1c21e869SJeremy L Thompson         post = 1;
112*1c21e869SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
113*1c21e869SJeremy L Thompson           __syncthreads();
114*1c21e869SJeremy L Thompson           // Update buffers used
115*1c21e869SJeremy L Thompson           pre /= Q;
116*1c21e869SJeremy L Thompson           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
117*1c21e869SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
118*1c21e869SJeremy L Thompson           const CeedInt     writeLen = pre * post * P;
119*1c21e869SJeremy L Thompson 
120*1c21e869SJeremy L Thompson           // Contract along middle index
121*1c21e869SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
122*1c21e869SJeremy L Thompson             const CeedInt c   = k % post;
123*1c21e869SJeremy L Thompson             const CeedInt j   = (k / post) % P;
124*1c21e869SJeremy L Thompson             const CeedInt a   = k / (post * P);
125*1c21e869SJeremy L Thompson             CeedScalar    v_k = 0;
126*1c21e869SJeremy L Thompson 
127*1c21e869SJeremy L Thompson             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
128*1c21e869SJeremy L Thompson             out[k] = v_k;
129*1c21e869SJeremy L Thompson           }
130*1c21e869SJeremy L Thompson           post *= P;
131*1c21e869SJeremy L Thompson         }
132*1c21e869SJeremy L Thompson       }
133*1c21e869SJeremy L Thompson     }
134*1c21e869SJeremy L Thompson   } else {
135*1c21e869SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
136*1c21e869SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
137*1c21e869SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
138*1c21e869SJeremy L Thompson         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
139*1c21e869SJeremy L Thompson         CeedInt           pre   = u_size;
140*1c21e869SJeremy L Thompson         CeedInt           post  = 1;
141*1c21e869SJeremy L Thompson 
142*1c21e869SJeremy L Thompson         // Map to coefficients
143*1c21e869SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
144*1c21e869SJeremy L Thompson           __syncthreads();
145*1c21e869SJeremy L Thompson           // Update buffers used
146*1c21e869SJeremy L Thompson           pre /= P;
147*1c21e869SJeremy L Thompson           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
148*1c21e869SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
149*1c21e869SJeremy L Thompson           const CeedInt     writeLen = pre * post * Q;
150*1c21e869SJeremy L Thompson 
151*1c21e869SJeremy L Thompson           // Contract along middle index
152*1c21e869SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
153*1c21e869SJeremy L Thompson             const CeedInt c   = k % post;
154*1c21e869SJeremy L Thompson             const CeedInt j   = (k / post) % Q;
155*1c21e869SJeremy L Thompson             const CeedInt a   = k / (post * Q);
156*1c21e869SJeremy L Thompson             CeedScalar    v_k = 0;
157*1c21e869SJeremy L Thompson 
158*1c21e869SJeremy L Thompson             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
159*1c21e869SJeremy L Thompson             out[k] = v_k;
160*1c21e869SJeremy L Thompson           }
161*1c21e869SJeremy L Thompson           post *= Q;
162*1c21e869SJeremy L Thompson         }
163*1c21e869SJeremy L Thompson 
164*1c21e869SJeremy L Thompson         // Map to point
165*1c21e869SJeremy L Thompson         __syncthreads();
166*1c21e869SJeremy L Thompson         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
167*1c21e869SJeremy L Thompson           pre  = BASIS_NUM_QPTS;
168*1c21e869SJeremy L Thompson           post = 1;
169*1c21e869SJeremy L Thompson           for (CeedInt d = 0; d < BASIS_DIM; d++) {
170*1c21e869SJeremy L Thompson             // Update buffers used
171*1c21e869SJeremy L Thompson             pre /= Q;
172*1c21e869SJeremy L Thompson             const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
173*1c21e869SJeremy L Thompson             CeedScalar       *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2);
174*1c21e869SJeremy L Thompson 
175*1c21e869SJeremy L Thompson             // Build Chebyshev polynomial values
176*1c21e869SJeremy L Thompson             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
177*1c21e869SJeremy L Thompson 
178*1c21e869SJeremy L Thompson             // Contract along middle index
179*1c21e869SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
180*1c21e869SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
181*1c21e869SJeremy L Thompson                 CeedScalar v_k = 0;
182*1c21e869SJeremy L Thompson 
183*1c21e869SJeremy L Thompson                 for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
184*1c21e869SJeremy L Thompson                 out[a * post + c] = v_k;
185*1c21e869SJeremy L Thompson               }
186*1c21e869SJeremy L Thompson             }
187*1c21e869SJeremy L Thompson             post *= 1;
188*1c21e869SJeremy L Thompson           }
189*1c21e869SJeremy L Thompson         }
190*1c21e869SJeremy L Thompson       }
191*1c21e869SJeremy L Thompson     }
192*1c21e869SJeremy L Thompson   }
193*1c21e869SJeremy L Thompson }
194*1c21e869SJeremy L Thompson 
195*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------
196*1c21e869SJeremy L Thompson // Grad
197*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------
198*1c21e869SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
199*1c21e869SJeremy L Thompson                                         const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
200*1c21e869SJeremy L Thompson   const CeedInt i = threadIdx.x;
201*1c21e869SJeremy L Thompson 
202*1c21e869SJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
203*1c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
204*1c21e869SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
205*1c21e869SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
206*1c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
207*1c21e869SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
208*1c21e869SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
209*1c21e869SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
210*1c21e869SJeremy L Thompson   }
211*1c21e869SJeremy L Thompson 
212*1c21e869SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
213*1c21e869SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
214*1c21e869SJeremy L Thompson   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
215*1c21e869SJeremy L Thompson   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
216*1c21e869SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
217*1c21e869SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
218*1c21e869SJeremy L Thompson   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
219*1c21e869SJeremy L Thompson   const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
220*1c21e869SJeremy L Thompson   const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
221*1c21e869SJeremy L Thompson 
222*1c21e869SJeremy L Thompson   // Apply basis element by element
223*1c21e869SJeremy L Thompson   if (is_transpose) {
224*1c21e869SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
225*1c21e869SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
226*1c21e869SJeremy L Thompson         CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
227*1c21e869SJeremy L Thompson         CeedInt     pre   = 1;
228*1c21e869SJeremy L Thompson         CeedInt     post  = 1;
229*1c21e869SJeremy L Thompson 
230*1c21e869SJeremy L Thompson         // Clear Chebyshev coeffs
231*1c21e869SJeremy L Thompson         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
232*1c21e869SJeremy L Thompson           s_chebyshev_coeffs[k] = 0.0;
233*1c21e869SJeremy L Thompson         }
234*1c21e869SJeremy L Thompson 
235*1c21e869SJeremy L Thompson         // Map from point
236*1c21e869SJeremy L Thompson         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
237*1c21e869SJeremy L Thompson           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
238*1c21e869SJeremy L Thompson             const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
239*1c21e869SJeremy L Thompson 
240*1c21e869SJeremy L Thompson             pre  = 1;
241*1c21e869SJeremy L Thompson             post = 1;
242*1c21e869SJeremy L Thompson             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
243*1c21e869SJeremy L Thompson               // Update buffers used
244*1c21e869SJeremy L Thompson               pre /= 1;
245*1c21e869SJeremy L Thompson               const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
246*1c21e869SJeremy L Thompson               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
247*1c21e869SJeremy L Thompson 
248*1c21e869SJeremy L Thompson               // Build Chebyshev polynomial values
249*1c21e869SJeremy L Thompson               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
250*1c21e869SJeremy L Thompson               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
251*1c21e869SJeremy L Thompson 
252*1c21e869SJeremy L Thompson               // Contract along middle index
253*1c21e869SJeremy L Thompson               for (CeedInt a = 0; a < pre; a++) {
254*1c21e869SJeremy L Thompson                 for (CeedInt c = 0; c < post; c++) {
255*1c21e869SJeremy L Thompson                   if (dim_2 == BASIS_DIM - 1) {
256*1c21e869SJeremy L Thompson                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
257*1c21e869SJeremy L Thompson                   } else {
258*1c21e869SJeremy L Thompson                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
259*1c21e869SJeremy L Thompson                   }
260*1c21e869SJeremy L Thompson                 }
261*1c21e869SJeremy L Thompson               }
262*1c21e869SJeremy L Thompson               post *= Q;
263*1c21e869SJeremy L Thompson             }
264*1c21e869SJeremy L Thompson           }
265*1c21e869SJeremy L Thompson         }
266*1c21e869SJeremy L Thompson 
267*1c21e869SJeremy L Thompson         // Map from coefficients
268*1c21e869SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
269*1c21e869SJeremy L Thompson         post = 1;
270*1c21e869SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
271*1c21e869SJeremy L Thompson           __syncthreads();
272*1c21e869SJeremy L Thompson           // Update buffers used
273*1c21e869SJeremy L Thompson           pre /= Q;
274*1c21e869SJeremy L Thompson           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
275*1c21e869SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
276*1c21e869SJeremy L Thompson           const CeedInt     writeLen = pre * post * P;
277*1c21e869SJeremy L Thompson 
278*1c21e869SJeremy L Thompson           // Contract along middle index
279*1c21e869SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
280*1c21e869SJeremy L Thompson             const CeedInt c   = k % post;
281*1c21e869SJeremy L Thompson             const CeedInt j   = (k / post) % P;
282*1c21e869SJeremy L Thompson             const CeedInt a   = k / (post * P);
283*1c21e869SJeremy L Thompson             CeedScalar    v_k = 0;
284*1c21e869SJeremy L Thompson 
285*1c21e869SJeremy L Thompson             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
286*1c21e869SJeremy L Thompson             out[k] = v_k;
287*1c21e869SJeremy L Thompson           }
288*1c21e869SJeremy L Thompson           post *= P;
289*1c21e869SJeremy L Thompson         }
290*1c21e869SJeremy L Thompson       }
291*1c21e869SJeremy L Thompson     }
292*1c21e869SJeremy L Thompson   } else {
293*1c21e869SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
294*1c21e869SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
295*1c21e869SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
296*1c21e869SJeremy L Thompson         CeedInt           pre   = u_size;
297*1c21e869SJeremy L Thompson         CeedInt           post  = 1;
298*1c21e869SJeremy L Thompson 
299*1c21e869SJeremy L Thompson         // Map to coefficients
300*1c21e869SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
301*1c21e869SJeremy L Thompson           __syncthreads();
302*1c21e869SJeremy L Thompson           // Update buffers used
303*1c21e869SJeremy L Thompson           pre /= P;
304*1c21e869SJeremy L Thompson           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
305*1c21e869SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
306*1c21e869SJeremy L Thompson           const CeedInt     writeLen = pre * post * Q;
307*1c21e869SJeremy L Thompson 
308*1c21e869SJeremy L Thompson           // Contract along middle index
309*1c21e869SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
310*1c21e869SJeremy L Thompson             const CeedInt c   = k % post;
311*1c21e869SJeremy L Thompson             const CeedInt j   = (k / post) % Q;
312*1c21e869SJeremy L Thompson             const CeedInt a   = k / (post * Q);
313*1c21e869SJeremy L Thompson             CeedScalar    v_k = 0;
314*1c21e869SJeremy L Thompson 
315*1c21e869SJeremy L Thompson             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
316*1c21e869SJeremy L Thompson             out[k] = v_k;
317*1c21e869SJeremy L Thompson           }
318*1c21e869SJeremy L Thompson           post *= Q;
319*1c21e869SJeremy L Thompson         }
320*1c21e869SJeremy L Thompson 
321*1c21e869SJeremy L Thompson         // Map to point
322*1c21e869SJeremy L Thompson         __syncthreads();
323*1c21e869SJeremy L Thompson         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
324*1c21e869SJeremy L Thompson           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
325*1c21e869SJeremy L Thompson             CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
326*1c21e869SJeremy L Thompson 
327*1c21e869SJeremy L Thompson             pre  = BASIS_NUM_QPTS;
328*1c21e869SJeremy L Thompson             post = 1;
329*1c21e869SJeremy L Thompson             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
330*1c21e869SJeremy L Thompson               // Update buffers used
331*1c21e869SJeremy L Thompson               pre /= Q;
332*1c21e869SJeremy L Thompson               const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
333*1c21e869SJeremy L Thompson               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
334*1c21e869SJeremy L Thompson 
335*1c21e869SJeremy L Thompson               // Build Chebyshev polynomial values
336*1c21e869SJeremy L Thompson               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
337*1c21e869SJeremy L Thompson               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
338*1c21e869SJeremy L Thompson 
339*1c21e869SJeremy L Thompson               // Contract along middle index
340*1c21e869SJeremy L Thompson               for (CeedInt a = 0; a < pre; a++) {
341*1c21e869SJeremy L Thompson                 for (CeedInt c = 0; c < post; c++) {
342*1c21e869SJeremy L Thompson                   CeedScalar v_k = 0;
343*1c21e869SJeremy L Thompson 
344*1c21e869SJeremy L Thompson                   for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
345*1c21e869SJeremy L Thompson                   out[a * post + c] = v_k;
346*1c21e869SJeremy L Thompson                 }
347*1c21e869SJeremy L Thompson               }
348*1c21e869SJeremy L Thompson               post *= 1;
349*1c21e869SJeremy L Thompson             }
350*1c21e869SJeremy L Thompson           }
351*1c21e869SJeremy L Thompson         }
352*1c21e869SJeremy L Thompson       }
353*1c21e869SJeremy L Thompson     }
354*1c21e869SJeremy L Thompson   }
355*1c21e869SJeremy L Thompson }
356