xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h (revision 34d146140c2fce42d8bc042f039d47d4ff020864)
1*34d14614SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2*34d14614SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*34d14614SJeremy L Thompson //
4*34d14614SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5*34d14614SJeremy L Thompson //
6*34d14614SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7*34d14614SJeremy L Thompson 
8*34d14614SJeremy L Thompson /// @file
9*34d14614SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation
10*34d14614SJeremy L Thompson 
11*34d14614SJeremy L Thompson #include <ceed.h>
12*34d14614SJeremy L Thompson 
13*34d14614SJeremy L Thompson //------------------------------------------------------------------------------
14*34d14614SJeremy L Thompson // Chebyshev values
15*34d14614SJeremy L Thompson //------------------------------------------------------------------------------
16*34d14614SJeremy L Thompson template <int Q_1D>
17*34d14614SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
18*34d14614SJeremy L Thompson   chebyshev_x[0] = 1.0;
19*34d14614SJeremy L Thompson   chebyshev_x[1] = 2 * x;
20*34d14614SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
21*34d14614SJeremy L Thompson }
22*34d14614SJeremy L Thompson 
23*34d14614SJeremy L Thompson template <int Q_1D>
24*34d14614SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
25*34d14614SJeremy L Thompson   CeedScalar chebyshev_x[3];
26*34d14614SJeremy L Thompson 
27*34d14614SJeremy L Thompson   chebyshev_x[1]  = 1.0;
28*34d14614SJeremy L Thompson   chebyshev_x[2]  = 2 * x;
29*34d14614SJeremy L Thompson   chebyshev_dx[0] = 0.0;
30*34d14614SJeremy L Thompson   chebyshev_dx[1] = 2.0;
31*34d14614SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) {
32*34d14614SJeremy L Thompson     chebyshev_x[0]  = chebyshev_x[1];
33*34d14614SJeremy L Thompson     chebyshev_x[1]  = chebyshev_x[2];
34*34d14614SJeremy L Thompson     chebyshev_x[2]  = 2 * x * chebyshev_x[1] - chebyshev_x[0];
35*34d14614SJeremy L Thompson     chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[1] - chebyshev_dx[i - 2];
36*34d14614SJeremy L Thompson   }
37*34d14614SJeremy L Thompson }
38*34d14614SJeremy L Thompson 
39*34d14614SJeremy L Thompson //------------------------------------------------------------------------------
40*34d14614SJeremy L Thompson // Tensor Basis Kernels AtPoints
41*34d14614SJeremy L Thompson //------------------------------------------------------------------------------
42*34d14614SJeremy L Thompson 
43*34d14614SJeremy L Thompson //------------------------------------------------------------------------------
44*34d14614SJeremy L Thompson // Interp
45*34d14614SJeremy L Thompson //------------------------------------------------------------------------------
46*34d14614SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
47*34d14614SJeremy L Thompson                                           const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
48*34d14614SJeremy L Thompson   const CeedInt i = threadIdx.x;
49*34d14614SJeremy L Thompson 
50*34d14614SJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
51*34d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
52*34d14614SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
53*34d14614SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
54*34d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
55*34d14614SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
56*34d14614SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
57*34d14614SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
58*34d14614SJeremy L Thompson   }
59*34d14614SJeremy L Thompson 
60*34d14614SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
61*34d14614SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
62*34d14614SJeremy L Thompson   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
63*34d14614SJeremy L Thompson   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
64*34d14614SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
65*34d14614SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
66*34d14614SJeremy L Thompson   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
67*34d14614SJeremy L Thompson 
68*34d14614SJeremy L Thompson   // Apply basis element by element
69*34d14614SJeremy L Thompson   if (is_transpose) {
70*34d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
71*34d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
72*34d14614SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
73*34d14614SJeremy L Thompson         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
74*34d14614SJeremy L Thompson         CeedInt           pre   = 1;
75*34d14614SJeremy L Thompson         CeedInt           post  = 1;
76*34d14614SJeremy L Thompson 
77*34d14614SJeremy L Thompson         // Clear Chebyshev coeffs
78*34d14614SJeremy L Thompson         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
79*34d14614SJeremy L Thompson           s_chebyshev_coeffs[k] = 0.0;
80*34d14614SJeremy L Thompson         }
81*34d14614SJeremy L Thompson 
82*34d14614SJeremy L Thompson         // Map from point
83*34d14614SJeremy L Thompson         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
84*34d14614SJeremy L Thompson           pre  = 1;
85*34d14614SJeremy L Thompson           post = 1;
86*34d14614SJeremy L Thompson           for (CeedInt d = 0; d < BASIS_DIM; d++) {
87*34d14614SJeremy L Thompson             // Update buffers used
88*34d14614SJeremy L Thompson             pre /= 1;
89*34d14614SJeremy L Thompson             const CeedScalar *in  = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1);
90*34d14614SJeremy L Thompson             CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
91*34d14614SJeremy L Thompson 
92*34d14614SJeremy L Thompson             // Build Chebyshev polynomial values
93*34d14614SJeremy L Thompson             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
94*34d14614SJeremy L Thompson 
95*34d14614SJeremy L Thompson             // Contract along middle index
96*34d14614SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
97*34d14614SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
98*34d14614SJeremy L Thompson                 if (d == BASIS_DIM - 1) {
99*34d14614SJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
100*34d14614SJeremy L Thompson                 } else {
101*34d14614SJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
102*34d14614SJeremy L Thompson                 }
103*34d14614SJeremy L Thompson               }
104*34d14614SJeremy L Thompson             }
105*34d14614SJeremy L Thompson             post *= Q;
106*34d14614SJeremy L Thompson           }
107*34d14614SJeremy L Thompson         }
108*34d14614SJeremy L Thompson 
109*34d14614SJeremy L Thompson         // Map from coefficients
110*34d14614SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
111*34d14614SJeremy L Thompson         post = 1;
112*34d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
113*34d14614SJeremy L Thompson           __syncthreads();
114*34d14614SJeremy L Thompson           // Update buffers used
115*34d14614SJeremy L Thompson           pre /= Q;
116*34d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
117*34d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
118*34d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * P;
119*34d14614SJeremy L Thompson 
120*34d14614SJeremy L Thompson           // Contract along middle index
121*34d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
122*34d14614SJeremy L Thompson             const CeedInt c   = k % post;
123*34d14614SJeremy L Thompson             const CeedInt j   = (k / post) % P;
124*34d14614SJeremy L Thompson             const CeedInt a   = k / (post * P);
125*34d14614SJeremy L Thompson             CeedScalar    v_k = 0;
126*34d14614SJeremy L Thompson 
127*34d14614SJeremy L Thompson             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
128*34d14614SJeremy L Thompson             out[k] = v_k;
129*34d14614SJeremy L Thompson           }
130*34d14614SJeremy L Thompson           post *= P;
131*34d14614SJeremy L Thompson         }
132*34d14614SJeremy L Thompson       }
133*34d14614SJeremy L Thompson     }
134*34d14614SJeremy L Thompson   } else {
135*34d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
136*34d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
137*34d14614SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
138*34d14614SJeremy L Thompson         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
139*34d14614SJeremy L Thompson         CeedInt           pre   = u_size;
140*34d14614SJeremy L Thompson         CeedInt           post  = 1;
141*34d14614SJeremy L Thompson 
142*34d14614SJeremy L Thompson         // Map to coefficients
143*34d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
144*34d14614SJeremy L Thompson           __syncthreads();
145*34d14614SJeremy L Thompson           // Update buffers used
146*34d14614SJeremy L Thompson           pre /= P;
147*34d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
148*34d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
149*34d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * Q;
150*34d14614SJeremy L Thompson 
151*34d14614SJeremy L Thompson           // Contract along middle index
152*34d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
153*34d14614SJeremy L Thompson             const CeedInt c   = k % post;
154*34d14614SJeremy L Thompson             const CeedInt j   = (k / post) % Q;
155*34d14614SJeremy L Thompson             const CeedInt a   = k / (post * Q);
156*34d14614SJeremy L Thompson             CeedScalar    v_k = 0;
157*34d14614SJeremy L Thompson 
158*34d14614SJeremy L Thompson             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
159*34d14614SJeremy L Thompson             out[k] = v_k;
160*34d14614SJeremy L Thompson           }
161*34d14614SJeremy L Thompson           post *= Q;
162*34d14614SJeremy L Thompson         }
163*34d14614SJeremy L Thompson 
164*34d14614SJeremy L Thompson         // Map to point
165*34d14614SJeremy L Thompson         __syncthreads();
166*34d14614SJeremy L Thompson         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
167*34d14614SJeremy L Thompson           pre  = BASIS_NUM_QPTS;
168*34d14614SJeremy L Thompson           post = 1;
169*34d14614SJeremy L Thompson           for (CeedInt d = 0; d < BASIS_DIM; d++) {
170*34d14614SJeremy L Thompson             // Update buffers used
171*34d14614SJeremy L Thompson             pre /= Q;
172*34d14614SJeremy L Thompson             const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
173*34d14614SJeremy L Thompson             CeedScalar       *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2);
174*34d14614SJeremy L Thompson 
175*34d14614SJeremy L Thompson             // Build Chebyshev polynomial values
176*34d14614SJeremy L Thompson             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
177*34d14614SJeremy L Thompson 
178*34d14614SJeremy L Thompson             // Contract along middle index
179*34d14614SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
180*34d14614SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
181*34d14614SJeremy L Thompson                 CeedScalar v_k = 0;
182*34d14614SJeremy L Thompson 
183*34d14614SJeremy L Thompson                 for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
184*34d14614SJeremy L Thompson                 out[a * post + c] = v_k;
185*34d14614SJeremy L Thompson               }
186*34d14614SJeremy L Thompson             }
187*34d14614SJeremy L Thompson             post *= 1;
188*34d14614SJeremy L Thompson           }
189*34d14614SJeremy L Thompson         }
190*34d14614SJeremy L Thompson       }
191*34d14614SJeremy L Thompson     }
192*34d14614SJeremy L Thompson   }
193*34d14614SJeremy L Thompson }
194*34d14614SJeremy L Thompson 
195*34d14614SJeremy L Thompson //------------------------------------------------------------------------------
196*34d14614SJeremy L Thompson // Grad
197*34d14614SJeremy L Thompson //------------------------------------------------------------------------------
198*34d14614SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
199*34d14614SJeremy L Thompson                                         const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
200*34d14614SJeremy L Thompson   const CeedInt i = threadIdx.x;
201*34d14614SJeremy L Thompson 
202*34d14614SJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
203*34d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
204*34d14614SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
205*34d14614SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
206*34d14614SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
207*34d14614SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
208*34d14614SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
209*34d14614SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
210*34d14614SJeremy L Thompson   }
211*34d14614SJeremy L Thompson 
212*34d14614SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
213*34d14614SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
214*34d14614SJeremy L Thompson   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
215*34d14614SJeremy L Thompson   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
216*34d14614SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
217*34d14614SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
218*34d14614SJeremy L Thompson   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
219*34d14614SJeremy L Thompson   const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
220*34d14614SJeremy L Thompson   const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
221*34d14614SJeremy L Thompson 
222*34d14614SJeremy L Thompson   // Apply basis element by element
223*34d14614SJeremy L Thompson   if (is_transpose) {
224*34d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
225*34d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
226*34d14614SJeremy L Thompson         CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
227*34d14614SJeremy L Thompson         CeedInt     pre   = 1;
228*34d14614SJeremy L Thompson         CeedInt     post  = 1;
229*34d14614SJeremy L Thompson 
230*34d14614SJeremy L Thompson         // Clear Chebyshev coeffs
231*34d14614SJeremy L Thompson         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
232*34d14614SJeremy L Thompson           s_chebyshev_coeffs[k] = 0.0;
233*34d14614SJeremy L Thompson         }
234*34d14614SJeremy L Thompson 
235*34d14614SJeremy L Thompson         // Map from point
236*34d14614SJeremy L Thompson         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
237*34d14614SJeremy L Thompson           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
238*34d14614SJeremy L Thompson             const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
239*34d14614SJeremy L Thompson 
240*34d14614SJeremy L Thompson             pre  = 1;
241*34d14614SJeremy L Thompson             post = 1;
242*34d14614SJeremy L Thompson             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
243*34d14614SJeremy L Thompson               // Update buffers used
244*34d14614SJeremy L Thompson               pre /= 1;
245*34d14614SJeremy L Thompson               const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
246*34d14614SJeremy L Thompson               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
247*34d14614SJeremy L Thompson 
248*34d14614SJeremy L Thompson               // Build Chebyshev polynomial values
249*34d14614SJeremy L Thompson               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
250*34d14614SJeremy L Thompson               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
251*34d14614SJeremy L Thompson 
252*34d14614SJeremy L Thompson               // Contract along middle index
253*34d14614SJeremy L Thompson               for (CeedInt a = 0; a < pre; a++) {
254*34d14614SJeremy L Thompson                 for (CeedInt c = 0; c < post; c++) {
255*34d14614SJeremy L Thompson                   if (dim_2 == BASIS_DIM - 1) {
256*34d14614SJeremy L Thompson                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
257*34d14614SJeremy L Thompson                   } else {
258*34d14614SJeremy L Thompson                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
259*34d14614SJeremy L Thompson                   }
260*34d14614SJeremy L Thompson                 }
261*34d14614SJeremy L Thompson               }
262*34d14614SJeremy L Thompson               post *= Q;
263*34d14614SJeremy L Thompson             }
264*34d14614SJeremy L Thompson           }
265*34d14614SJeremy L Thompson         }
266*34d14614SJeremy L Thompson 
267*34d14614SJeremy L Thompson         // Map from coefficients
268*34d14614SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
269*34d14614SJeremy L Thompson         post = 1;
270*34d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
271*34d14614SJeremy L Thompson           __syncthreads();
272*34d14614SJeremy L Thompson           // Update buffers used
273*34d14614SJeremy L Thompson           pre /= Q;
274*34d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
275*34d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
276*34d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * P;
277*34d14614SJeremy L Thompson 
278*34d14614SJeremy L Thompson           // Contract along middle index
279*34d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
280*34d14614SJeremy L Thompson             const CeedInt c   = k % post;
281*34d14614SJeremy L Thompson             const CeedInt j   = (k / post) % P;
282*34d14614SJeremy L Thompson             const CeedInt a   = k / (post * P);
283*34d14614SJeremy L Thompson             CeedScalar    v_k = 0;
284*34d14614SJeremy L Thompson 
285*34d14614SJeremy L Thompson             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
286*34d14614SJeremy L Thompson             out[k] = v_k;
287*34d14614SJeremy L Thompson           }
288*34d14614SJeremy L Thompson           post *= P;
289*34d14614SJeremy L Thompson         }
290*34d14614SJeremy L Thompson       }
291*34d14614SJeremy L Thompson     }
292*34d14614SJeremy L Thompson   } else {
293*34d14614SJeremy L Thompson     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
294*34d14614SJeremy L Thompson       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
295*34d14614SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
296*34d14614SJeremy L Thompson         CeedInt           pre   = u_size;
297*34d14614SJeremy L Thompson         CeedInt           post  = 1;
298*34d14614SJeremy L Thompson 
299*34d14614SJeremy L Thompson         // Map to coefficients
300*34d14614SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
301*34d14614SJeremy L Thompson           __syncthreads();
302*34d14614SJeremy L Thompson           // Update buffers used
303*34d14614SJeremy L Thompson           pre /= P;
304*34d14614SJeremy L Thompson           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
305*34d14614SJeremy L Thompson           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
306*34d14614SJeremy L Thompson           const CeedInt     writeLen = pre * post * Q;
307*34d14614SJeremy L Thompson 
308*34d14614SJeremy L Thompson           // Contract along middle index
309*34d14614SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
310*34d14614SJeremy L Thompson             const CeedInt c   = k % post;
311*34d14614SJeremy L Thompson             const CeedInt j   = (k / post) % Q;
312*34d14614SJeremy L Thompson             const CeedInt a   = k / (post * Q);
313*34d14614SJeremy L Thompson             CeedScalar    v_k = 0;
314*34d14614SJeremy L Thompson 
315*34d14614SJeremy L Thompson             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
316*34d14614SJeremy L Thompson             out[k] = v_k;
317*34d14614SJeremy L Thompson           }
318*34d14614SJeremy L Thompson           post *= Q;
319*34d14614SJeremy L Thompson         }
320*34d14614SJeremy L Thompson 
321*34d14614SJeremy L Thompson         // Map to point
322*34d14614SJeremy L Thompson         __syncthreads();
323*34d14614SJeremy L Thompson         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
324*34d14614SJeremy L Thompson           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
325*34d14614SJeremy L Thompson             CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
326*34d14614SJeremy L Thompson 
327*34d14614SJeremy L Thompson             pre  = BASIS_NUM_QPTS;
328*34d14614SJeremy L Thompson             post = 1;
329*34d14614SJeremy L Thompson             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
330*34d14614SJeremy L Thompson               // Update buffers used
331*34d14614SJeremy L Thompson               pre /= Q;
332*34d14614SJeremy L Thompson               const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
333*34d14614SJeremy L Thompson               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
334*34d14614SJeremy L Thompson 
335*34d14614SJeremy L Thompson               // Build Chebyshev polynomial values
336*34d14614SJeremy L Thompson               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
337*34d14614SJeremy L Thompson               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
338*34d14614SJeremy L Thompson 
339*34d14614SJeremy L Thompson               // Contract along middle index
340*34d14614SJeremy L Thompson               for (CeedInt a = 0; a < pre; a++) {
341*34d14614SJeremy L Thompson                 for (CeedInt c = 0; c < post; c++) {
342*34d14614SJeremy L Thompson                   CeedScalar v_k = 0;
343*34d14614SJeremy L Thompson 
344*34d14614SJeremy L Thompson                   for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
345*34d14614SJeremy L Thompson                   out[a * post + c] = v_k;
346*34d14614SJeremy L Thompson                 }
347*34d14614SJeremy L Thompson               }
348*34d14614SJeremy L Thompson               post *= 1;
349*34d14614SJeremy L Thompson             }
350*34d14614SJeremy L Thompson           }
351*34d14614SJeremy L Thompson         }
352*34d14614SJeremy L Thompson       }
353*34d14614SJeremy L Thompson     }
354*34d14614SJeremy L Thompson   }
355*34d14614SJeremy L Thompson }
356