xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h (revision 9e1d4b8291fc4f4e19d20cdfdecd866260d0e6d2)
1*9e1d4b82SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2*9e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*9e1d4b82SJeremy L Thompson //
4*9e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5*9e1d4b82SJeremy L Thompson //
6*9e1d4b82SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7*9e1d4b82SJeremy L Thompson 
8*9e1d4b82SJeremy L Thompson /// @file
9*9e1d4b82SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis AtPoints templates
10*9e1d4b82SJeremy L Thompson #include <ceed/types.h>
11*9e1d4b82SJeremy L Thompson 
12*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
13*9e1d4b82SJeremy L Thompson // Chebyshev values
14*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
15*9e1d4b82SJeremy L Thompson template <int Q_1D>
16*9e1d4b82SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
17*9e1d4b82SJeremy L Thompson   chebyshev_x[0] = 1.0;
18*9e1d4b82SJeremy L Thompson   chebyshev_x[1] = 2 * x;
19*9e1d4b82SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
20*9e1d4b82SJeremy L Thompson }
21*9e1d4b82SJeremy L Thompson 
22*9e1d4b82SJeremy L Thompson template <int Q_1D>
23*9e1d4b82SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
24*9e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[3];
25*9e1d4b82SJeremy L Thompson 
26*9e1d4b82SJeremy L Thompson   chebyshev_x[1]  = 1.0;
27*9e1d4b82SJeremy L Thompson   chebyshev_x[2]  = 2 * x;
28*9e1d4b82SJeremy L Thompson   chebyshev_dx[0] = 0.0;
29*9e1d4b82SJeremy L Thompson   chebyshev_dx[1] = 2.0;
30*9e1d4b82SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) {
31*9e1d4b82SJeremy L Thompson     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
32*9e1d4b82SJeremy L Thompson     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
33*9e1d4b82SJeremy L Thompson   }
34*9e1d4b82SJeremy L Thompson }
35*9e1d4b82SJeremy L Thompson 
36*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
37*9e1d4b82SJeremy L Thompson // 1D
38*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
39*9e1d4b82SJeremy L Thompson 
40*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
41*9e1d4b82SJeremy L Thompson // 1D interpolate to points
42*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
43*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
44*9e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
45*9e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
46*9e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
47*9e1d4b82SJeremy L Thompson 
48*9e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
49*9e1d4b82SJeremy L Thompson   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
50*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
51*9e1d4b82SJeremy L Thompson     // Load coefficients
52*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
53*9e1d4b82SJeremy L Thompson     __syncthreads();
54*9e1d4b82SJeremy L Thompson     // Contract x direction
55*9e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
56*9e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * data.slice[i];
57*9e1d4b82SJeremy L Thompson     }
58*9e1d4b82SJeremy L Thompson   }
59*9e1d4b82SJeremy L Thompson }
60*9e1d4b82SJeremy L Thompson 
61*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
62*9e1d4b82SJeremy L Thompson // 1D interpolate transpose
63*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
64*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
65*9e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
66*9e1d4b82SJeremy L Thompson                                                  CeedScalar *__restrict__ r_C) {
67*9e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
68*9e1d4b82SJeremy L Thompson 
69*9e1d4b82SJeremy L Thompson   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
70*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
71*9e1d4b82SJeremy L Thompson     // Clear shared memory
72*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
73*9e1d4b82SJeremy L Thompson     __syncthreads();
74*9e1d4b82SJeremy L Thompson     // Contract x direction
75*9e1d4b82SJeremy L Thompson     if (p < NUM_POINTS) {
76*9e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
77*9e1d4b82SJeremy L Thompson         atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
78*9e1d4b82SJeremy L Thompson       }
79*9e1d4b82SJeremy L Thompson     }
80*9e1d4b82SJeremy L Thompson     // Pull from shared to register
81*9e1d4b82SJeremy L Thompson     __syncthreads();
82*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
83*9e1d4b82SJeremy L Thompson   }
84*9e1d4b82SJeremy L Thompson }
85*9e1d4b82SJeremy L Thompson 
86*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
87*9e1d4b82SJeremy L Thompson // 1D derivatives at points
88*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
89*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
90*9e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
91*9e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
92*9e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
93*9e1d4b82SJeremy L Thompson 
94*9e1d4b82SJeremy L Thompson   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
95*9e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
96*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
97*9e1d4b82SJeremy L Thompson     // Load coefficients
98*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
99*9e1d4b82SJeremy L Thompson     __syncthreads();
100*9e1d4b82SJeremy L Thompson     // Contract x direction
101*9e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
102*9e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * data.slice[i];
103*9e1d4b82SJeremy L Thompson     }
104*9e1d4b82SJeremy L Thompson   }
105*9e1d4b82SJeremy L Thompson }
106*9e1d4b82SJeremy L Thompson 
107*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
108*9e1d4b82SJeremy L Thompson // 1D derivatives transpose
109*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
110*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
111*9e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
112*9e1d4b82SJeremy L Thompson                                                CeedScalar *__restrict__ r_C) {
113*9e1d4b82SJeremy L Thompson   CeedScalar chebyshev_x[Q_1D];
114*9e1d4b82SJeremy L Thompson 
115*9e1d4b82SJeremy L Thompson   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
116*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
117*9e1d4b82SJeremy L Thompson     // Clear shared memory
118*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
119*9e1d4b82SJeremy L Thompson     __syncthreads();
120*9e1d4b82SJeremy L Thompson     // Contract x direction
121*9e1d4b82SJeremy L Thompson     if (p < NUM_POINTS) {
122*9e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
123*9e1d4b82SJeremy L Thompson         atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
124*9e1d4b82SJeremy L Thompson       }
125*9e1d4b82SJeremy L Thompson     }
126*9e1d4b82SJeremy L Thompson     // Pull from shared to register
127*9e1d4b82SJeremy L Thompson     __syncthreads();
128*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
129*9e1d4b82SJeremy L Thompson   }
130*9e1d4b82SJeremy L Thompson }
131*9e1d4b82SJeremy L Thompson 
132*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
133*9e1d4b82SJeremy L Thompson // 2D
134*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
135*9e1d4b82SJeremy L Thompson 
136*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
137*9e1d4b82SJeremy L Thompson // 2D interpolate to points
138*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
139*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
140*9e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
141*9e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
142*9e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
143*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
144*9e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
145*9e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
146*9e1d4b82SJeremy L Thompson 
147*9e1d4b82SJeremy L Thompson     // Load coefficients
148*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
149*9e1d4b82SJeremy L Thompson     __syncthreads();
150*9e1d4b82SJeremy L Thompson     // Contract x direction
151*9e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
152*9e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
153*9e1d4b82SJeremy L Thompson       buffer[i] = 0.0;
154*9e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < Q_1D; j++) {
155*9e1d4b82SJeremy L Thompson         buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
156*9e1d4b82SJeremy L Thompson       }
157*9e1d4b82SJeremy L Thompson     }
158*9e1d4b82SJeremy L Thompson     // Contract y direction
159*9e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
160*9e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
161*9e1d4b82SJeremy L Thompson       r_V[comp] += chebyshev_x[i] * buffer[i];
162*9e1d4b82SJeremy L Thompson     }
163*9e1d4b82SJeremy L Thompson   }
164*9e1d4b82SJeremy L Thompson }
165*9e1d4b82SJeremy L Thompson 
166*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
167*9e1d4b82SJeremy L Thompson // 2D interpolate transpose
168*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
169*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
170*9e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
171*9e1d4b82SJeremy L Thompson                                                  CeedScalar *__restrict__ r_C) {
172*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
173*9e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
174*9e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
175*9e1d4b82SJeremy L Thompson 
176*9e1d4b82SJeremy L Thompson     // Clear shared memory
177*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
178*9e1d4b82SJeremy L Thompson     __syncthreads();
179*9e1d4b82SJeremy L Thompson     // Contract y direction
180*9e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
181*9e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
182*9e1d4b82SJeremy L Thompson       buffer[i] = chebyshev_x[i] * r_U[comp];
183*9e1d4b82SJeremy L Thompson     }
184*9e1d4b82SJeremy L Thompson     // Contract x direction
185*9e1d4b82SJeremy L Thompson     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
186*9e1d4b82SJeremy L Thompson     if (p < NUM_POINTS) {
187*9e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
188*9e1d4b82SJeremy L Thompson         // Note: shifting to avoid atomic adds
189*9e1d4b82SJeremy L Thompson         const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
190*9e1d4b82SJeremy L Thompson 
191*9e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
192*9e1d4b82SJeremy L Thompson           const CeedInt jj = (j + p) % Q_1D;
193*9e1d4b82SJeremy L Thompson 
194*9e1d4b82SJeremy L Thompson           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
195*9e1d4b82SJeremy L Thompson         }
196*9e1d4b82SJeremy L Thompson       }
197*9e1d4b82SJeremy L Thompson     }
198*9e1d4b82SJeremy L Thompson     // Pull from shared to register
199*9e1d4b82SJeremy L Thompson     __syncthreads();
200*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
201*9e1d4b82SJeremy L Thompson   }
202*9e1d4b82SJeremy L Thompson }
203*9e1d4b82SJeremy L Thompson 
204*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
205*9e1d4b82SJeremy L Thompson // 2D derivatives at points
206*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
207*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
208*9e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
209*9e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
210*9e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
211*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
212*9e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
213*9e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
214*9e1d4b82SJeremy L Thompson 
215*9e1d4b82SJeremy L Thompson     // Load coefficients
216*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
217*9e1d4b82SJeremy L Thompson     __syncthreads();
218*9e1d4b82SJeremy L Thompson     for (CeedInt dim = 0; dim < 2; dim++) {
219*9e1d4b82SJeremy L Thompson       // Contract x direction
220*9e1d4b82SJeremy L Thompson       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
221*9e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
222*9e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
223*9e1d4b82SJeremy L Thompson         buffer[i] = 0.0;
224*9e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
225*9e1d4b82SJeremy L Thompson           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
226*9e1d4b82SJeremy L Thompson         }
227*9e1d4b82SJeremy L Thompson       }
228*9e1d4b82SJeremy L Thompson       // Contract y direction
229*9e1d4b82SJeremy L Thompson       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
230*9e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
231*9e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
232*9e1d4b82SJeremy L Thompson         r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
233*9e1d4b82SJeremy L Thompson       }
234*9e1d4b82SJeremy L Thompson     }
235*9e1d4b82SJeremy L Thompson   }
236*9e1d4b82SJeremy L Thompson }
237*9e1d4b82SJeremy L Thompson 
238*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
239*9e1d4b82SJeremy L Thompson // 2D derivatives transpose
240*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
241*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
242*9e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
243*9e1d4b82SJeremy L Thompson                                                CeedScalar *__restrict__ r_C) {
244*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
245*9e1d4b82SJeremy L Thompson     CeedScalar buffer[Q_1D];
246*9e1d4b82SJeremy L Thompson     CeedScalar chebyshev_x[Q_1D];
247*9e1d4b82SJeremy L Thompson 
248*9e1d4b82SJeremy L Thompson     // Clear shared memory
249*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
250*9e1d4b82SJeremy L Thompson     __syncthreads();
251*9e1d4b82SJeremy L Thompson     for (CeedInt dim = 0; dim < 2; dim++) {
252*9e1d4b82SJeremy L Thompson       // Contract y direction
253*9e1d4b82SJeremy L Thompson       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
254*9e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
255*9e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
256*9e1d4b82SJeremy L Thompson         buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
257*9e1d4b82SJeremy L Thompson       }
258*9e1d4b82SJeremy L Thompson       // Contract x direction
259*9e1d4b82SJeremy L Thompson       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
260*9e1d4b82SJeremy L Thompson       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
261*9e1d4b82SJeremy L Thompson       if (p < NUM_POINTS) {
262*9e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
263*9e1d4b82SJeremy L Thompson           // Note: shifting to avoid atomic adds
264*9e1d4b82SJeremy L Thompson           const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
265*9e1d4b82SJeremy L Thompson 
266*9e1d4b82SJeremy L Thompson           for (CeedInt j = 0; j < Q_1D; j++) {
267*9e1d4b82SJeremy L Thompson             const CeedInt jj = (j + p) % Q_1D;
268*9e1d4b82SJeremy L Thompson 
269*9e1d4b82SJeremy L Thompson             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
270*9e1d4b82SJeremy L Thompson           }
271*9e1d4b82SJeremy L Thompson         }
272*9e1d4b82SJeremy L Thompson       }
273*9e1d4b82SJeremy L Thompson     }
274*9e1d4b82SJeremy L Thompson     // Pull from shared to register
275*9e1d4b82SJeremy L Thompson     __syncthreads();
276*9e1d4b82SJeremy L Thompson     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
277*9e1d4b82SJeremy L Thompson   }
278*9e1d4b82SJeremy L Thompson }
279*9e1d4b82SJeremy L Thompson 
280*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
281*9e1d4b82SJeremy L Thompson // 3D
282*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
283*9e1d4b82SJeremy L Thompson 
284*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
285*9e1d4b82SJeremy L Thompson // 3D interpolate to points
286*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
287*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
288*9e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
289*9e1d4b82SJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
290*9e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
291*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
292*9e1d4b82SJeremy L Thompson     for (CeedInt k = 0; k < Q_1D; k++) {
293*9e1d4b82SJeremy L Thompson       CeedScalar buffer[Q_1D];
294*9e1d4b82SJeremy L Thompson       CeedScalar chebyshev_x[Q_1D];
295*9e1d4b82SJeremy L Thompson 
296*9e1d4b82SJeremy L Thompson       // Load coefficients
297*9e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
298*9e1d4b82SJeremy L Thompson       __syncthreads();
299*9e1d4b82SJeremy L Thompson       // Contract x direction
300*9e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
301*9e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
302*9e1d4b82SJeremy L Thompson         buffer[i] = 0.0;
303*9e1d4b82SJeremy L Thompson         for (CeedInt j = 0; j < Q_1D; j++) {
304*9e1d4b82SJeremy L Thompson           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
305*9e1d4b82SJeremy L Thompson         }
306*9e1d4b82SJeremy L Thompson       }
307*9e1d4b82SJeremy L Thompson       // Contract y and z direction
308*9e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
309*9e1d4b82SJeremy L Thompson       const CeedScalar z = chebyshev_x[k];
310*9e1d4b82SJeremy L Thompson 
311*9e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
312*9e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
313*9e1d4b82SJeremy L Thompson         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
314*9e1d4b82SJeremy L Thompson       }
315*9e1d4b82SJeremy L Thompson     }
316*9e1d4b82SJeremy L Thompson   }
317*9e1d4b82SJeremy L Thompson }
318*9e1d4b82SJeremy L Thompson 
319*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
320*9e1d4b82SJeremy L Thompson // 3D interpolate transpose
321*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
322*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
323*9e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
324*9e1d4b82SJeremy L Thompson                                                  CeedScalar *__restrict__ r_C) {
325*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
326*9e1d4b82SJeremy L Thompson     for (CeedInt k = 0; k < Q_1D; k++) {
327*9e1d4b82SJeremy L Thompson       CeedScalar buffer[Q_1D];
328*9e1d4b82SJeremy L Thompson       CeedScalar chebyshev_x[Q_1D];
329*9e1d4b82SJeremy L Thompson 
330*9e1d4b82SJeremy L Thompson       // Clear shared memory
331*9e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
332*9e1d4b82SJeremy L Thompson       __syncthreads();
333*9e1d4b82SJeremy L Thompson       // Contract y and z direction
334*9e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
335*9e1d4b82SJeremy L Thompson       const CeedScalar z = chebyshev_x[k];
336*9e1d4b82SJeremy L Thompson 
337*9e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
338*9e1d4b82SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
339*9e1d4b82SJeremy L Thompson         buffer[i] = chebyshev_x[i] * r_U[comp] * z;
340*9e1d4b82SJeremy L Thompson       }
341*9e1d4b82SJeremy L Thompson       // Contract x direction
342*9e1d4b82SJeremy L Thompson       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
343*9e1d4b82SJeremy L Thompson       if (p < NUM_POINTS) {
344*9e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
345*9e1d4b82SJeremy L Thompson           // Note: shifting to avoid atomic adds
346*9e1d4b82SJeremy L Thompson           const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
347*9e1d4b82SJeremy L Thompson 
348*9e1d4b82SJeremy L Thompson           for (CeedInt j = 0; j < Q_1D; j++) {
349*9e1d4b82SJeremy L Thompson             const CeedInt jj = ((j + p) % Q_1D);
350*9e1d4b82SJeremy L Thompson 
351*9e1d4b82SJeremy L Thompson             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
352*9e1d4b82SJeremy L Thompson           }
353*9e1d4b82SJeremy L Thompson         }
354*9e1d4b82SJeremy L Thompson       }
355*9e1d4b82SJeremy L Thompson       // Pull from shared to register
356*9e1d4b82SJeremy L Thompson       __syncthreads();
357*9e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
358*9e1d4b82SJeremy L Thompson     }
359*9e1d4b82SJeremy L Thompson   }
360*9e1d4b82SJeremy L Thompson }
361*9e1d4b82SJeremy L Thompson 
362*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
363*9e1d4b82SJeremy L Thompson // 3D derivatives at points
364*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
365*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
366*9e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
367*9e1d4b82SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
368*9e1d4b82SJeremy L Thompson   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
369*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
370*9e1d4b82SJeremy L Thompson     for (CeedInt k = 0; k < Q_1D; k++) {
371*9e1d4b82SJeremy L Thompson       CeedScalar buffer[Q_1D];
372*9e1d4b82SJeremy L Thompson       CeedScalar chebyshev_x[Q_1D];
373*9e1d4b82SJeremy L Thompson 
374*9e1d4b82SJeremy L Thompson       // Load coefficients
375*9e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
376*9e1d4b82SJeremy L Thompson       __syncthreads();
377*9e1d4b82SJeremy L Thompson       for (CeedInt dim = 0; dim < 3; dim++) {
378*9e1d4b82SJeremy L Thompson         // Contract x direction
379*9e1d4b82SJeremy L Thompson         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
380*9e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
381*9e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
382*9e1d4b82SJeremy L Thompson           buffer[i] = 0.0;
383*9e1d4b82SJeremy L Thompson           for (CeedInt j = 0; j < Q_1D; j++) {
384*9e1d4b82SJeremy L Thompson             buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
385*9e1d4b82SJeremy L Thompson           }
386*9e1d4b82SJeremy L Thompson         }
387*9e1d4b82SJeremy L Thompson         // Contract y and z direction
388*9e1d4b82SJeremy L Thompson         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
389*9e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
390*9e1d4b82SJeremy L Thompson         const CeedScalar z = chebyshev_x[k];
391*9e1d4b82SJeremy L Thompson 
392*9e1d4b82SJeremy L Thompson         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
393*9e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
394*9e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
395*9e1d4b82SJeremy L Thompson           r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
396*9e1d4b82SJeremy L Thompson         }
397*9e1d4b82SJeremy L Thompson       }
398*9e1d4b82SJeremy L Thompson     }
399*9e1d4b82SJeremy L Thompson   }
400*9e1d4b82SJeremy L Thompson }
401*9e1d4b82SJeremy L Thompson 
402*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
403*9e1d4b82SJeremy L Thompson // 3D derivatives transpose
404*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
405*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D>
406*9e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
407*9e1d4b82SJeremy L Thompson                                                CeedScalar *__restrict__ r_C) {
408*9e1d4b82SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
409*9e1d4b82SJeremy L Thompson     for (CeedInt k = 0; k < Q_1D; k++) {
410*9e1d4b82SJeremy L Thompson       CeedScalar buffer[Q_1D];
411*9e1d4b82SJeremy L Thompson       CeedScalar chebyshev_x[Q_1D];
412*9e1d4b82SJeremy L Thompson 
413*9e1d4b82SJeremy L Thompson       // Clear shared memory
414*9e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
415*9e1d4b82SJeremy L Thompson       __syncthreads();
416*9e1d4b82SJeremy L Thompson       for (CeedInt dim = 0; dim < 3; dim++) {
417*9e1d4b82SJeremy L Thompson         // Contract y and z direction
418*9e1d4b82SJeremy L Thompson         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
419*9e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
420*9e1d4b82SJeremy L Thompson         const CeedScalar z = chebyshev_x[k];
421*9e1d4b82SJeremy L Thompson 
422*9e1d4b82SJeremy L Thompson         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
423*9e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
424*9e1d4b82SJeremy L Thompson         for (CeedInt i = 0; i < Q_1D; i++) {
425*9e1d4b82SJeremy L Thompson           buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
426*9e1d4b82SJeremy L Thompson         }
427*9e1d4b82SJeremy L Thompson         // Contract x direction
428*9e1d4b82SJeremy L Thompson         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
429*9e1d4b82SJeremy L Thompson         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
430*9e1d4b82SJeremy L Thompson         if (p < NUM_POINTS) {
431*9e1d4b82SJeremy L Thompson           for (CeedInt i = 0; i < Q_1D; i++) {
432*9e1d4b82SJeremy L Thompson             // Note: shifting to avoid atomic adds
433*9e1d4b82SJeremy L Thompson             const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
434*9e1d4b82SJeremy L Thompson 
435*9e1d4b82SJeremy L Thompson             for (CeedInt j = 0; j < Q_1D; j++) {
436*9e1d4b82SJeremy L Thompson               const CeedInt jj = ((j + p) % Q_1D);
437*9e1d4b82SJeremy L Thompson 
438*9e1d4b82SJeremy L Thompson               atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
439*9e1d4b82SJeremy L Thompson             }
440*9e1d4b82SJeremy L Thompson           }
441*9e1d4b82SJeremy L Thompson         }
442*9e1d4b82SJeremy L Thompson       }
443*9e1d4b82SJeremy L Thompson       // Pull from shared to register
444*9e1d4b82SJeremy L Thompson       __syncthreads();
445*9e1d4b82SJeremy L Thompson       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
446*9e1d4b82SJeremy L Thompson     }
447*9e1d4b82SJeremy L Thompson   }
448*9e1d4b82SJeremy L Thompson }
449