xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h (revision 343e3094792a64f9c2da70ef2256f98e7dc173cf)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis AtPoints templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Chebyshev values
14 //------------------------------------------------------------------------------
15 template <int Q_1D>
16 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
17   chebyshev_x[0] = 1.0;
18   chebyshev_x[1] = 2 * x;
19   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
20 }
21 
22 template <int Q_1D>
23 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
24   CeedScalar chebyshev_x[3];
25 
26   chebyshev_x[1]  = 1.0;
27   chebyshev_x[2]  = 2 * x;
28   chebyshev_dx[0] = 0.0;
29   chebyshev_dx[1] = 2.0;
30   for (CeedInt i = 2; i < Q_1D; i++) {
31     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
32     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
33   }
34 }
35 
36 //------------------------------------------------------------------------------
37 // 1D
38 //------------------------------------------------------------------------------
39 
40 //------------------------------------------------------------------------------
41 // 1D interpolate to points
42 //------------------------------------------------------------------------------
43 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
44 inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
45                                         CeedScalar *__restrict__ r_V) {
46   CeedScalar chebyshev_x[Q_1D];
47 
48   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
49   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
50   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
51     // Load coefficients
52     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
53     __syncthreads();
54     // Contract x direction
55     for (CeedInt i = 0; i < Q_1D; i++) {
56       r_V[comp] += chebyshev_x[i] * data.slice[i];
57     }
58   }
59 }
60 
61 //------------------------------------------------------------------------------
62 // 1D interpolate transpose
63 //------------------------------------------------------------------------------
64 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
65 inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
66                                                  CeedScalar *__restrict__ r_C) {
67   CeedScalar chebyshev_x[Q_1D];
68 
69   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
70   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
71     // Clear shared memory
72     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
73     __syncthreads();
74     // Contract x direction
75     if (p < NUM_POINTS) {
76       for (CeedInt i = 0; i < Q_1D; i++) {
77         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
78       }
79     }
80     // Pull from shared to register
81     __syncthreads();
82     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
83   }
84 }
85 
86 //------------------------------------------------------------------------------
87 // 1D derivatives at points
88 //------------------------------------------------------------------------------
89 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
90 inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
91                                       CeedScalar *__restrict__ r_V) {
92   CeedScalar chebyshev_x[Q_1D];
93 
94   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
95   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
96   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
97     // Load coefficients
98     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
99     __syncthreads();
100     // Contract x direction
101     for (CeedInt i = 0; i < Q_1D; i++) {
102       r_V[comp] += chebyshev_x[i] * data.slice[i];
103     }
104   }
105 }
106 
107 //------------------------------------------------------------------------------
108 // 1D derivatives transpose
109 //------------------------------------------------------------------------------
110 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
111 inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
112                                                CeedScalar *__restrict__ r_C) {
113   CeedScalar chebyshev_x[Q_1D];
114 
115   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
116   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
117     // Clear shared memory
118     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
119     __syncthreads();
120     // Contract x direction
121     if (p < NUM_POINTS) {
122       for (CeedInt i = 0; i < Q_1D; i++) {
123         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
124       }
125     }
126     // Pull from shared to register
127     __syncthreads();
128     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
129   }
130 }
131 
132 //------------------------------------------------------------------------------
133 // 2D
134 //------------------------------------------------------------------------------
135 
136 //------------------------------------------------------------------------------
137 // 2D interpolate to points
138 //------------------------------------------------------------------------------
139 template <int NUM_COMP, int NUM_POINTS, int Q_1D>
140 inline __device__ void InterpAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
141                                              const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, CeedScalar *__restrict__ r_V) {
142   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
143   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
144     CeedScalar buffer[Q_1D];
145     CeedScalar chebyshev_x[Q_1D];
146 
147     // Load coefficients
148     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
149     __syncthreads();
150     // Contract x direction
151     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
152     for (CeedInt i = 0; i < Q_1D; i++) {
153       buffer[i] = 0.0;
154       for (CeedInt j = 0; j < Q_1D; j++) {
155         buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
156       }
157     }
158     // Contract y direction
159     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
160     for (CeedInt i = 0; i < Q_1D; i++) {
161       r_V[comp] += chebyshev_x[i] * buffer[i];
162     }
163   }
164 }
165 
166 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
167 inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
168                                         CeedScalar *__restrict__ r_V) {
169   InterpAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
170 }
171 
172 //------------------------------------------------------------------------------
173 // 2D interpolate transpose
174 //------------------------------------------------------------------------------
175 template <int NUM_COMP, int NUM_POINTS, int Q_1D>
176 inline __device__ void InterpTransposeAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
177                                                       const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, CeedScalar *__restrict__ r_C) {
178   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
179     CeedScalar buffer[Q_1D];
180     CeedScalar chebyshev_x[Q_1D];
181 
182     // Clear shared memory
183     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
184     __syncthreads();
185     // Contract y direction
186     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
187     for (CeedInt i = 0; i < Q_1D; i++) {
188       buffer[i] = chebyshev_x[i] * r_U[comp];
189     }
190     // Contract x direction
191     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
192     if (p < NUM_POINTS) {
193       for (CeedInt i = 0; i < Q_1D; i++) {
194         // Note: shifting to avoid atomic adds
195         const CeedInt ii = (i + t_id_x) % Q_1D;
196 
197         for (CeedInt j = 0; j < Q_1D; j++) {
198           const CeedInt jj = (j + t_id_y) % Q_1D;
199 
200           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
201         }
202       }
203     }
204     // Pull from shared to register
205     __syncthreads();
206     if (t_id_x < Q_1D && t_id_y < Q_1D) r_C[comp] += data.slice[t_id_x + t_id_y * Q_1D];
207   }
208 }
209 
210 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
211 inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
212                                                  CeedScalar *__restrict__ r_C) {
213   InterpTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
214 }
215 
216 //------------------------------------------------------------------------------
217 // 2D derivatives at points
218 //------------------------------------------------------------------------------
219 template <int NUM_COMP, int NUM_POINTS, int Q_1D>
220 inline __device__ void GradAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
221                                            const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, CeedScalar *__restrict__ r_V) {
222   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
223   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
224     CeedScalar buffer[Q_1D];
225     CeedScalar chebyshev_x[Q_1D];
226 
227     // Load coefficients
228     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
229     __syncthreads();
230     for (CeedInt dim = 0; dim < 2; dim++) {
231       // Contract x direction
232       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
233       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
234       for (CeedInt i = 0; i < Q_1D; i++) {
235         buffer[i] = 0.0;
236         for (CeedInt j = 0; j < Q_1D; j++) {
237           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
238         }
239       }
240       // Contract y direction
241       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
242       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
243       for (CeedInt i = 0; i < Q_1D; i++) {
244         r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
245       }
246     }
247   }
248 }
249 
250 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
251 inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
252                                       CeedScalar *__restrict__ r_V) {
253   GradAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
254 }
255 
256 //------------------------------------------------------------------------------
257 // 2D derivatives transpose
258 //------------------------------------------------------------------------------
259 template <int NUM_COMP, int NUM_POINTS, int Q_1D>
260 inline __device__ void GradTransposeAtPoints2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedInt p,
261                                                     const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, CeedScalar *__restrict__ r_C) {
262   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
263     CeedScalar buffer[Q_1D];
264     CeedScalar chebyshev_x[Q_1D];
265 
266     // Clear shared memory
267     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
268     __syncthreads();
269     for (CeedInt dim = 0; dim < 2; dim++) {
270       // Contract y direction
271       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
272       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
273       for (CeedInt i = 0; i < Q_1D; i++) {
274         buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
275       }
276       // Contract x direction
277       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
278       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
279       if (p < NUM_POINTS) {
280         for (CeedInt i = 0; i < Q_1D; i++) {
281           // Note: shifting to avoid atomic adds
282           const CeedInt ii = (i + t_id_x) % Q_1D;
283 
284           for (CeedInt j = 0; j < Q_1D; j++) {
285             const CeedInt jj = (j + t_id_y) % Q_1D;
286 
287             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
288           }
289         }
290       }
291     }
292     // Pull from shared to register
293     __syncthreads();
294     if (t_id_x < Q_1D && t_id_y < Q_1D) r_C[comp] += data.slice[t_id_x + t_id_y * Q_1D];
295   }
296 }
297 
298 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
299 inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
300                                                CeedScalar *__restrict__ r_C) {
301   GradTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
302 }
303 
304 //------------------------------------------------------------------------------
305 // 3D
306 //------------------------------------------------------------------------------
307 
308 //------------------------------------------------------------------------------
309 // 3D interpolate to points
310 //------------------------------------------------------------------------------
311 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
312 inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
313                                         CeedScalar *__restrict__ r_V) {
314   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
315   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
316     for (CeedInt k = 0; k < Q_1D; k++) {
317       CeedScalar buffer[Q_1D];
318       CeedScalar chebyshev_x[Q_1D];
319 
320       // Load coefficients
321       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
322       __syncthreads();
323       // Contract x direction
324       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
325       for (CeedInt i = 0; i < Q_1D; i++) {
326         buffer[i] = 0.0;
327         for (CeedInt j = 0; j < Q_1D; j++) {
328           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
329         }
330       }
331       // Contract y and z direction
332       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
333       const CeedScalar z = chebyshev_x[k];
334 
335       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
336       for (CeedInt i = 0; i < Q_1D; i++) {
337         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
338       }
339     }
340   }
341 }
342 
343 //------------------------------------------------------------------------------
344 // 3D interpolate transpose
345 //------------------------------------------------------------------------------
346 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
347 inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
348                                                  CeedScalar *__restrict__ r_C) {
349   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
350     for (CeedInt k = 0; k < Q_1D; k++) {
351       CeedScalar buffer[Q_1D];
352       CeedScalar chebyshev_x[Q_1D];
353 
354       // Clear shared memory
355       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
356       __syncthreads();
357       // Contract y and z direction
358       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
359       const CeedScalar z = chebyshev_x[k];
360 
361       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
362       for (CeedInt i = 0; i < Q_1D; i++) {
363         buffer[i] = chebyshev_x[i] * r_U[comp] * z;
364       }
365       // Contract x direction
366       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
367       if (p < NUM_POINTS) {
368         for (CeedInt i = 0; i < Q_1D; i++) {
369           // Note: shifting to avoid atomic adds
370           const CeedInt ii = (i + data.t_id_x) % Q_1D;
371 
372           for (CeedInt j = 0; j < Q_1D; j++) {
373             const CeedInt jj = (j + data.t_id_y) % Q_1D;
374 
375             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
376           }
377         }
378       }
379       // Pull from shared to register
380       __syncthreads();
381       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
382     }
383   }
384 }
385 
386 //------------------------------------------------------------------------------
387 // 3D derivatives at points
388 //------------------------------------------------------------------------------
389 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
390 inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
391                                       CeedScalar *__restrict__ r_V) {
392   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
393   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
394     for (CeedInt k = 0; k < Q_1D; k++) {
395       CeedScalar buffer[Q_1D];
396       CeedScalar chebyshev_x[Q_1D];
397 
398       // Load coefficients
399       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
400       __syncthreads();
401       for (CeedInt dim = 0; dim < 3; dim++) {
402         // Contract x direction
403         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
404         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
405         for (CeedInt i = 0; i < Q_1D; i++) {
406           buffer[i] = 0.0;
407           for (CeedInt j = 0; j < Q_1D; j++) {
408             buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
409           }
410         }
411         // Contract y and z direction
412         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
413         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
414         const CeedScalar z = chebyshev_x[k];
415 
416         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
417         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
418         for (CeedInt i = 0; i < Q_1D; i++) {
419           r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
420         }
421       }
422     }
423   }
424 }
425 
426 //------------------------------------------------------------------------------
427 // 3D derivatives transpose
428 //------------------------------------------------------------------------------
429 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
430 inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
431                                                CeedScalar *__restrict__ r_C) {
432   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
433     for (CeedInt k = 0; k < Q_1D; k++) {
434       CeedScalar buffer[Q_1D];
435       CeedScalar chebyshev_x[Q_1D];
436 
437       // Clear shared memory
438       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
439       __syncthreads();
440       for (CeedInt dim = 0; dim < 3; dim++) {
441         // Contract y and z direction
442         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
443         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
444         const CeedScalar z = chebyshev_x[k];
445 
446         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
447         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
448         for (CeedInt i = 0; i < Q_1D; i++) {
449           buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
450         }
451         // Contract x direction
452         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
453         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
454         if (p < NUM_POINTS) {
455           for (CeedInt i = 0; i < Q_1D; i++) {
456             // Note: shifting to avoid atomic adds
457             const CeedInt ii = (i + data.t_id_x) % Q_1D;
458 
459             for (CeedInt j = 0; j < Q_1D; j++) {
460               const CeedInt jj = (j + data.t_id_y) % Q_1D;
461 
462               atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
463             }
464           }
465         }
466       }
467       // Pull from shared to register
468       __syncthreads();
469       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
470     }
471   }
472 }
473