xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h (revision 343e3094792a64f9c2da70ef2256f98e7dc173cf)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis AtPoints templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Chebyshev values
14 //------------------------------------------------------------------------------
15 template <int Q_1D>
16 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
17   chebyshev_x[0] = 1.0;
18   chebyshev_x[1] = 2 * x;
19   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
20 }
21 
22 template <int Q_1D>
23 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
24   CeedScalar chebyshev_x[3];
25 
26   chebyshev_x[1]  = 1.0;
27   chebyshev_x[2]  = 2 * x;
28   chebyshev_dx[0] = 0.0;
29   chebyshev_dx[1] = 2.0;
30   for (CeedInt i = 2; i < Q_1D; i++) {
31     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
32     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
33   }
34 }
35 
36 //------------------------------------------------------------------------------
37 // 1D
38 //------------------------------------------------------------------------------
39 
40 //------------------------------------------------------------------------------
41 // 1D interpolate to points
42 //------------------------------------------------------------------------------
43 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
44 inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
45                                         CeedScalar *__restrict__ r_V) {
46   CeedScalar chebyshev_x[Q_1D];
47 
48   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
49   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
50   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
51     // Load coefficients
52     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
53     __syncthreads();
54     // Contract x direction
55     for (CeedInt i = 0; i < Q_1D; i++) {
56       r_V[comp] += chebyshev_x[i] * data.slice[i];
57     }
58   }
59 }
60 
61 //------------------------------------------------------------------------------
62 // 1D interpolate transpose
63 //------------------------------------------------------------------------------
64 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
65 inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
66                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
67   CeedScalar chebyshev_x[Q_1D];
68 
69   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
70   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
71     // Clear shared memory
72     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
73     __syncthreads();
74     // Contract x direction
75     if (p < NUM_POINTS) {
76       for (CeedInt i = 0; i < Q_1D; i++) {
77         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
78       }
79     }
80     // Pull from shared to register
81     __syncthreads();
82     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
83   }
84 }
85 
86 //------------------------------------------------------------------------------
87 // 1D derivatives at points
88 //------------------------------------------------------------------------------
89 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
90 inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
91                                       CeedScalar *__restrict__ r_V) {
92   CeedScalar chebyshev_x[Q_1D];
93 
94   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
95   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
96   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
97     // Load coefficients
98     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
99     __syncthreads();
100     // Contract x direction
101     for (CeedInt i = 0; i < Q_1D; i++) {
102       r_V[comp] += chebyshev_x[i] * data.slice[i];
103     }
104   }
105 }
106 
107 //------------------------------------------------------------------------------
108 // 1D derivatives transpose
109 //------------------------------------------------------------------------------
110 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
111 inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
112                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
113   CeedScalar chebyshev_x[Q_1D];
114 
115   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
116   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
117     // Clear shared memory
118     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
119     __syncthreads();
120     // Contract x direction
121     if (p < NUM_POINTS) {
122       for (CeedInt i = 0; i < Q_1D; i++) {
123         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
124       }
125     }
126     // Pull from shared to register
127     __syncthreads();
128     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
129   }
130 }
131 
132 //------------------------------------------------------------------------------
133 // 2D
134 //------------------------------------------------------------------------------
135 
136 //------------------------------------------------------------------------------
137 // 2D interpolate to points
138 //------------------------------------------------------------------------------
139 template <int NUM_COMP, int NUM_POINTS, int Q_1D>
140 inline __device__ void InterpAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
141                                              const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_V) {
142   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
143   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
144     CeedScalar buffer[Q_1D];
145     CeedScalar chebyshev_x[Q_1D];
146 
147     // Load coefficients
148     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
149     __syncthreads();
150     // Contract x direction
151     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
152     for (CeedInt i = 0; i < Q_1D; i++) {
153       buffer[i] = 0.0;
154       for (CeedInt j = 0; j < Q_1D; j++) {
155         buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
156       }
157     }
158     // Contract y direction
159     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
160     for (CeedInt i = 0; i < Q_1D; i++) {
161       r_V[comp] += chebyshev_x[i] * buffer[i];
162     }
163   }
164 }
165 
166 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
167 inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
168                                         CeedScalar *__restrict__ r_V) {
169   InterpAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
170 }
171 
172 //------------------------------------------------------------------------------
173 // 2D interpolate transpose
174 //------------------------------------------------------------------------------
175 template <int NUM_COMP, int NUM_POINTS, int Q_1D>
176 inline __device__ void InterpTransposeAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
177                                                       const CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ r_X,
178                                                       CeedScalar *__restrict__ r_C) {
179   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
180     CeedScalar buffer[Q_1D];
181     CeedScalar chebyshev_x[Q_1D];
182 
183     // Clear shared memory
184     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
185     __syncthreads();
186     // Contract y direction
187     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
188     for (CeedInt i = 0; i < Q_1D; i++) {
189       buffer[i] = chebyshev_x[i] * r_U[comp];
190     }
191     // Contract x direction
192     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
193     if (p < NUM_POINTS) {
194       for (CeedInt i = 0; i < Q_1D; i++) {
195         // Note: shifting to avoid atomic adds
196         const CeedInt ii = (i + t_id_x) % Q_1D;
197 
198         for (CeedInt j = 0; j < Q_1D; j++) {
199           const CeedInt jj = (j + t_id_y) % Q_1D;
200 
201           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
202         }
203       }
204     }
205     // Pull from shared to register
206     __syncthreads();
207     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
208   }
209 }
210 
211 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
212 inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
213                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
214   InterpTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
215 }
216 
217 //------------------------------------------------------------------------------
218 // 2D derivatives at points
219 //------------------------------------------------------------------------------
220 template <int NUM_COMP, int NUM_POINTS, int Q_1D>
221 inline __device__ void GradAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
222                                            const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_V) {
223   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
224   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
225     CeedScalar buffer[Q_1D];
226     CeedScalar chebyshev_x[Q_1D];
227 
228     // Load coefficients
229     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp];
230     __syncthreads();
231     for (CeedInt dim = 0; dim < 2; dim++) {
232       // Contract x direction
233       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
234       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
235       for (CeedInt i = 0; i < Q_1D; i++) {
236         buffer[i] = 0.0;
237         for (CeedInt j = 0; j < Q_1D; j++) {
238           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
239         }
240       }
241       // Contract y direction
242       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
243       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
244       for (CeedInt i = 0; i < Q_1D; i++) {
245         r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
246       }
247     }
248   }
249 }
250 
251 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
252 inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
253                                       CeedScalar *__restrict__ r_V) {
254   GradAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V);
255 }
256 
257 //------------------------------------------------------------------------------
258 // 2D derivatives transpose
259 //------------------------------------------------------------------------------
260 template <int NUM_COMP, int NUM_POINTS, int Q_1D>
261 inline __device__ void GradTransposeAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p,
262                                                     const CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ r_X,
263                                                     CeedScalar *__restrict__ r_C) {
264   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
265     CeedScalar buffer[Q_1D];
266     CeedScalar chebyshev_x[Q_1D];
267 
268     // Clear shared memory
269     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0;
270     __syncthreads();
271     for (CeedInt dim = 0; dim < 2; dim++) {
272       // Contract y direction
273       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
274       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
275       for (CeedInt i = 0; i < Q_1D; i++) {
276         buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
277       }
278       // Contract x direction
279       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
280       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
281       if (p < NUM_POINTS) {
282         for (CeedInt i = 0; i < Q_1D; i++) {
283           // Note: shifting to avoid atomic adds
284           const CeedInt ii = (i + t_id_x) % Q_1D;
285 
286           for (CeedInt j = 0; j < Q_1D; j++) {
287             const CeedInt jj = (j + t_id_y) % Q_1D;
288 
289             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
290           }
291         }
292       }
293     }
294     // Pull from shared to register
295     __syncthreads();
296     if (t_id_x < Q_1D && t_id_y < Q_1D) r_C[comp] += data.slice[t_id_x + t_id_y * Q_1D];
297   }
298 }
299 
300 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
301 inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
302                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
303   GradTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C);
304 }
305 
306 //------------------------------------------------------------------------------
307 // 3D
308 //------------------------------------------------------------------------------
309 
310 //------------------------------------------------------------------------------
311 // 3D interpolate to points
312 //------------------------------------------------------------------------------
313 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
314 inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
315                                         CeedScalar *__restrict__ r_V) {
316   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
317   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
318     for (CeedInt k = 0; k < Q_1D; k++) {
319       CeedScalar buffer[Q_1D];
320       CeedScalar chebyshev_x[Q_1D];
321 
322       // Load coefficients
323       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
324       __syncthreads();
325       // Contract x direction
326       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
327       for (CeedInt i = 0; i < Q_1D; i++) {
328         buffer[i] = 0.0;
329         for (CeedInt j = 0; j < Q_1D; j++) {
330           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
331         }
332       }
333       // Contract y and z direction
334       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
335       const CeedScalar z = chebyshev_x[k];
336 
337       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
338       for (CeedInt i = 0; i < Q_1D; i++) {
339         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
340       }
341     }
342   }
343 }
344 
345 //------------------------------------------------------------------------------
346 // 3D interpolate transpose
347 //------------------------------------------------------------------------------
348 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
349 inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
350                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
351   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
352     for (CeedInt k = 0; k < Q_1D; k++) {
353       CeedScalar buffer[Q_1D];
354       CeedScalar chebyshev_x[Q_1D];
355 
356       // Clear shared memory
357       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
358       __syncthreads();
359       // Contract y and z direction
360       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
361       const CeedScalar z = chebyshev_x[k];
362 
363       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
364       for (CeedInt i = 0; i < Q_1D; i++) {
365         buffer[i] = chebyshev_x[i] * r_U[comp] * z;
366       }
367       // Contract x direction
368       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
369       if (p < NUM_POINTS) {
370         for (CeedInt i = 0; i < Q_1D; i++) {
371           // Note: shifting to avoid atomic adds
372           const CeedInt ii = (i + data.t_id_x) % Q_1D;
373 
374           for (CeedInt j = 0; j < Q_1D; j++) {
375             const CeedInt jj = (j + data.t_id_y) % Q_1D;
376 
377             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
378           }
379         }
380       }
381       // Pull from shared to register
382       __syncthreads();
383       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
384     }
385   }
386 }
387 
388 //------------------------------------------------------------------------------
389 // 3D derivatives at points
390 //------------------------------------------------------------------------------
391 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
392 inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
393                                       CeedScalar *__restrict__ r_V) {
394   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
395   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
396     for (CeedInt k = 0; k < Q_1D; k++) {
397       CeedScalar buffer[Q_1D];
398       CeedScalar chebyshev_x[Q_1D];
399 
400       // Load coefficients
401       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
402       __syncthreads();
403       for (CeedInt dim = 0; dim < 3; dim++) {
404         // Contract x direction
405         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
406         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
407         for (CeedInt i = 0; i < Q_1D; i++) {
408           buffer[i] = 0.0;
409           for (CeedInt j = 0; j < Q_1D; j++) {
410             buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
411           }
412         }
413         // Contract y and z direction
414         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
415         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
416         const CeedScalar z = chebyshev_x[k];
417 
418         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
419         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
420         for (CeedInt i = 0; i < Q_1D; i++) {
421           r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
422         }
423       }
424     }
425   }
426 }
427 
428 //------------------------------------------------------------------------------
429 // 3D derivatives transpose
430 //------------------------------------------------------------------------------
431 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
432 inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
433                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
434   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
435     for (CeedInt k = 0; k < Q_1D; k++) {
436       CeedScalar buffer[Q_1D];
437       CeedScalar chebyshev_x[Q_1D];
438 
439       // Clear shared memory
440       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
441       __syncthreads();
442       for (CeedInt dim = 0; dim < 3; dim++) {
443         // Contract y and z direction
444         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
445         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
446         const CeedScalar z = chebyshev_x[k];
447 
448         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
449         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
450         for (CeedInt i = 0; i < Q_1D; i++) {
451           buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
452         }
453         // Contract x direction
454         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
455         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
456         if (p < NUM_POINTS) {
457           for (CeedInt i = 0; i < Q_1D; i++) {
458             // Note: shifting to avoid atomic adds
459             const CeedInt ii = (i + data.t_id_x) % Q_1D;
460 
461             for (CeedInt j = 0; j < Q_1D; j++) {
462               const CeedInt jj = (j + data.t_id_y) % Q_1D;
463 
464               atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
465             }
466           }
467         }
468       }
469       // Pull from shared to register
470       __syncthreads();
471       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
472     }
473   }
474 }
475