xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h (revision f36e753100926c940e21f506e95c0b0794520501)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis AtPoints templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Chebyshev values
14 //------------------------------------------------------------------------------
15 template <int Q_1D>
16 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
17   chebyshev_x[0] = 1.0;
18   chebyshev_x[1] = 2 * x;
19   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
20 }
21 
22 template <int Q_1D>
23 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
24   CeedScalar chebyshev_x[3];
25 
26   chebyshev_x[1]  = 1.0;
27   chebyshev_x[2]  = 2 * x;
28   chebyshev_dx[0] = 0.0;
29   chebyshev_dx[1] = 2.0;
30   for (CeedInt i = 2; i < Q_1D; i++) {
31     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
32     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
33   }
34 }
35 
36 //------------------------------------------------------------------------------
37 // 1D
38 //------------------------------------------------------------------------------
39 
40 //------------------------------------------------------------------------------
41 // 1D interpolate to points
42 //------------------------------------------------------------------------------
43 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
44 inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
45                                         CeedScalar *__restrict__ r_V) {
46   CeedScalar chebyshev_x[Q_1D];
47 
48   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
49   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
50   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
51     // Load coefficients
52     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
53     __syncthreads();
54     // Contract x direction
55     for (CeedInt i = 0; i < Q_1D; i++) {
56       r_V[comp] += chebyshev_x[i] * data.slice[i];
57     }
58   }
59 }
60 
61 //------------------------------------------------------------------------------
62 // 1D interpolate transpose
63 //------------------------------------------------------------------------------
64 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
65 inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
66                                                  CeedScalar *__restrict__ r_C) {
67   CeedScalar chebyshev_x[Q_1D];
68 
69   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
70   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
71     // Clear shared memory
72     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
73     __syncthreads();
74     // Contract x direction
75     if (p < NUM_POINTS) {
76       for (CeedInt i = 0; i < Q_1D; i++) {
77         atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
78       }
79     }
80     // Pull from shared to register
81     __syncthreads();
82     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
83   }
84 }
85 
86 //------------------------------------------------------------------------------
87 // 1D derivatives at points
88 //------------------------------------------------------------------------------
89 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
90 inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
91                                       CeedScalar *__restrict__ r_V) {
92   CeedScalar chebyshev_x[Q_1D];
93 
94   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
95   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
96   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
97     // Load coefficients
98     __syncthreads();
99     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
100     __syncthreads();
101     // Contract x direction
102     for (CeedInt i = 0; i < Q_1D; i++) {
103       r_V[comp] += chebyshev_x[i] * data.slice[i];
104     }
105   }
106 }
107 
108 //------------------------------------------------------------------------------
109 // 1D derivatives transpose
110 //------------------------------------------------------------------------------
111 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
112 inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
113                                                CeedScalar *__restrict__ r_C) {
114   CeedScalar chebyshev_x[Q_1D];
115 
116   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
117   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
118     // Clear shared memory
119     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
120     __syncthreads();
121     // Contract x direction
122     if (p < NUM_POINTS) {
123       for (CeedInt i = 0; i < Q_1D; i++) {
124         atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
125       }
126     }
127     // Pull from shared to register
128     __syncthreads();
129     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
130   }
131 }
132 
133 //------------------------------------------------------------------------------
134 // 2D
135 //------------------------------------------------------------------------------
136 
137 //------------------------------------------------------------------------------
138 // 2D interpolate to points
139 //------------------------------------------------------------------------------
140 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
141 inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
142                                         CeedScalar *__restrict__ r_V) {
143   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
144   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
145     CeedScalar buffer[Q_1D];
146     CeedScalar chebyshev_x[Q_1D];
147 
148     // Load coefficients
149     __syncthreads();
150     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
151     __syncthreads();
152     // Contract x direction
153     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
154     for (CeedInt i = 0; i < Q_1D; i++) {
155       buffer[i] = 0.0;
156       for (CeedInt j = 0; j < Q_1D; j++) {
157         buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
158       }
159     }
160     // Contract y direction
161     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
162     for (CeedInt i = 0; i < Q_1D; i++) {
163       r_V[comp] += chebyshev_x[i] * buffer[i];
164     }
165   }
166 }
167 
168 //------------------------------------------------------------------------------
169 // 2D interpolate transpose
170 //------------------------------------------------------------------------------
171 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
172 inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
173                                                  CeedScalar *__restrict__ r_C) {
174   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
175     CeedScalar buffer[Q_1D];
176     CeedScalar chebyshev_x[Q_1D];
177 
178     // Clear shared memory
179     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
180     __syncthreads();
181     // Contract y direction
182     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
183     const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
184 
185     for (CeedInt i = 0; i < Q_1D; i++) {
186       buffer[i] = chebyshev_x[i] * r_u;
187     }
188     // Contract x direction
189     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
190     for (CeedInt i = 0; i < Q_1D; i++) {
191       // Note: shifting to avoid atomic adds
192       const CeedInt ii = (i + data.t_id_y) % Q_1D;
193 
194       for (CeedInt j = 0; j < Q_1D; j++) {
195         const CeedInt jj = (j + data.t_id_x) % Q_1D;
196 
197         if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
198       }
199     }
200     // Pull from shared to register
201     __syncthreads();
202     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
203   }
204 }
205 
206 //------------------------------------------------------------------------------
207 // 2D derivatives at points
208 //------------------------------------------------------------------------------
209 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
210 inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
211                                       CeedScalar *__restrict__ r_V) {
212   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
213   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
214     CeedScalar buffer[Q_1D];
215     CeedScalar chebyshev_x[Q_1D];
216 
217     // Load coefficients
218     __syncthreads();
219     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
220     __syncthreads();
221     for (CeedInt dim = 0; dim < 2; dim++) {
222       // Contract x direction
223       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
224       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
225       for (CeedInt i = 0; i < Q_1D; i++) {
226         buffer[i] = 0.0;
227         for (CeedInt j = 0; j < Q_1D; j++) {
228           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
229         }
230       }
231       // Contract y direction
232       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
233       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
234       for (CeedInt i = 0; i < Q_1D; i++) {
235         r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
236       }
237     }
238   }
239 }
240 
241 //------------------------------------------------------------------------------
242 // 2D derivatives transpose
243 //------------------------------------------------------------------------------
244 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
245 inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
246                                                CeedScalar *__restrict__ r_C) {
247   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
248     CeedScalar buffer[Q_1D];
249     CeedScalar chebyshev_x[Q_1D];
250 
251     // Clear shared memory
252     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
253     __syncthreads();
254     for (CeedInt dim = 0; dim < 2; dim++) {
255       // Contract y direction
256       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
257       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
258       const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
259 
260       for (CeedInt i = 0; i < Q_1D; i++) {
261         buffer[i] = chebyshev_x[i] * r_u;
262       }
263       // Contract x direction
264       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
265       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
266       for (CeedInt i = 0; i < Q_1D; i++) {
267         // Note: shifting to avoid atomic adds
268         const CeedInt ii = (i + data.t_id_y) % Q_1D;
269 
270         for (CeedInt j = 0; j < Q_1D; j++) {
271           const CeedInt jj = (j + data.t_id_x) % Q_1D;
272 
273           if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
274         }
275       }
276     }
277     // Pull from shared to register
278     __syncthreads();
279     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
280   }
281 }
282 
283 //------------------------------------------------------------------------------
284 // 3D
285 //------------------------------------------------------------------------------
286 
287 //------------------------------------------------------------------------------
288 // 3D interpolate to points
289 //------------------------------------------------------------------------------
290 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
291 inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
292                                         CeedScalar *__restrict__ r_V) {
293   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
294   for (CeedInt k = 0; k < Q_1D; k++) {
295     CeedScalar buffer[Q_1D];
296     CeedScalar chebyshev_x[Q_1D];
297 
298     // Get z contraction value
299     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
300     const CeedScalar z = chebyshev_x[k];
301 
302     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
303       // Load coefficients
304       __syncthreads();
305       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
306       __syncthreads();
307       // Contract x direction
308       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
309       for (CeedInt i = 0; i < Q_1D; i++) {
310         buffer[i] = 0.0;
311         for (CeedInt j = 0; j < Q_1D; j++) {
312           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
313         }
314       }
315       // Contract y and z direction
316       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
317       for (CeedInt i = 0; i < Q_1D; i++) {
318         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
319       }
320     }
321   }
322 }
323 
324 //------------------------------------------------------------------------------
325 // 3D interpolate transpose
326 //------------------------------------------------------------------------------
327 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
328 inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
329                                                  CeedScalar *__restrict__ r_C) {
330   for (CeedInt k = 0; k < Q_1D; k++) {
331     CeedScalar buffer[Q_1D];
332     CeedScalar chebyshev_x[Q_1D];
333 
334     // Get z contraction value
335     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
336     const CeedScalar z = chebyshev_x[k];
337 
338     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
339       // Clear shared memory
340       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
341       __syncthreads();
342       // Contract y and z direction
343       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
344       const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
345 
346       for (CeedInt i = 0; i < Q_1D; i++) {
347         buffer[i] = chebyshev_x[i] * r_u * z;
348       }
349       // Contract x direction
350       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
351       for (CeedInt i = 0; i < Q_1D; i++) {
352         // Note: shifting to avoid atomic adds
353         const CeedInt ii = (i + data.t_id_y) % Q_1D;
354 
355         for (CeedInt j = 0; j < Q_1D; j++) {
356           const CeedInt jj = (j + data.t_id_x) % Q_1D;
357 
358           if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
359         }
360       }
361       // Pull from shared to register
362       __syncthreads();
363       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
364     }
365   }
366 }
367 
368 //------------------------------------------------------------------------------
369 // 3D derivatives at points
370 //------------------------------------------------------------------------------
371 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
372 inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
373                                       CeedScalar *__restrict__ r_V) {
374   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
375   for (CeedInt k = 0; k < Q_1D; k++) {
376     CeedScalar buffer[Q_1D];
377     CeedScalar chebyshev_x[Q_1D];
378 
379     // Get z contraction values
380     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
381     const CeedScalar z = chebyshev_x[k];
382 
383     ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
384     const CeedScalar dz = chebyshev_x[k];
385 
386     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
387       // Load coefficients
388       __syncthreads();
389       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
390       __syncthreads();
391       // Gradient directions
392       for (CeedInt dim = 0; dim < 3; dim++) {
393         // Contract x direction
394         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
395         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
396         for (CeedInt i = 0; i < Q_1D; i++) {
397           buffer[i] = 0.0;
398           for (CeedInt j = 0; j < Q_1D; j++) {
399             buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
400           }
401         }
402         // Contract y and z direction
403         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
404         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
405         const CeedScalar zz = dim == 2 ? dz : z;
406 
407         for (CeedInt i = 0; i < Q_1D; i++) {
408           r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
409         }
410       }
411     }
412   }
413 }
414 
415 //------------------------------------------------------------------------------
416 // 3D derivatives transpose
417 //------------------------------------------------------------------------------
418 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
419 inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
420                                                CeedScalar *__restrict__ r_C) {
421   for (CeedInt k = 0; k < Q_1D; k++) {
422     CeedScalar buffer[Q_1D];
423     CeedScalar chebyshev_x[Q_1D];
424 
425     // Get z contraction values
426     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
427     const CeedScalar z = chebyshev_x[k];
428 
429     ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
430     const CeedScalar dz = chebyshev_x[k];
431 
432     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
433       // Clear shared memory
434       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
435       __syncthreads();
436       // Gradient directions
437       for (CeedInt dim = 0; dim < 3; dim++) {
438         // Contract y and z direction
439         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
440         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
441         const CeedScalar zz  = dim == 2 ? dz : z;
442         const CeedScalar r_u = (p < NUM_POINTS) ? r_U[comp + dim * NUM_COMP] : 0.0;
443 
444         for (CeedInt i = 0; i < Q_1D; i++) {
445           buffer[i] = chebyshev_x[i] * r_u * zz;
446         }
447 
448         // Contract x direction
449         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
450         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
451         for (CeedInt i = 0; i < Q_1D; i++) {
452           // Note: shifting to avoid atomic adds
453           const CeedInt ii = (i + data.t_id_y) % Q_1D;
454 
455           for (CeedInt j = 0; j < Q_1D; j++) {
456             const CeedInt jj = (j + data.t_id_x) % Q_1D;
457 
458             if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
459           }
460         }
461       }
462       // Pull from shared to register
463       __syncthreads();
464       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
465     }
466   }
467 }
468