xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h (revision f1f13db40bc3ea1cde6ca54b4e70359a9e16f116)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis AtPoints templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Chebyshev values
14 //------------------------------------------------------------------------------
15 template <int Q_1D>
16 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
17   chebyshev_x[0] = 1.0;
18   chebyshev_x[1] = 2 * x;
19   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
20 }
21 
22 template <int Q_1D>
23 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
24   CeedScalar chebyshev_x[3];
25 
26   chebyshev_x[1]  = 1.0;
27   chebyshev_x[2]  = 2 * x;
28   chebyshev_dx[0] = 0.0;
29   chebyshev_dx[1] = 2.0;
30   for (CeedInt i = 2; i < Q_1D; i++) {
31     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
32     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
33   }
34 }
35 
36 //------------------------------------------------------------------------------
37 // 1D
38 //------------------------------------------------------------------------------
39 
40 //------------------------------------------------------------------------------
41 // 1D interpolate to points
42 //------------------------------------------------------------------------------
43 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
44 inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
45                                         CeedScalar *__restrict__ r_V) {
46   CeedScalar chebyshev_x[Q_1D];
47 
48   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
49   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
50   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
51     // Load coefficients
52     __syncthreads();
53     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
54     __syncthreads();
55     // Contract x direction
56     for (CeedInt i = 0; i < Q_1D; i++) {
57       r_V[comp] += chebyshev_x[i] * data.slice[i];
58     }
59   }
60 }
61 
62 //------------------------------------------------------------------------------
63 // 1D interpolate transpose
64 //------------------------------------------------------------------------------
65 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
66 inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
67                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
68   CeedScalar chebyshev_x[Q_1D];
69 
70   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
71   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
72     // Clear shared memory
73     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
74     __syncthreads();
75     // Contract x direction
76     if (p < NUM_POINTS) {
77       for (CeedInt i = 0; i < Q_1D; i++) {
78         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
79       }
80     }
81     // Pull from shared to register
82     __syncthreads();
83     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
84   }
85 }
86 
87 //------------------------------------------------------------------------------
88 // 1D derivatives at points
89 //------------------------------------------------------------------------------
90 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
91 inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
92                                       CeedScalar *__restrict__ r_V) {
93   CeedScalar chebyshev_x[Q_1D];
94 
95   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
96   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
97   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
98     __syncthreads();
99     // Load coefficients
100     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
101     __syncthreads();
102     // Contract x direction
103     for (CeedInt i = 0; i < Q_1D; i++) {
104       r_V[comp] += chebyshev_x[i] * data.slice[i];
105     }
106   }
107 }
108 
109 //------------------------------------------------------------------------------
110 // 1D derivatives transpose
111 //------------------------------------------------------------------------------
112 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
113 inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
114                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
115   CeedScalar chebyshev_x[Q_1D];
116 
117   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
118   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
119     // Clear shared memory
120     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
121     __syncthreads();
122     // Contract x direction
123     if (p < NUM_POINTS) {
124       for (CeedInt i = 0; i < Q_1D; i++) {
125         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
126       }
127     }
128     // Pull from shared to register
129     __syncthreads();
130     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
131   }
132 }
133 
134 //------------------------------------------------------------------------------
135 // 2D
136 //------------------------------------------------------------------------------
137 
138 //------------------------------------------------------------------------------
139 // 2D interpolate to points
140 //------------------------------------------------------------------------------
141 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
142 inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
143                                         CeedScalar *__restrict__ r_V) {
144   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
145   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
146     CeedScalar buffer[Q_1D];
147     CeedScalar chebyshev_x[Q_1D];
148 
149     // Load coefficients
150     __syncthreads();
151     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
152     __syncthreads();
153     // Contract x direction
154     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
155     for (CeedInt i = 0; i < Q_1D; i++) {
156       buffer[i] = 0.0;
157       for (CeedInt j = 0; j < Q_1D; j++) {
158         buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
159       }
160     }
161     // Contract y direction
162     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
163     for (CeedInt i = 0; i < Q_1D; i++) {
164       r_V[comp] += chebyshev_x[i] * buffer[i];
165     }
166   }
167 }
168 
169 //------------------------------------------------------------------------------
170 // 2D interpolate transpose
171 //------------------------------------------------------------------------------
172 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
173 inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
174                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
175   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
176     CeedScalar buffer[Q_1D];
177     CeedScalar chebyshev_x[Q_1D];
178 
179     // Clear shared memory
180     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
181     __syncthreads();
182     // Contract y direction
183     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
184     for (CeedInt i = 0; i < Q_1D; i++) {
185       buffer[i] = chebyshev_x[i] * r_U[comp];
186     }
187     // Contract x direction
188     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
189     if (p < NUM_POINTS) {
190       for (CeedInt i = 0; i < Q_1D; i++) {
191         // Note: shifting to avoid atomic adds
192         const CeedInt ii = (i + data.t_id_x) % Q_1D;
193 
194         for (CeedInt j = 0; j < Q_1D; j++) {
195           const CeedInt jj = (j + data.t_id_y) % Q_1D;
196 
197           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
198         }
199       }
200     }
201     // Pull from shared to register
202     __syncthreads();
203     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
204   }
205 }
206 
207 //------------------------------------------------------------------------------
208 // 2D derivatives at points
209 //------------------------------------------------------------------------------
210 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
211 inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
212                                       CeedScalar *__restrict__ r_V) {
213   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
214   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
215     CeedScalar buffer[Q_1D];
216     CeedScalar chebyshev_x[Q_1D];
217 
218     // Load coefficients
219     __syncthreads();
220     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
221     __syncthreads();
222     for (CeedInt dim = 0; dim < 2; dim++) {
223       // Contract x direction
224       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
225       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
226       for (CeedInt i = 0; i < Q_1D; i++) {
227         buffer[i] = 0.0;
228         for (CeedInt j = 0; j < Q_1D; j++) {
229           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
230         }
231       }
232       // Contract y direction
233       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
234       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
235       for (CeedInt i = 0; i < Q_1D; i++) {
236         r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
237       }
238     }
239   }
240 }
241 
242 //------------------------------------------------------------------------------
243 // 2D derivatives transpose
244 //------------------------------------------------------------------------------
245 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
246 inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
247                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
248   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
249     CeedScalar buffer[Q_1D];
250     CeedScalar chebyshev_x[Q_1D];
251 
252     // Clear shared memory
253     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
254     __syncthreads();
255     for (CeedInt dim = 0; dim < 2; dim++) {
256       // Contract y direction
257       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
258       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
259       for (CeedInt i = 0; i < Q_1D; i++) {
260         buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
261       }
262       // Contract x direction
263       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
264       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
265       if (p < NUM_POINTS) {
266         for (CeedInt i = 0; i < Q_1D; i++) {
267           // Note: shifting to avoid atomic adds
268           const CeedInt ii = (i + data.t_id_x) % Q_1D;
269 
270           for (CeedInt j = 0; j < Q_1D; j++) {
271             const CeedInt jj = (j + data.t_id_y) % Q_1D;
272 
273             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
274           }
275         }
276       }
277     }
278     // Pull from shared to register
279     __syncthreads();
280     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
281   }
282 }
283 
284 //------------------------------------------------------------------------------
285 // 3D
286 //------------------------------------------------------------------------------
287 
288 //------------------------------------------------------------------------------
289 // 3D interpolate to points
290 //------------------------------------------------------------------------------
291 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
292 inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
293                                         CeedScalar *__restrict__ r_V) {
294   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
295   for (CeedInt k = 0; k < Q_1D; k++) {
296     CeedScalar buffer[Q_1D];
297     CeedScalar chebyshev_x[Q_1D];
298 
299     // Get z contraction value
300     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
301     const CeedScalar z = chebyshev_x[k];
302 
303     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
304       // Load coefficients
305       __syncthreads();
306       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
307       __syncthreads();
308       // Contract x direction
309       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
310       for (CeedInt i = 0; i < Q_1D; i++) {
311         buffer[i] = 0.0;
312         for (CeedInt j = 0; j < Q_1D; j++) {
313           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
314         }
315       }
316       // Contract y and z direction
317       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
318       for (CeedInt i = 0; i < Q_1D; i++) {
319         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
320       }
321     }
322   }
323 }
324 
325 //------------------------------------------------------------------------------
326 // 3D interpolate transpose
327 //------------------------------------------------------------------------------
328 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
329 inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
330                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
331   for (CeedInt k = 0; k < Q_1D; k++) {
332     CeedScalar buffer[Q_1D];
333     CeedScalar chebyshev_x[Q_1D];
334 
335     // Get z contraction value
336     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
337     const CeedScalar z = chebyshev_x[k];
338 
339     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
340       // Clear shared memory
341       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
342       __syncthreads();
343       // Contract y and z direction
344       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
345       for (CeedInt i = 0; i < Q_1D; i++) {
346         buffer[i] = chebyshev_x[i] * r_U[comp] * z;
347       }
348       // Contract x direction
349       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
350       if (p < NUM_POINTS) {
351         for (CeedInt i = 0; i < Q_1D; i++) {
352           // Note: shifting to avoid atomic adds
353           const CeedInt ii = (i + data.t_id_x) % Q_1D;
354 
355           for (CeedInt j = 0; j < Q_1D; j++) {
356             const CeedInt jj = (j + data.t_id_y) % Q_1D;
357 
358             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
359           }
360         }
361       }
362       // Pull from shared to register
363       __syncthreads();
364       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
365     }
366   }
367 }
368 
369 //------------------------------------------------------------------------------
370 // 3D derivatives at points
371 //------------------------------------------------------------------------------
372 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
373 inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
374                                       CeedScalar *__restrict__ r_V) {
375   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
376   for (CeedInt k = 0; k < Q_1D; k++) {
377     CeedScalar buffer[Q_1D];
378     CeedScalar chebyshev_x[Q_1D];
379 
380     // Get z contraction values
381     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
382     const CeedScalar z = chebyshev_x[k];
383 
384     ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
385     const CeedScalar dz = chebyshev_x[k];
386 
387     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
388       // Load coefficients
389       __syncthreads();
390       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
391       __syncthreads();
392       // Gradient directions
393       for (CeedInt dim = 0; dim < 3; dim++) {
394         // Contract x direction
395         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
396         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
397         for (CeedInt i = 0; i < Q_1D; i++) {
398           buffer[i] = 0.0;
399           for (CeedInt j = 0; j < Q_1D; j++) {
400             buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
401           }
402         }
403         // Contract y and z direction
404         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
405         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
406         const CeedScalar zz = dim == 2 ? dz : z;
407 
408         for (CeedInt i = 0; i < Q_1D; i++) {
409           r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
410         }
411       }
412     }
413   }
414 }
415 
416 //------------------------------------------------------------------------------
417 // 3D derivatives transpose
418 //------------------------------------------------------------------------------
419 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
420 inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
421                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
422   for (CeedInt k = 0; k < Q_1D; k++) {
423     CeedScalar buffer[Q_1D];
424     CeedScalar chebyshev_x[Q_1D];
425 
426     // Get z contraction values
427     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
428     const CeedScalar z = chebyshev_x[k];
429 
430     ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
431     const CeedScalar dz = chebyshev_x[k];
432 
433     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
434       // Clear shared memory
435       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
436       __syncthreads();
437       // Gradient directions
438       for (CeedInt dim = 0; dim < 3; dim++) {
439         // Contract y and z direction
440         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
441         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
442         const CeedScalar zz = dim == 2 ? dz : z;
443 
444         for (CeedInt i = 0; i < Q_1D; i++) {
445           buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
446         }
447         // Contract x direction
448         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
449         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
450         if (p < NUM_POINTS) {
451           for (CeedInt i = 0; i < Q_1D; i++) {
452             // Note: shifting to avoid atomic adds
453             const CeedInt ii = (i + data.t_id_x) % Q_1D;
454 
455             for (CeedInt j = 0; j < Q_1D; j++) {
456               const CeedInt jj = (j + data.t_id_y) % Q_1D;
457 
458               atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
459             }
460           }
461         }
462       }
463       // Pull from shared to register
464       __syncthreads();
465       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
466     }
467   }
468 }
469