1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis AtPoints templates
10 #include <ceed/types.h>
11
12 //------------------------------------------------------------------------------
13 // Chebyshev values
14 //------------------------------------------------------------------------------
15 template <int Q_1D>
ChebyshevPolynomialsAtPoint(const CeedScalar x,CeedScalar * chebyshev_x)16 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
17 chebyshev_x[0] = 1.0;
18 chebyshev_x[1] = 2 * x;
19 for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
20 }
21
22 template <int Q_1D>
ChebyshevDerivativeAtPoint(const CeedScalar x,CeedScalar * chebyshev_dx)23 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
24 CeedScalar chebyshev_x[3];
25
26 chebyshev_x[1] = 1.0;
27 chebyshev_x[2] = 2 * x;
28 chebyshev_dx[0] = 0.0;
29 chebyshev_dx[1] = 2.0;
30 for (CeedInt i = 2; i < Q_1D; i++) {
31 chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
32 chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
33 }
34 }
35
36 //------------------------------------------------------------------------------
37 // 1D
38 //------------------------------------------------------------------------------
39
40 //------------------------------------------------------------------------------
41 // 1D interpolate to points
42 //------------------------------------------------------------------------------
43 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints1d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)44 inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
45 CeedScalar *__restrict__ r_V) {
46 CeedScalar chebyshev_x[Q_1D];
47
48 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
49 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
50 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
51 // Load coefficients
52 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
53 __syncthreads();
54 // Contract x direction
55 for (CeedInt i = 0; i < Q_1D; i++) {
56 r_V[comp] += chebyshev_x[i] * data.slice[i];
57 }
58 }
59 }
60
61 //------------------------------------------------------------------------------
62 // 1D interpolate transpose
63 //------------------------------------------------------------------------------
64 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints1d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)65 inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
66 CeedScalar *__restrict__ r_C) {
67 CeedScalar chebyshev_x[Q_1D];
68
69 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
70 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
71 // Clear shared memory
72 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
73 __syncthreads();
74 // Contract x direction
75 if (p < NUM_POINTS) {
76 for (CeedInt i = 0; i < Q_1D; i++) {
77 atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
78 }
79 }
80 // Pull from shared to register
81 __syncthreads();
82 if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
83 }
84 }
85
86 //------------------------------------------------------------------------------
87 // 1D derivatives at points
88 //------------------------------------------------------------------------------
89 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints1d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)90 inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
91 CeedScalar *__restrict__ r_V) {
92 CeedScalar chebyshev_x[Q_1D];
93
94 ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
95 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
96 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
97 // Load coefficients
98 __syncthreads();
99 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
100 __syncthreads();
101 // Contract x direction
102 for (CeedInt i = 0; i < Q_1D; i++) {
103 r_V[comp] += chebyshev_x[i] * data.slice[i];
104 }
105 }
106 }
107
108 //------------------------------------------------------------------------------
109 // 1D derivatives transpose
110 //------------------------------------------------------------------------------
111 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints1d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)112 inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
113 CeedScalar *__restrict__ r_C) {
114 CeedScalar chebyshev_x[Q_1D];
115
116 ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
117 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
118 // Clear shared memory
119 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
120 __syncthreads();
121 // Contract x direction
122 if (p < NUM_POINTS) {
123 for (CeedInt i = 0; i < Q_1D; i++) {
124 atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
125 }
126 }
127 // Pull from shared to register
128 __syncthreads();
129 if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
130 }
131 }
132
133 //------------------------------------------------------------------------------
134 // 2D
135 //------------------------------------------------------------------------------
136
137 //------------------------------------------------------------------------------
138 // 2D interpolate to points
139 //------------------------------------------------------------------------------
140 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints2d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)141 inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
142 CeedScalar *__restrict__ r_V) {
143 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
144 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
145 CeedScalar buffer[Q_1D];
146 CeedScalar chebyshev_x[Q_1D];
147
148 // Load coefficients
149 __syncthreads();
150 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
151 __syncthreads();
152 // Contract x direction
153 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
154 for (CeedInt i = 0; i < Q_1D; i++) {
155 buffer[i] = 0.0;
156 for (CeedInt j = 0; j < Q_1D; j++) {
157 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
158 }
159 }
160 // Contract y direction
161 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
162 for (CeedInt i = 0; i < Q_1D; i++) {
163 r_V[comp] += chebyshev_x[i] * buffer[i];
164 }
165 }
166 }
167
168 //------------------------------------------------------------------------------
169 // 2D interpolate transpose
170 //------------------------------------------------------------------------------
171 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints2d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)172 inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
173 CeedScalar *__restrict__ r_C) {
174 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
175 CeedScalar buffer[Q_1D];
176 CeedScalar chebyshev_x[Q_1D];
177
178 // Clear shared memory
179 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
180 __syncthreads();
181 // Contract y direction
182 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
183 const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
184
185 for (CeedInt i = 0; i < Q_1D; i++) {
186 buffer[i] = chebyshev_x[i] * r_u;
187 }
188 // Contract x direction
189 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
190 for (CeedInt i = 0; i < Q_1D; i++) {
191 // Note: shifting to avoid atomic adds
192 const CeedInt ii = (i + data.t_id_y) % Q_1D;
193
194 for (CeedInt j = 0; j < Q_1D; j++) {
195 const CeedInt jj = (j + data.t_id_x) % Q_1D;
196
197 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
198 }
199 }
200 // Pull from shared to register
201 __syncthreads();
202 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
203 }
204 }
205
206 //------------------------------------------------------------------------------
207 // 2D derivatives at points
208 //------------------------------------------------------------------------------
209 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints2d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)210 inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
211 CeedScalar *__restrict__ r_V) {
212 for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
213 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
214 CeedScalar buffer[Q_1D];
215 CeedScalar chebyshev_x[Q_1D];
216
217 // Load coefficients
218 __syncthreads();
219 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
220 __syncthreads();
221 for (CeedInt dim = 0; dim < 2; dim++) {
222 // Contract x direction
223 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
224 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
225 for (CeedInt i = 0; i < Q_1D; i++) {
226 buffer[i] = 0.0;
227 for (CeedInt j = 0; j < Q_1D; j++) {
228 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
229 }
230 }
231 // Contract y direction
232 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
233 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
234 for (CeedInt i = 0; i < Q_1D; i++) {
235 r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
236 }
237 }
238 }
239 }
240
241 //------------------------------------------------------------------------------
242 // 2D derivatives transpose
243 //------------------------------------------------------------------------------
244 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints2d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)245 inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
246 CeedScalar *__restrict__ r_C) {
247 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
248 CeedScalar buffer[Q_1D];
249 CeedScalar chebyshev_x[Q_1D];
250
251 // Clear shared memory
252 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
253 __syncthreads();
254 for (CeedInt dim = 0; dim < 2; dim++) {
255 // Contract y direction
256 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
257 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
258 const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
259
260 for (CeedInt i = 0; i < Q_1D; i++) {
261 buffer[i] = chebyshev_x[i] * r_u;
262 }
263 // Contract x direction
264 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
265 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
266 for (CeedInt i = 0; i < Q_1D; i++) {
267 // Note: shifting to avoid atomic adds
268 const CeedInt ii = (i + data.t_id_y) % Q_1D;
269
270 for (CeedInt j = 0; j < Q_1D; j++) {
271 const CeedInt jj = (j + data.t_id_x) % Q_1D;
272
273 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
274 }
275 }
276 }
277 // Pull from shared to register
278 __syncthreads();
279 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
280 }
281 }
282
283 //------------------------------------------------------------------------------
284 // 3D
285 //------------------------------------------------------------------------------
286
287 //------------------------------------------------------------------------------
288 // 3D interpolate to points
289 //------------------------------------------------------------------------------
290 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints3d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)291 inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
292 CeedScalar *__restrict__ r_V) {
293 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
294 for (CeedInt k = 0; k < Q_1D; k++) {
295 CeedScalar buffer[Q_1D];
296 CeedScalar chebyshev_x[Q_1D];
297
298 // Get z contraction value
299 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
300 const CeedScalar z = chebyshev_x[k];
301
302 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
303 // Load coefficients
304 __syncthreads();
305 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
306 __syncthreads();
307 // Contract x direction
308 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
309 for (CeedInt i = 0; i < Q_1D; i++) {
310 buffer[i] = 0.0;
311 for (CeedInt j = 0; j < Q_1D; j++) {
312 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
313 }
314 }
315 // Contract y and z direction
316 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
317 for (CeedInt i = 0; i < Q_1D; i++) {
318 r_V[comp] += chebyshev_x[i] * buffer[i] * z;
319 }
320 }
321 }
322 }
323
324 //------------------------------------------------------------------------------
325 // 3D interpolate transpose
326 //------------------------------------------------------------------------------
327 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints3d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)328 inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
329 CeedScalar *__restrict__ r_C) {
330 for (CeedInt k = 0; k < Q_1D; k++) {
331 CeedScalar buffer[Q_1D];
332 CeedScalar chebyshev_x[Q_1D];
333
334 // Get z contraction value
335 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
336 const CeedScalar z = chebyshev_x[k];
337
338 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
339 // Clear shared memory
340 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
341 __syncthreads();
342 // Contract y and z direction
343 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
344 const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
345
346 for (CeedInt i = 0; i < Q_1D; i++) {
347 buffer[i] = chebyshev_x[i] * r_u * z;
348 }
349 // Contract x direction
350 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
351 for (CeedInt i = 0; i < Q_1D; i++) {
352 // Note: shifting to avoid atomic adds
353 const CeedInt ii = (i + data.t_id_y) % Q_1D;
354
355 for (CeedInt j = 0; j < Q_1D; j++) {
356 const CeedInt jj = (j + data.t_id_x) % Q_1D;
357
358 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
359 }
360 }
361 // Pull from shared to register
362 __syncthreads();
363 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
364 }
365 }
366 }
367
368 //------------------------------------------------------------------------------
369 // 3D derivatives at points
370 //------------------------------------------------------------------------------
371 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints3d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * r_X,CeedScalar * __restrict__ r_V)372 inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
373 CeedScalar *__restrict__ r_V) {
374 for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
375 for (CeedInt k = 0; k < Q_1D; k++) {
376 CeedScalar buffer[Q_1D];
377 CeedScalar chebyshev_x[Q_1D];
378
379 // Get z contraction values
380 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
381 const CeedScalar z = chebyshev_x[k];
382
383 ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
384 const CeedScalar dz = chebyshev_x[k];
385
386 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
387 // Load coefficients
388 __syncthreads();
389 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
390 __syncthreads();
391 // Gradient directions
392 for (CeedInt dim = 0; dim < 3; dim++) {
393 // Contract x direction
394 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
395 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
396 for (CeedInt i = 0; i < Q_1D; i++) {
397 buffer[i] = 0.0;
398 for (CeedInt j = 0; j < Q_1D; j++) {
399 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
400 }
401 }
402 // Contract y and z direction
403 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
404 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
405 const CeedScalar zz = dim == 2 ? dz : z;
406
407 for (CeedInt i = 0; i < Q_1D; i++) {
408 r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
409 }
410 }
411 }
412 }
413 }
414
415 //------------------------------------------------------------------------------
416 // 3D derivatives transpose
417 //------------------------------------------------------------------------------
418 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints3d(SharedData_Cuda & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * r_X,CeedScalar * __restrict__ r_C)419 inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
420 CeedScalar *__restrict__ r_C) {
421 for (CeedInt k = 0; k < Q_1D; k++) {
422 CeedScalar buffer[Q_1D];
423 CeedScalar chebyshev_x[Q_1D];
424
425 // Get z contraction values
426 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
427 const CeedScalar z = chebyshev_x[k];
428
429 ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
430 const CeedScalar dz = chebyshev_x[k];
431
432 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
433 // Clear shared memory
434 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
435 __syncthreads();
436 // Gradient directions
437 for (CeedInt dim = 0; dim < 3; dim++) {
438 // Contract y and z direction
439 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
440 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
441 const CeedScalar zz = dim == 2 ? dz : z;
442 const CeedScalar r_u = (p < NUM_POINTS) ? r_U[comp + dim * NUM_COMP] : 0.0;
443
444 for (CeedInt i = 0; i < Q_1D; i++) {
445 buffer[i] = chebyshev_x[i] * r_u * zz;
446 }
447
448 // Contract x direction
449 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
450 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
451 for (CeedInt i = 0; i < Q_1D; i++) {
452 // Note: shifting to avoid atomic adds
453 const CeedInt ii = (i + data.t_id_y) % Q_1D;
454
455 for (CeedInt j = 0; j < Q_1D; j++) {
456 const CeedInt jj = (j + data.t_id_x) % Q_1D;
457
458 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
459 }
460 }
461 }
462 // Pull from shared to register
463 __syncthreads();
464 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
465 }
466 }
467 }
468