1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis AtPoints templates
10 #include <ceed/types.h>
11
12 //------------------------------------------------------------------------------
13 // Chebyshev values
14 //------------------------------------------------------------------------------
15 template <int Q_1D>
ChebyshevPolynomialsAtPoint(const CeedScalar x,CeedScalar * chebyshev_x)16 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
17 chebyshev_x[0] = 1.0;
18 chebyshev_x[1] = 2 * x;
19 for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
20 }
21
22 template <int Q_1D>
ChebyshevDerivativeAtPoint(const CeedScalar x,CeedScalar * chebyshev_dx)23 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
24 CeedScalar chebyshev_x[3];
25
26 chebyshev_x[1] = 1.0;
27 chebyshev_x[2] = 2 * x;
28 chebyshev_dx[0] = 0.0;
29 chebyshev_dx[1] = 2.0;
30 for (CeedInt i = 2; i < Q_1D; i++) {
31 chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
32 chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
33 }
34 }
35
36 //------------------------------------------------------------------------------
37 // 1D
38 //------------------------------------------------------------------------------
39
40 //------------------------------------------------------------------------------
41 // 1D interpolate to points
42 //------------------------------------------------------------------------------
43 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)44 inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
45 CeedScalar *__restrict__ r_V) {
46 CeedScalar chebyshev_x[Q_1D];
47
48 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
49 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
50 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
51 // Load coefficients
52 __syncthreads();
53 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
54 __syncthreads();
55 // Contract x direction
56 for (CeedInt i = 0; i < Q_1D; i++) {
57 r_V[comp] += chebyshev_x[i] * data.slice[i];
58 }
59 }
60 }
61
62 //------------------------------------------------------------------------------
63 // 1D interpolate transpose
64 //------------------------------------------------------------------------------
65 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)66 inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
67 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
68 CeedScalar chebyshev_x[Q_1D];
69
70 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
71 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
72 // Clear shared memory
73 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
74 __syncthreads();
75 // Contract x direction
76 if (p < NUM_POINTS) {
77 for (CeedInt i = 0; i < Q_1D; i++) {
78 atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
79 }
80 }
81 // Pull from shared to register
82 __syncthreads();
83 if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
84 }
85 }
86
87 //------------------------------------------------------------------------------
88 // 1D derivatives at points
89 //------------------------------------------------------------------------------
90 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)91 inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
92 CeedScalar *__restrict__ r_V) {
93 CeedScalar chebyshev_x[Q_1D];
94
95 ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
96 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
97 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
98 __syncthreads();
99 // Load coefficients
100 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
101 __syncthreads();
102 // Contract x direction
103 for (CeedInt i = 0; i < Q_1D; i++) {
104 r_V[comp] += chebyshev_x[i] * data.slice[i];
105 }
106 }
107 }
108
109 //------------------------------------------------------------------------------
110 // 1D derivatives transpose
111 //------------------------------------------------------------------------------
112 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints1d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)113 inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
114 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
115 CeedScalar chebyshev_x[Q_1D];
116
117 ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
118 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
119 // Clear shared memory
120 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
121 __syncthreads();
122 // Contract x direction
123 if (p < NUM_POINTS) {
124 for (CeedInt i = 0; i < Q_1D; i++) {
125 atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
126 }
127 }
128 // Pull from shared to register
129 __syncthreads();
130 if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
131 }
132 }
133
134 //------------------------------------------------------------------------------
135 // 2D
136 //------------------------------------------------------------------------------
137
138 //------------------------------------------------------------------------------
139 // 2D interpolate to points
140 //------------------------------------------------------------------------------
141 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)142 inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
143 CeedScalar *__restrict__ r_V) {
144 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
145 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
146 CeedScalar buffer[Q_1D];
147 CeedScalar chebyshev_x[Q_1D];
148
149 // Load coefficients
150 __syncthreads();
151 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
152 __syncthreads();
153 // Contract x direction
154 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
155 for (CeedInt i = 0; i < Q_1D; i++) {
156 buffer[i] = 0.0;
157 for (CeedInt j = 0; j < Q_1D; j++) {
158 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
159 }
160 }
161 // Contract y direction
162 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
163 for (CeedInt i = 0; i < Q_1D; i++) {
164 r_V[comp] += chebyshev_x[i] * buffer[i];
165 }
166 }
167 }
168
169 //------------------------------------------------------------------------------
170 // 2D interpolate transpose
171 //------------------------------------------------------------------------------
172 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)173 inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
174 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
175 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
176 CeedScalar buffer[Q_1D];
177 CeedScalar chebyshev_x[Q_1D];
178
179 // Clear shared memory
180 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
181 __syncthreads();
182 // Contract y direction
183 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
184 const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
185
186 for (CeedInt i = 0; i < Q_1D; i++) {
187 buffer[i] = chebyshev_x[i] * r_u;
188 }
189 // Contract x direction
190 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
191 for (CeedInt i = 0; i < Q_1D; i++) {
192 // Note: shifting to avoid atomic adds
193 const CeedInt ii = (i + data.t_id_y) % Q_1D;
194
195 for (CeedInt j = 0; j < Q_1D; j++) {
196 const CeedInt jj = (j + data.t_id_x) % Q_1D;
197
198 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
199 }
200 }
201 // Pull from shared to register
202 __syncthreads();
203 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
204 }
205 }
206
207 //------------------------------------------------------------------------------
208 // 2D derivatives at points
209 //------------------------------------------------------------------------------
210 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)211 inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
212 CeedScalar *__restrict__ r_V) {
213 for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
214 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
215 CeedScalar buffer[Q_1D];
216 CeedScalar chebyshev_x[Q_1D];
217
218 // Load coefficients
219 __syncthreads();
220 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
221 __syncthreads();
222 for (CeedInt dim = 0; dim < 2; dim++) {
223 // Contract x direction
224 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
225 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
226 for (CeedInt i = 0; i < Q_1D; i++) {
227 buffer[i] = 0.0;
228 for (CeedInt j = 0; j < Q_1D; j++) {
229 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
230 }
231 }
232 // Contract y direction
233 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
234 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
235 for (CeedInt i = 0; i < Q_1D; i++) {
236 r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
237 }
238 }
239 }
240 }
241
242 //------------------------------------------------------------------------------
243 // 2D derivatives transpose
244 //------------------------------------------------------------------------------
245 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints2d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)246 inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
247 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
248 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
249 CeedScalar buffer[Q_1D];
250 CeedScalar chebyshev_x[Q_1D];
251
252 // Clear shared memory
253 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
254 __syncthreads();
255 for (CeedInt dim = 0; dim < 2; dim++) {
256 // Contract y direction
257 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
258 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
259 const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
260
261 for (CeedInt i = 0; i < Q_1D; i++) {
262 buffer[i] = chebyshev_x[i] * r_u;
263 }
264 // Contract x direction
265 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
266 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
267 for (CeedInt i = 0; i < Q_1D; i++) {
268 // Note: shifting to avoid atomic adds
269 const CeedInt ii = (i + data.t_id_y) % Q_1D;
270
271 for (CeedInt j = 0; j < Q_1D; j++) {
272 const CeedInt jj = (j + data.t_id_x) % Q_1D;
273
274 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
275 }
276 }
277 }
278 // Pull from shared to register
279 __syncthreads();
280 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
281 }
282 }
283
284 //------------------------------------------------------------------------------
285 // 3D
286 //------------------------------------------------------------------------------
287
288 //------------------------------------------------------------------------------
289 // 3D interpolate to points
290 //------------------------------------------------------------------------------
291 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)292 inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
293 CeedScalar *__restrict__ r_V) {
294 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
295 for (CeedInt k = 0; k < Q_1D; k++) {
296 CeedScalar buffer[Q_1D];
297 CeedScalar chebyshev_x[Q_1D];
298
299 // Get z contraction value
300 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
301 const CeedScalar z = chebyshev_x[k];
302
303 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
304 // Load coefficients
305 __syncthreads();
306 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
307 __syncthreads();
308 // Contract x direction
309 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
310 for (CeedInt i = 0; i < Q_1D; i++) {
311 buffer[i] = 0.0;
312 for (CeedInt j = 0; j < Q_1D; j++) {
313 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
314 }
315 }
316 // Contract y and z direction
317 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
318 for (CeedInt i = 0; i < Q_1D; i++) {
319 r_V[comp] += chebyshev_x[i] * buffer[i] * z;
320 }
321 }
322 }
323 }
324
325 //------------------------------------------------------------------------------
326 // 3D interpolate transpose
327 //------------------------------------------------------------------------------
328 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
InterpTransposeAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)329 inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
330 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
331 for (CeedInt k = 0; k < Q_1D; k++) {
332 CeedScalar buffer[Q_1D];
333 CeedScalar chebyshev_x[Q_1D];
334
335 // Get z contraction value
336 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
337 const CeedScalar z = chebyshev_x[k];
338
339 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
340 // Clear shared memory
341 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
342 __syncthreads();
343 // Contract y and z direction
344 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
345 const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
346
347 for (CeedInt i = 0; i < Q_1D; i++) {
348 buffer[i] = chebyshev_x[i] * r_u * z;
349 }
350 // Contract x direction
351 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
352 for (CeedInt i = 0; i < Q_1D; i++) {
353 // Note: shifting to avoid atomic adds
354 const CeedInt ii = (i + data.t_id_y) % Q_1D;
355
356 for (CeedInt j = 0; j < Q_1D; j++) {
357 const CeedInt jj = (j + data.t_id_x) % Q_1D;
358
359 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
360 }
361 }
362 // Pull from shared to register
363 __syncthreads();
364 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
365 }
366 }
367 }
368
369 //------------------------------------------------------------------------------
370 // 3D derivatives at points
371 //------------------------------------------------------------------------------
372 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_C,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_V)373 inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
374 CeedScalar *__restrict__ r_V) {
375 for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
376 for (CeedInt k = 0; k < Q_1D; k++) {
377 CeedScalar buffer[Q_1D];
378 CeedScalar chebyshev_x[Q_1D];
379
380 // Get z contraction values
381 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
382 const CeedScalar z = chebyshev_x[k];
383
384 ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
385 const CeedScalar dz = chebyshev_x[k];
386
387 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
388 // Load coefficients
389 __syncthreads();
390 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
391 __syncthreads();
392 // Gradient directions
393 for (CeedInt dim = 0; dim < 3; dim++) {
394 // Contract x direction
395 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
396 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
397 for (CeedInt i = 0; i < Q_1D; i++) {
398 buffer[i] = 0.0;
399 for (CeedInt j = 0; j < Q_1D; j++) {
400 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
401 }
402 }
403 // Contract y and z direction
404 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
405 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
406 const CeedScalar zz = dim == 2 ? dz : z;
407
408 for (CeedInt i = 0; i < Q_1D; i++) {
409 r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
410 }
411 }
412 }
413 }
414 }
415
416 //------------------------------------------------------------------------------
417 // 3D derivatives transpose
418 //------------------------------------------------------------------------------
419 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
GradTransposeAtPoints3d(SharedData_Hip & data,const CeedInt p,const CeedScalar * __restrict__ r_U,const CeedScalar * __restrict__ r_X,CeedScalar * __restrict__ r_C)420 inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
421 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
422 for (CeedInt k = 0; k < Q_1D; k++) {
423 CeedScalar buffer[Q_1D];
424 CeedScalar chebyshev_x[Q_1D];
425
426 // Get z contraction values
427 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
428 const CeedScalar z = chebyshev_x[k];
429
430 ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
431 const CeedScalar dz = chebyshev_x[k];
432
433 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
434 // Clear shared memory
435 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
436 __syncthreads();
437 // Gradient directions
438 for (CeedInt dim = 0; dim < 3; dim++) {
439 // Contract y and z direction
440 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
441 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
442 const CeedScalar zz = dim == 2 ? dz : z;
443 const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
444
445 for (CeedInt i = 0; i < Q_1D; i++) {
446 buffer[i] = chebyshev_x[i] * r_u * zz;
447 }
448 // Contract x direction
449 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
450 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
451 for (CeedInt i = 0; i < Q_1D; i++) {
452 // Note: shifting to avoid atomic adds
453 const CeedInt ii = (i + data.t_id_y) % Q_1D;
454
455 for (CeedInt j = 0; j < Q_1D; j++) {
456 const CeedInt jj = (j + data.t_id_x) % Q_1D;
457
458 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
459 }
460 }
461 }
462 // Pull from shared to register
463 __syncthreads();
464 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
465 }
466 }
467 }
468