xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h (revision 0183ed61035d97ff853cf8c8e722c0fda76e54df)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis AtPoints templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Chebyshev values
14 //------------------------------------------------------------------------------
15 template <int Q_1D>
16 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
17   chebyshev_x[0] = 1.0;
18   chebyshev_x[1] = 2 * x;
19   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
20 }
21 
22 template <int Q_1D>
23 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
24   CeedScalar chebyshev_x[3];
25 
26   chebyshev_x[1]  = 1.0;
27   chebyshev_x[2]  = 2 * x;
28   chebyshev_dx[0] = 0.0;
29   chebyshev_dx[1] = 2.0;
30   for (CeedInt i = 2; i < Q_1D; i++) {
31     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
32     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
33   }
34 }
35 
36 //------------------------------------------------------------------------------
37 // 1D
38 //------------------------------------------------------------------------------
39 
40 //------------------------------------------------------------------------------
41 // 1D interpolate to points
42 //------------------------------------------------------------------------------
43 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
44 inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
45                                         CeedScalar *__restrict__ r_V) {
46   CeedScalar chebyshev_x[Q_1D];
47 
48   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
49   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
50   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
51     // Load coefficients
52     __syncthreads();
53     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
54     __syncthreads();
55     // Contract x direction
56     for (CeedInt i = 0; i < Q_1D; i++) {
57       r_V[comp] += chebyshev_x[i] * data.slice[i];
58     }
59   }
60 }
61 
62 //------------------------------------------------------------------------------
63 // 1D interpolate transpose
64 //------------------------------------------------------------------------------
65 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
66 inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
67                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
68   CeedScalar chebyshev_x[Q_1D];
69 
70   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
71   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
72     // Clear shared memory
73     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
74     __syncthreads();
75     // Contract x direction
76     if (p < NUM_POINTS) {
77       for (CeedInt i = 0; i < Q_1D; i++) {
78         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
79       }
80     }
81     // Pull from shared to register
82     __syncthreads();
83     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
84   }
85 }
86 
87 //------------------------------------------------------------------------------
88 // 1D derivatives at points
89 //------------------------------------------------------------------------------
90 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
91 inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
92                                       CeedScalar *__restrict__ r_V) {
93   CeedScalar chebyshev_x[Q_1D];
94 
95   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
96   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
97   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
98     __syncthreads();
99     // Load coefficients
100     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
101     __syncthreads();
102     // Contract x direction
103     for (CeedInt i = 0; i < Q_1D; i++) {
104       r_V[comp] += chebyshev_x[i] * data.slice[i];
105     }
106   }
107 }
108 
109 //------------------------------------------------------------------------------
110 // 1D derivatives transpose
111 //------------------------------------------------------------------------------
112 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
113 inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
114                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
115   CeedScalar chebyshev_x[Q_1D];
116 
117   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
118   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
119     // Clear shared memory
120     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
121     __syncthreads();
122     // Contract x direction
123     if (p < NUM_POINTS) {
124       for (CeedInt i = 0; i < Q_1D; i++) {
125         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
126       }
127     }
128     // Pull from shared to register
129     __syncthreads();
130     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
131   }
132 }
133 
134 //------------------------------------------------------------------------------
135 // 2D
136 //------------------------------------------------------------------------------
137 
138 //------------------------------------------------------------------------------
139 // 2D interpolate to points
140 //------------------------------------------------------------------------------
141 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
142 inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
143                                         CeedScalar *__restrict__ r_V) {
144   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
145   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
146     CeedScalar buffer[Q_1D];
147     CeedScalar chebyshev_x[Q_1D];
148 
149     // Load coefficients
150     __syncthreads();
151     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
152     __syncthreads();
153     // Contract x direction
154     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
155     for (CeedInt i = 0; i < Q_1D; i++) {
156       buffer[i] = 0.0;
157       for (CeedInt j = 0; j < Q_1D; j++) {
158         buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
159       }
160     }
161     // Contract y direction
162     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
163     for (CeedInt i = 0; i < Q_1D; i++) {
164       r_V[comp] += chebyshev_x[i] * buffer[i];
165     }
166   }
167 }
168 
169 //------------------------------------------------------------------------------
170 // 2D interpolate transpose
171 //------------------------------------------------------------------------------
172 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
173 inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
174                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
175   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
176     CeedScalar buffer[Q_1D];
177     CeedScalar chebyshev_x[Q_1D];
178 
179     // Clear shared memory
180     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
181     __syncthreads();
182     // Contract y direction
183     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
184     for (CeedInt i = 0; i < Q_1D; i++) {
185       buffer[i] = chebyshev_x[i] * r_U[comp];
186     }
187     // Contract x direction
188     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
189     if (p < NUM_POINTS) {
190       for (CeedInt i = 0; i < Q_1D; i++) {
191         // Note: shifting to avoid atomic adds
192         const CeedInt ii = (i + data.t_id_x) % Q_1D;
193 
194         for (CeedInt j = 0; j < Q_1D; j++) {
195           const CeedInt jj = (j + data.t_id_y) % Q_1D;
196 
197           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
198         }
199       }
200     }
201     // Pull from shared to register
202     __syncthreads();
203     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
204   }
205 }
206 
207 //------------------------------------------------------------------------------
208 // 2D derivatives at points
209 //------------------------------------------------------------------------------
210 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
211 inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
212                                       CeedScalar *__restrict__ r_V) {
213   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
214   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
215     CeedScalar buffer[Q_1D];
216     CeedScalar chebyshev_x[Q_1D];
217 
218     // Load coefficients
219     __syncthreads();
220     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
221     __syncthreads();
222     for (CeedInt dim = 0; dim < 2; dim++) {
223       // Contract x direction
224       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
225       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
226       for (CeedInt i = 0; i < Q_1D; i++) {
227         buffer[i] = 0.0;
228         for (CeedInt j = 0; j < Q_1D; j++) {
229           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
230         }
231       }
232       // Contract y direction
233       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
234       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
235       for (CeedInt i = 0; i < Q_1D; i++) {
236         r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
237       }
238     }
239   }
240 }
241 
242 //------------------------------------------------------------------------------
243 // 2D derivatives transpose
244 //------------------------------------------------------------------------------
245 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
246 inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
247                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
248   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
249     CeedScalar buffer[Q_1D];
250     CeedScalar chebyshev_x[Q_1D];
251 
252     // Clear shared memory
253     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
254     __syncthreads();
255     for (CeedInt dim = 0; dim < 2; dim++) {
256       // Contract y direction
257       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
258       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
259       for (CeedInt i = 0; i < Q_1D; i++) {
260         buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
261       }
262       // Contract x direction
263       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
264       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
265       if (p < NUM_POINTS) {
266         for (CeedInt i = 0; i < Q_1D; i++) {
267           // Note: shifting to avoid atomic adds
268           const CeedInt ii = (i + data.t_id_x) % Q_1D;
269 
270           for (CeedInt j = 0; j < Q_1D; j++) {
271             const CeedInt jj = (j + data.t_id_y) % Q_1D;
272 
273             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
274           }
275         }
276       }
277     }
278     // Pull from shared to register
279     __syncthreads();
280     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
281   }
282 }
283 
284 //------------------------------------------------------------------------------
285 // 3D
286 //------------------------------------------------------------------------------
287 
288 //------------------------------------------------------------------------------
289 // 3D interpolate to points
290 //------------------------------------------------------------------------------
291 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
292 inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
293                                         CeedScalar *__restrict__ r_V) {
294   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
295   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
296     for (CeedInt k = 0; k < Q_1D; k++) {
297       CeedScalar buffer[Q_1D];
298       CeedScalar chebyshev_x[Q_1D];
299 
300       // Load coefficients
301       __syncthreads();
302       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
303       __syncthreads();
304       // Contract x direction
305       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
306       for (CeedInt i = 0; i < Q_1D; i++) {
307         buffer[i] = 0.0;
308         for (CeedInt j = 0; j < Q_1D; j++) {
309           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
310         }
311       }
312       // Contract y and z direction
313       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
314       const CeedScalar z = chebyshev_x[k];
315 
316       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
317       for (CeedInt i = 0; i < Q_1D; i++) {
318         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
319       }
320     }
321   }
322 }
323 
324 //------------------------------------------------------------------------------
325 // 3D interpolate transpose
326 //------------------------------------------------------------------------------
327 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
328 inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
329                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
330   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
331     for (CeedInt k = 0; k < Q_1D; k++) {
332       CeedScalar buffer[Q_1D];
333       CeedScalar chebyshev_x[Q_1D];
334 
335       // Clear shared memory
336       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
337       __syncthreads();
338       // Contract y and z direction
339       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
340       const CeedScalar z = chebyshev_x[k];
341 
342       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
343       for (CeedInt i = 0; i < Q_1D; i++) {
344         buffer[i] = chebyshev_x[i] * r_U[comp] * z;
345       }
346       // Contract x direction
347       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
348       if (p < NUM_POINTS) {
349         for (CeedInt i = 0; i < Q_1D; i++) {
350           // Note: shifting to avoid atomic adds
351           const CeedInt ii = (i + data.t_id_x) % Q_1D;
352 
353           for (CeedInt j = 0; j < Q_1D; j++) {
354             const CeedInt jj = (j + data.t_id_y) % Q_1D;
355 
356             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
357           }
358         }
359       }
360       // Pull from shared to register
361       __syncthreads();
362       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
363     }
364   }
365 }
366 
367 //------------------------------------------------------------------------------
368 // 3D derivatives at points
369 //------------------------------------------------------------------------------
370 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
371 inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
372                                       CeedScalar *__restrict__ r_V) {
373   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
374   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
375     for (CeedInt k = 0; k < Q_1D; k++) {
376       CeedScalar buffer[Q_1D];
377       CeedScalar chebyshev_x[Q_1D];
378 
379       // Load coefficients
380       __syncthreads();
381       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
382       __syncthreads();
383       for (CeedInt dim = 0; dim < 3; dim++) {
384         // Contract x direction
385         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
386         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
387         for (CeedInt i = 0; i < Q_1D; i++) {
388           buffer[i] = 0.0;
389           for (CeedInt j = 0; j < Q_1D; j++) {
390             buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
391           }
392         }
393         // Contract y and z direction
394         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
395         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
396         const CeedScalar z = chebyshev_x[k];
397 
398         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
399         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
400         for (CeedInt i = 0; i < Q_1D; i++) {
401           r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
402         }
403       }
404     }
405   }
406 }
407 
408 //------------------------------------------------------------------------------
409 // 3D derivatives transpose
410 //------------------------------------------------------------------------------
411 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
412 inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
413                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
414   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
415     for (CeedInt k = 0; k < Q_1D; k++) {
416       CeedScalar buffer[Q_1D];
417       CeedScalar chebyshev_x[Q_1D];
418 
419       // Clear shared memory
420       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
421       __syncthreads();
422       for (CeedInt dim = 0; dim < 3; dim++) {
423         // Contract y and z direction
424         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
425         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
426         const CeedScalar z = chebyshev_x[k];
427 
428         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
429         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
430         for (CeedInt i = 0; i < Q_1D; i++) {
431           buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
432         }
433         // Contract x direction
434         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
435         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
436         if (p < NUM_POINTS) {
437           for (CeedInt i = 0; i < Q_1D; i++) {
438             // Note: shifting to avoid atomic adds
439             const CeedInt ii = (i + data.t_id_x) % Q_1D;
440 
441             for (CeedInt j = 0; j < Q_1D; j++) {
442               const CeedInt jj = (j + data.t_id_y) % Q_1D;
443 
444               atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
445             }
446           }
447         }
448       }
449       // Pull from shared to register
450       __syncthreads();
451       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
452     }
453   }
454 }
455