xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h (revision bfbde1a268fe98bfe2ba1c9d50ebdf542389ce5e)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis AtPoints templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Chebyshev values
14 //------------------------------------------------------------------------------
15 template <int Q_1D>
16 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
17   chebyshev_x[0] = 1.0;
18   chebyshev_x[1] = 2 * x;
19   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
20 }
21 
22 template <int Q_1D>
23 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
24   CeedScalar chebyshev_x[3];
25 
26   chebyshev_x[1]  = 1.0;
27   chebyshev_x[2]  = 2 * x;
28   chebyshev_dx[0] = 0.0;
29   chebyshev_dx[1] = 2.0;
30   for (CeedInt i = 2; i < Q_1D; i++) {
31     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
32     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
33   }
34 }
35 
36 //------------------------------------------------------------------------------
37 // 1D
38 //------------------------------------------------------------------------------
39 
40 //------------------------------------------------------------------------------
41 // 1D interpolate to points
42 //------------------------------------------------------------------------------
43 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
44 inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
45                                         CeedScalar *__restrict__ r_V) {
46   CeedScalar chebyshev_x[Q_1D];
47 
48   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
49   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
50   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
51     // Load coefficients
52     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
53     __syncthreads();
54     // Contract x direction
55     for (CeedInt i = 0; i < Q_1D; i++) {
56       r_V[comp] += chebyshev_x[i] * data.slice[i];
57     }
58   }
59 }
60 
61 //------------------------------------------------------------------------------
62 // 1D interpolate transpose
63 //------------------------------------------------------------------------------
64 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
65 inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
66                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
67   CeedScalar chebyshev_x[Q_1D];
68 
69   ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
70   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
71     // Clear shared memory
72     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
73     __syncthreads();
74     // Contract x direction
75     if (p < NUM_POINTS) {
76       for (CeedInt i = 0; i < Q_1D; i++) {
77         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
78       }
79     }
80     // Pull from shared to register
81     __syncthreads();
82     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
83   }
84 }
85 
86 //------------------------------------------------------------------------------
87 // 1D derivatives at points
88 //------------------------------------------------------------------------------
89 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
90 inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
91                                       CeedScalar *__restrict__ r_V) {
92   CeedScalar chebyshev_x[Q_1D];
93 
94   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
95   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
96   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
97     // Load coefficients
98     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
99     __syncthreads();
100     // Contract x direction
101     for (CeedInt i = 0; i < Q_1D; i++) {
102       r_V[comp] += chebyshev_x[i] * data.slice[i];
103     }
104   }
105 }
106 
107 //------------------------------------------------------------------------------
108 // 1D derivatives transpose
109 //------------------------------------------------------------------------------
110 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
111 inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
112                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
113   CeedScalar chebyshev_x[Q_1D];
114 
115   ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
116   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
117     // Clear shared memory
118     if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
119     __syncthreads();
120     // Contract x direction
121     if (p < NUM_POINTS) {
122       for (CeedInt i = 0; i < Q_1D; i++) {
123         atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
124       }
125     }
126     // Pull from shared to register
127     __syncthreads();
128     if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
129   }
130 }
131 
132 //------------------------------------------------------------------------------
133 // 2D
134 //------------------------------------------------------------------------------
135 
136 //------------------------------------------------------------------------------
137 // 2D interpolate to points
138 //------------------------------------------------------------------------------
139 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
140 inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
141                                         CeedScalar *__restrict__ r_V) {
142   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
143   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
144     CeedScalar buffer[Q_1D];
145     CeedScalar chebyshev_x[Q_1D];
146 
147     // Load coefficients
148     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
149     __syncthreads();
150     // Contract x direction
151     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
152     for (CeedInt i = 0; i < Q_1D; i++) {
153       buffer[i] = 0.0;
154       for (CeedInt j = 0; j < Q_1D; j++) {
155         buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
156       }
157     }
158     // Contract y direction
159     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
160     for (CeedInt i = 0; i < Q_1D; i++) {
161       r_V[comp] += chebyshev_x[i] * buffer[i];
162     }
163   }
164 }
165 
166 //------------------------------------------------------------------------------
167 // 2D interpolate transpose
168 //------------------------------------------------------------------------------
169 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
170 inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
171                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
172   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
173     CeedScalar buffer[Q_1D];
174     CeedScalar chebyshev_x[Q_1D];
175 
176     // Clear shared memory
177     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
178     __syncthreads();
179     // Contract y direction
180     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
181     for (CeedInt i = 0; i < Q_1D; i++) {
182       buffer[i] = chebyshev_x[i] * r_U[comp];
183     }
184     // Contract x direction
185     ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
186     if (p < NUM_POINTS) {
187       for (CeedInt i = 0; i < Q_1D; i++) {
188         // Note: shifting to avoid atomic adds
189         const CeedInt ii = (i + data.t_id_x) % Q_1D;
190 
191         for (CeedInt j = 0; j < Q_1D; j++) {
192           const CeedInt jj = (j + data.t_id_y) % Q_1D;
193 
194           atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
195         }
196       }
197     }
198     // Pull from shared to register
199     __syncthreads();
200     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
201   }
202 }
203 
204 //------------------------------------------------------------------------------
205 // 2D derivatives at points
206 //------------------------------------------------------------------------------
207 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
208 inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
209                                       CeedScalar *__restrict__ r_V) {
210   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
211   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
212     CeedScalar buffer[Q_1D];
213     CeedScalar chebyshev_x[Q_1D];
214 
215     // Load coefficients
216     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
217     __syncthreads();
218     for (CeedInt dim = 0; dim < 2; dim++) {
219       // Contract x direction
220       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
221       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
222       for (CeedInt i = 0; i < Q_1D; i++) {
223         buffer[i] = 0.0;
224         for (CeedInt j = 0; j < Q_1D; j++) {
225           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
226         }
227       }
228       // Contract y direction
229       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
230       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
231       for (CeedInt i = 0; i < Q_1D; i++) {
232         r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
233       }
234     }
235   }
236 }
237 
238 //------------------------------------------------------------------------------
239 // 2D derivatives transpose
240 //------------------------------------------------------------------------------
241 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
242 inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
243                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
244   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
245     CeedScalar buffer[Q_1D];
246     CeedScalar chebyshev_x[Q_1D];
247 
248     // Clear shared memory
249     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
250     __syncthreads();
251     for (CeedInt dim = 0; dim < 2; dim++) {
252       // Contract y direction
253       if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
254       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
255       for (CeedInt i = 0; i < Q_1D; i++) {
256         buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
257       }
258       // Contract x direction
259       if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
260       else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
261       if (p < NUM_POINTS) {
262         for (CeedInt i = 0; i < Q_1D; i++) {
263           // Note: shifting to avoid atomic adds
264           const CeedInt ii = (i + data.t_id_x) % Q_1D;
265 
266           for (CeedInt j = 0; j < Q_1D; j++) {
267             const CeedInt jj = (j + data.t_id_y) % Q_1D;
268 
269             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
270           }
271         }
272       }
273     }
274     // Pull from shared to register
275     __syncthreads();
276     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
277   }
278 }
279 
280 //------------------------------------------------------------------------------
281 // 3D
282 //------------------------------------------------------------------------------
283 
284 //------------------------------------------------------------------------------
285 // 3D interpolate to points
286 //------------------------------------------------------------------------------
287 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
288 inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
289                                         CeedScalar *__restrict__ r_V) {
290   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
291   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
292     for (CeedInt k = 0; k < Q_1D; k++) {
293       CeedScalar buffer[Q_1D];
294       CeedScalar chebyshev_x[Q_1D];
295 
296       // Load coefficients
297       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
298       __syncthreads();
299       // Contract x direction
300       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
301       for (CeedInt i = 0; i < Q_1D; i++) {
302         buffer[i] = 0.0;
303         for (CeedInt j = 0; j < Q_1D; j++) {
304           buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
305         }
306       }
307       // Contract y and z direction
308       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
309       const CeedScalar z = chebyshev_x[k];
310 
311       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
312       for (CeedInt i = 0; i < Q_1D; i++) {
313         r_V[comp] += chebyshev_x[i] * buffer[i] * z;
314       }
315     }
316   }
317 }
318 
319 //------------------------------------------------------------------------------
320 // 3D interpolate transpose
321 //------------------------------------------------------------------------------
322 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
323 inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
324                                                  const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
325   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
326     for (CeedInt k = 0; k < Q_1D; k++) {
327       CeedScalar buffer[Q_1D];
328       CeedScalar chebyshev_x[Q_1D];
329 
330       // Clear shared memory
331       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
332       __syncthreads();
333       // Contract y and z direction
334       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
335       const CeedScalar z = chebyshev_x[k];
336 
337       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
338       for (CeedInt i = 0; i < Q_1D; i++) {
339         buffer[i] = chebyshev_x[i] * r_U[comp] * z;
340       }
341       // Contract x direction
342       ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
343       if (p < NUM_POINTS) {
344         for (CeedInt i = 0; i < Q_1D; i++) {
345           // Note: shifting to avoid atomic adds
346           const CeedInt ii = (i + data.t_id_x) % Q_1D;
347 
348           for (CeedInt j = 0; j < Q_1D; j++) {
349             const CeedInt jj = (j + data.t_id_y) % Q_1D;
350 
351             atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
352           }
353         }
354       }
355       // Pull from shared to register
356       __syncthreads();
357       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
358     }
359   }
360 }
361 
362 //------------------------------------------------------------------------------
363 // 3D derivatives at points
364 //------------------------------------------------------------------------------
365 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
366 inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
367                                       CeedScalar *__restrict__ r_V) {
368   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
369   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
370     for (CeedInt k = 0; k < Q_1D; k++) {
371       CeedScalar buffer[Q_1D];
372       CeedScalar chebyshev_x[Q_1D];
373 
374       // Load coefficients
375       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
376       __syncthreads();
377       for (CeedInt dim = 0; dim < 3; dim++) {
378         // Contract x direction
379         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
380         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
381         for (CeedInt i = 0; i < Q_1D; i++) {
382           buffer[i] = 0.0;
383           for (CeedInt j = 0; j < Q_1D; j++) {
384             buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
385           }
386         }
387         // Contract y and z direction
388         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
389         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
390         const CeedScalar z = chebyshev_x[k];
391 
392         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
393         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
394         for (CeedInt i = 0; i < Q_1D; i++) {
395           r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
396         }
397       }
398     }
399   }
400 }
401 
402 //------------------------------------------------------------------------------
403 // 3D derivatives transpose
404 //------------------------------------------------------------------------------
405 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
406 inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
407                                                const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
408   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
409     for (CeedInt k = 0; k < Q_1D; k++) {
410       CeedScalar buffer[Q_1D];
411       CeedScalar chebyshev_x[Q_1D];
412 
413       // Clear shared memory
414       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
415       __syncthreads();
416       for (CeedInt dim = 0; dim < 3; dim++) {
417         // Contract y and z direction
418         if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
419         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
420         const CeedScalar z = chebyshev_x[k];
421 
422         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
423         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
424         for (CeedInt i = 0; i < Q_1D; i++) {
425           buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
426         }
427         // Contract x direction
428         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
429         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
430         if (p < NUM_POINTS) {
431           for (CeedInt i = 0; i < Q_1D; i++) {
432             // Note: shifting to avoid atomic adds
433             const CeedInt ii = (i + data.t_id_x) % Q_1D;
434 
435             for (CeedInt j = 0; j < Q_1D; j++) {
436               const CeedInt jj = (j + data.t_id_y) % Q_1D;
437 
438               atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
439             }
440           }
441         }
442       }
443       // Pull from shared to register
444       __syncthreads();
445       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
446     }
447   }
448 }
449