xref: /libCEED/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h (revision 42a37a0bedc8a227472fb83b1871607d9c2bb231)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA tensor product basis with AtPoints evaluation
10 
11 #include <ceed.h>
12 
13 //------------------------------------------------------------------------------
14 // Chebyshev values
15 //------------------------------------------------------------------------------
16 template <int Q_1D>
17 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
18   chebyshev_x[0] = 1.0;
19   chebyshev_x[1] = 2 * x;
20   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
21 }
22 
23 template <int Q_1D>
24 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
25   CeedScalar chebyshev_x[3];
26 
27   chebyshev_x[1]  = 1.0;
28   chebyshev_x[2]  = 2 * x;
29   chebyshev_dx[0] = 0.0;
30   chebyshev_dx[1] = 2.0;
31   for (CeedInt i = 2; i < Q_1D; i++) {
32     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
33     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
34   }
35 }
36 
37 //------------------------------------------------------------------------------
38 // Tensor Basis Kernels AtPoints
39 //------------------------------------------------------------------------------
40 
41 //------------------------------------------------------------------------------
42 // Interp
43 //------------------------------------------------------------------------------
44 extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
45                                           const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
46   const CeedInt i = threadIdx.x;
47 
48   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
49   CeedScalar           *s_chebyshev_interp_1d = s_mem;
50   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
51   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
52   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
53   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
54   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
55     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
56   }
57 
58   const CeedInt P             = BASIS_P_1D;
59   const CeedInt Q             = BASIS_Q_1D;
60   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
61   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
62   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
63   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
64   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
65 
66   // Apply basis element by element
67   if (is_transpose) {
68     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
69       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
70         const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
71         CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
72         CeedInt           pre   = 1;
73         CeedInt           post  = 1;
74 
75         // Clear Chebyshev coeffs
76         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
77           s_chebyshev_coeffs[k] = 0.0;
78         }
79 
80         // Map from point
81         __syncthreads();
82         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
83           pre  = 1;
84           post = 1;
85           for (CeedInt d = 0; d < BASIS_DIM; d++) {
86             // Update buffers used
87             pre /= 1;
88             const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
89             CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
90 
91             // Build Chebyshev polynomial values
92             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
93 
94             // Contract along middle index
95             for (CeedInt a = 0; a < pre; a++) {
96               for (CeedInt c = 0; c < post; c++) {
97                 if (d == BASIS_DIM - 1) {
98                   for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
99                 } else {
100                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
101                 }
102               }
103             }
104             post *= Q;
105           }
106         }
107 
108         // Map from coefficients
109         pre  = BASIS_NUM_QPTS;
110         post = 1;
111         for (CeedInt d = 0; d < BASIS_DIM; d++) {
112           __syncthreads();
113           // Update buffers used
114           pre /= Q;
115           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
116           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
117           const CeedInt     writeLen = pre * post * P;
118 
119           // Contract along middle index
120           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
121             const CeedInt c   = k % post;
122             const CeedInt j   = (k / post) % P;
123             const CeedInt a   = k / (post * P);
124             CeedScalar    v_k = 0;
125 
126             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
127             if (d == BASIS_DIM - 1) out[k] += v_k;
128             else out[k] = v_k;
129           }
130           post *= P;
131         }
132       }
133     }
134   } else {
135     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
136       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
137         const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
138         CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
139         CeedInt           pre   = u_size;
140         CeedInt           post  = 1;
141 
142         // Map to coefficients
143         for (CeedInt d = 0; d < BASIS_DIM; d++) {
144           __syncthreads();
145           // Update buffers used
146           pre /= P;
147           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
148           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
149           const CeedInt     writeLen = pre * post * Q;
150 
151           // Contract along middle index
152           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
153             const CeedInt c   = k % post;
154             const CeedInt j   = (k / post) % Q;
155             const CeedInt a   = k / (post * Q);
156             CeedScalar    v_k = 0;
157 
158             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
159             out[k] = v_k;
160           }
161           post *= Q;
162         }
163 
164         // Map to point
165         __syncthreads();
166         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
167           pre  = BASIS_NUM_QPTS;
168           post = 1;
169           for (CeedInt d = 0; d < BASIS_DIM; d++) {
170             // Update buffers used
171             pre /= Q;
172             const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
173             CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
174 
175             // Build Chebyshev polynomial values
176             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
177 
178             // Contract along middle index
179             for (CeedInt a = 0; a < pre; a++) {
180               for (CeedInt c = 0; c < post; c++) {
181                 CeedScalar v_k = 0;
182 
183                 for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
184                 out[a * post + c] = v_k;
185               }
186             }
187             post *= 1;
188           }
189         }
190       }
191     }
192   }
193 }
194 
195 //------------------------------------------------------------------------------
196 // Grad
197 //------------------------------------------------------------------------------
198 extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
199                                         const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
200   const CeedInt i = threadIdx.x;
201 
202   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
203   CeedScalar           *s_chebyshev_interp_1d = s_mem;
204   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
205   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
206   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
207   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
208   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
209     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
210   }
211 
212   const CeedInt P             = BASIS_P_1D;
213   const CeedInt Q             = BASIS_Q_1D;
214   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
215   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
216   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
217   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
218   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
219   const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
220   const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
221 
222   // Apply basis element by element
223   if (is_transpose) {
224     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
225       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
226         CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
227         CeedInt     pre   = 1;
228         CeedInt     post  = 1;
229 
230         // Clear Chebyshev coeffs
231         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
232           s_chebyshev_coeffs[k] = 0.0;
233         }
234 
235         // Map from point
236         __syncthreads();
237         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
238           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
239             const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
240 
241             pre  = 1;
242             post = 1;
243             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
244               // Update buffers used
245               pre /= 1;
246               const CeedScalar *in  = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
247               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
248 
249               // Build Chebyshev polynomial values
250               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
251               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
252 
253               // Contract along middle index
254               for (CeedInt a = 0; a < pre; a++) {
255                 for (CeedInt c = 0; c < post; c++) {
256                   if (dim_2 == BASIS_DIM - 1) {
257                     for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
258                   } else {
259                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
260                   }
261                 }
262               }
263               post *= Q;
264             }
265           }
266         }
267 
268         // Map from coefficients
269         pre  = BASIS_NUM_QPTS;
270         post = 1;
271         for (CeedInt d = 0; d < BASIS_DIM; d++) {
272           __syncthreads();
273           // Update buffers used
274           pre /= Q;
275           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
276           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
277           const CeedInt     writeLen = pre * post * P;
278 
279           // Contract along middle index
280           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
281             const CeedInt c   = k % post;
282             const CeedInt j   = (k / post) % P;
283             const CeedInt a   = k / (post * P);
284             CeedScalar    v_k = 0;
285 
286             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
287             if (d == BASIS_DIM - 1) out[k] += v_k;
288             else out[k] = v_k;
289           }
290           post *= P;
291         }
292       }
293     }
294   } else {
295     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
296       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
297         const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
298         CeedInt           pre   = u_size;
299         CeedInt           post  = 1;
300 
301         // Map to coefficients
302         for (CeedInt d = 0; d < BASIS_DIM; d++) {
303           __syncthreads();
304           // Update buffers used
305           pre /= P;
306           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
307           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
308           const CeedInt     writeLen = pre * post * Q;
309 
310           // Contract along middle index
311           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
312             const CeedInt c   = k % post;
313             const CeedInt j   = (k / post) % Q;
314             const CeedInt a   = k / (post * Q);
315             CeedScalar    v_k = 0;
316 
317             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
318             out[k] = v_k;
319           }
320           post *= Q;
321         }
322 
323         // Map to point
324         __syncthreads();
325         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
326           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
327             CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
328 
329             pre  = BASIS_NUM_QPTS;
330             post = 1;
331             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
332               // Update buffers used
333               pre /= Q;
334               const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
335               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);
336 
337               // Build Chebyshev polynomial values
338               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
339               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
340 
341               // Contract along middle index
342               for (CeedInt a = 0; a < pre; a++) {
343                 for (CeedInt c = 0; c < post; c++) {
344                   CeedScalar v_k = 0;
345 
346                   for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
347                   out[a * post + c] = v_k;
348                 }
349               }
350               post *= 1;
351             }
352           }
353         }
354       }
355     }
356   }
357 }
358