xref: /libCEED/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h (revision 26ef7cdab87216015dfdd84a70b0ca05f3e531f1)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
/// Internal header for HIP tensor product basis with AtPoints evaluation
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Chebyshev values
14 //------------------------------------------------------------------------------
template <int Q_1D>
inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
  // Evaluate the first Q_1D Chebyshev polynomials of the second kind at x:
  //   U_0(x) = 1, U_1(x) = 2 x, U_i(x) = 2 x U_{i-1}(x) - U_{i-2}(x)
  // chebyshev_x must hold at least Q_1D entries.
  chebyshev_x[0] = 1.0;
  // Guard: for Q_1D == 1 the unconditional write to chebyshev_x[1] would be
  // out of bounds; Q_1D is a compile-time constant so this branch folds away.
  if (Q_1D < 2) return;
  chebyshev_x[1] = 2 * x;
  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
}
21 
template <int Q_1D>
inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
  // Evaluate derivatives of the first Q_1D second-kind Chebyshev polynomials at x.
  // Differentiating the recurrence U_i = 2 x U_{i-1} - U_{i-2} gives
  //   U_i' = 2 x U_{i-1}' + 2 U_{i-1} - U_{i-2}'
  // Only the last three polynomial values are needed, so they are kept in a
  // rolling 3-entry buffer rather than a full Q_1D array.
  CeedScalar chebyshev_x[3];

  chebyshev_dx[0] = 0.0;
  // Guard: for Q_1D == 1 the unconditional write to chebyshev_dx[1] would be
  // out of bounds; Q_1D is a compile-time constant so this branch folds away.
  if (Q_1D < 2) return;
  chebyshev_x[1]  = 1.0;    // U_0
  chebyshev_x[2]  = 2 * x;  // U_1
  chebyshev_dx[1] = 2.0;    // U_1' = 2
  for (CeedInt i = 2; i < Q_1D; i++) {
    // Slot (i+1)%3 receives U_i; (i+0)%3 holds U_{i-1}, (i+2)%3 holds U_{i-2}
    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
  }
}
35 
36 //------------------------------------------------------------------------------
37 // Tensor Basis Kernels AtPoints
38 //------------------------------------------------------------------------------
39 
40 //------------------------------------------------------------------------------
41 // Interp
42 //------------------------------------------------------------------------------
// Interpolate between nodal values and values at arbitrary points through a
// Chebyshev expansion of the element solution.
//
// Launch layout: 1D grid, 1D block. Blocks stride over elements
// (elem = blockIdx.x, += gridDim.x); threads in a block cooperate on the
// shared-memory tensor contractions and stride over points via threadIdx.x.
//
// is_transpose == 0: u (nodes) -> v (points).
//   Phase 1 contracts nodal values against chebyshev_interp_1d, one dimension
//   at a time, producing Chebyshev coefficients in shared memory. Phase 2 has
//   each thread evaluate that expansion at its points.
// is_transpose != 0: u (points) -> v (nodes).
//   The phases run in reverse: point contributions are accumulated into the
//   shared coefficients with atomicAdd, then contracted back to nodes,
//   accumulating (+=) into v on the last dimension.
//
// u/v layout (from the stride expressions below): element stride is
// NUM_NODES/NUM_PTS within a component, component stride is num_elem * size.
// coords uses the point-space strides for all BASIS_DIM coordinates.
extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Single static shared allocation, carved into: 1D interp matrix (Q*P),
  // two ping-pong contraction buffers, and the Chebyshev coefficient block.
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar           *s_chebyshev_interp_1d = s_mem;
  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch: 1D Chebyshev values and per-point ping-pong buffers
  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperative copy of the 1D interp matrix into shared memory; first read
  // is separated from this write by the __syncthreads() inside each branch.
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;

  // Apply basis element by element
  if (is_transpose) {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt           pre   = 1;
        CeedInt           post  = 1;

        // Clear Chebyshev coeffs
        // NOTE(review): for BASIS_DIM == 1 this clear is not separated by a
        // barrier from the previous component's reads of s_chebyshev_coeffs
        // (the d-loop below only syncs at iteration start) — confirm.
        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
          s_chebyshev_coeffs[k] = 0.0;
        }

        // Map from point
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          // Skip padding points beyond this element's actual point count
          if (p >= points_per_elem[elem]) continue;
          pre  = 1;
          post = 1;
          for (CeedInt d = 0; d < BASIS_DIM; d++) {
            // Update buffers used
            pre /= 1;  // no-op (input dim size is 1); mirrors the contraction pattern below
            const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
            CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values
            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                if (d == BASIS_DIM - 1) {
                  // Last dimension writes to shared coefficients; atomicAdd
                  // because multiple points hit the same coefficient. The
                  // (j + p) % Q rotation reorders each point's update sequence
                  // (covering all j), presumably to stagger atomic contention.
                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                } else {
                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                }
              }
            }
            post *= Q;
          }
        }

        // Map from coefficients
        pre  = BASIS_NUM_QPTS;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier before each dimension: the previous dimension's (or the
          // atomicAdd phase's) shared writes must complete before reading
          __syncthreads();
          // Update buffers used
          pre /= Q;
          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt     writeLen = pre * post * P;

          // Contract along middle index; one output entry per k, threads
          // stride the flattened pre x P x post index space
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c   = k % post;
            const CeedInt j   = (k / post) % P;
            const CeedInt a   = k / (post * P);
            CeedScalar    v_k = 0;

            // Transpose contraction: interp matrix indexed [j + b * P]
            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
            // Accumulate into the global output on the final dimension
            if (d == BASIS_DIM - 1) out[k] += v_k;
            else out[k] = v_k;
          }
          post *= P;
        }
      }
    }
  } else {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt           pre   = u_size;
        CeedInt           post  = 1;

        // Map to coefficients: contract nodes -> Chebyshev coefficients,
        // one dimension per pass, ping-ponging between shared buffers
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          __syncthreads();
          // Update buffers used
          pre /= P;
          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt     writeLen = pre * post * Q;

          // Contract along middle index
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c   = k % post;
            const CeedInt j   = (k / post) % Q;
            const CeedInt a   = k / (post * Q);
            CeedScalar    v_k = 0;

            // Forward contraction: interp matrix indexed [j * P + b]
            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
            out[k] = v_k;
          }
          post *= Q;
        }

        // Map to point: each thread evaluates the Chebyshev expansion at its
        // own points using private per-point buffers (coefficients are only read)
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          pre  = BASIS_NUM_QPTS;
          post = 1;
          for (CeedInt d = 0; d < BASIS_DIM; d++) {
            // Update buffers used
            pre /= Q;
            const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
            CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values
            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                CeedScalar v_k = 0;

                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
                out[a * post + c] = v_k;
              }
            }
            post *= 1;  // no-op (contracted output dim size is 1); mirrors the pattern above
          }
        }
      }
    }
  }
}
195 
196 //------------------------------------------------------------------------------
197 // Grad
198 //------------------------------------------------------------------------------
// Gradient between nodal values and arbitrary points through a Chebyshev
// expansion. Structure matches InterpAtPoints, with an extra loop over the
// derivative direction dim_1: for each dim_1 the 1D Chebyshev *derivative*
// values are used along dim_2 == dim_1 and plain Chebyshev values along the
// other dimensions.
//
// Launch layout: 1D grid, 1D block. Blocks stride over elements; threads
// cooperate on shared-memory contractions and stride over points.
//
// is_transpose == 0: u (nodes) -> v (points x dims); v has an additional
//   per-dimension stride v_dim_stride = num_elem * NUM_PTS * NUM_COMP.
// is_transpose != 0: u (points x dims) -> v (nodes); contributions from all
//   dims are atomically accumulated into one shared coefficient block, then
//   mapped back to nodes, accumulating (+=) into v on the last dimension.
extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Single static shared allocation, carved into: 1D interp matrix (Q*P),
  // two ping-pong contraction buffers, and the Chebyshev coefficient block.
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar           *s_chebyshev_interp_1d = s_mem;
  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch: 1D Chebyshev values and per-point ping-pong buffers
  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperative copy of the 1D interp matrix into shared memory; first read
  // is separated from this write by the __syncthreads() inside each branch.
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  // Only the point-space operand carries a dimension stride; the nodal side
  // has none (stride 0 / unused)
  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;

  // Apply basis element by element
  if (is_transpose) {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt     pre   = 1;
        CeedInt     post  = 1;

        // Clear Chebyshev coeffs
        // NOTE(review): for BASIS_DIM == 1 this clear is not separated by a
        // barrier from the previous component's reads of s_chebyshev_coeffs
        // (the d-loop below only syncs at iteration start) — confirm.
        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
          s_chebyshev_coeffs[k] = 0.0;
        }

        // Map from point
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          // Skip padding points beyond this element's actual point count
          if (p >= points_per_elem[elem]) continue;
          // All dim_1 contributions accumulate into the same shared
          // coefficient block (summed via the atomicAdd below)
          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
            const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];

            pre  = 1;
            post = 1;
            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
              // Update buffers used
              pre /= 1;  // no-op (input dim size is 1); mirrors the contraction pattern below
              const CeedScalar *in  = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);

              // Build Chebyshev polynomial values
              // Derivative values along the gradient direction, plain values otherwise
              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);

              // Contract along middle index
              for (CeedInt a = 0; a < pre; a++) {
                for (CeedInt c = 0; c < post; c++) {
                  if (dim_2 == BASIS_DIM - 1) {
                    // atomicAdd: multiple points (and all dim_1 passes) hit the
                    // same coefficient. The (j + p) % Q rotation reorders each
                    // point's update sequence (covering all j), presumably to
                    // stagger atomic contention.
                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                  } else {
                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                  }
                }
              }
              post *= Q;
            }
          }
        }

        // Map from coefficients
        pre  = BASIS_NUM_QPTS;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier before each dimension: the previous dimension's (or the
          // atomicAdd phase's) shared writes must complete before reading
          __syncthreads();
          // Update buffers used
          pre /= Q;
          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt     writeLen = pre * post * P;

          // Contract along middle index; one output entry per k, threads
          // stride the flattened pre x P x post index space
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c   = k % post;
            const CeedInt j   = (k / post) % P;
            const CeedInt a   = k / (post * P);
            CeedScalar    v_k = 0;

            // Transpose contraction: interp matrix indexed [j + b * P]
            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
            // Accumulate into the global output on the final dimension
            if (d == BASIS_DIM - 1) out[k] += v_k;
            else out[k] = v_k;
          }
          post *= P;
        }
      }
    }
  } else {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedInt           pre   = u_size;
        CeedInt           post  = 1;

        // Map to coefficients: contract nodes -> Chebyshev coefficients,
        // one dimension per pass, ping-ponging between shared buffers
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          __syncthreads();
          // Update buffers used
          pre /= P;
          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt     writeLen = pre * post * Q;

          // Contract along middle index
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c   = k % post;
            const CeedInt j   = (k / post) % Q;
            const CeedInt a   = k / (post * Q);
            CeedScalar    v_k = 0;

            // Forward contraction: interp matrix indexed [j * P + b]
            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
            out[k] = v_k;
          }
          post *= Q;
        }

        // Map to point: each thread evaluates one gradient component per
        // dim_1, writing to the dim_1 slice of v; coefficients are only read
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
            CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];

            pre  = BASIS_NUM_QPTS;
            post = 1;
            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
              // Update buffers used
              pre /= Q;
              const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);

              // Build Chebyshev polynomial values
              // Derivative values along the gradient direction, plain values otherwise
              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);

              // Contract along middle index
              for (CeedInt a = 0; a < pre; a++) {
                for (CeedInt c = 0; c < post; c++) {
                  CeedScalar v_k = 0;

                  for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
                  out[a * post + c] = v_k;
                }
              }
              post *= 1;  // no-op (contracted output dim size is 1); mirrors the pattern above
            }
          }
        }
      }
    }
  }
}
361