xref: /libCEED/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h (revision 1dda9c1ab34a6c4075a3e9d5ebc86d8c1194a9e1)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA tensor product basis with AtPoints evaluation
10 
11 #include <ceed.h>
12 
13 //------------------------------------------------------------------------------
14 // Chebyshev values
15 //------------------------------------------------------------------------------
16 template <int Q_1D>
17 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
18   chebyshev_x[0] = 1.0;
19   chebyshev_x[1] = 2 * x;
20   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
21 }
22 
23 template <int Q_1D>
24 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
25   CeedScalar chebyshev_x[3];
26 
27   chebyshev_x[1]  = 1.0;
28   chebyshev_x[2]  = 2 * x;
29   chebyshev_dx[0] = 0.0;
30   chebyshev_dx[1] = 2.0;
31   for (CeedInt i = 2; i < Q_1D; i++) {
32     chebyshev_x[0]  = chebyshev_x[1];
33     chebyshev_x[1]  = chebyshev_x[2];
34     chebyshev_x[2]  = 2 * x * chebyshev_x[1] - chebyshev_x[0];
35     chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[1] - chebyshev_dx[i - 2];
36   }
37 }
38 
39 //------------------------------------------------------------------------------
40 // Tensor Basis Kernels AtPoints
41 //------------------------------------------------------------------------------
42 
43 //------------------------------------------------------------------------------
44 // Interp
45 //------------------------------------------------------------------------------
46 extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
47                                           const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
48   const CeedInt i = threadIdx.x;
49 
50   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
51   CeedScalar           *s_chebyshev_interp_1d = s_mem;
52   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
53   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
54   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
55   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
56   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
57     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
58   }
59 
60   const CeedInt P             = BASIS_P_1D;
61   const CeedInt Q             = BASIS_Q_1D;
62   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
63   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
64   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
65   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
66   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
67 
68   // Apply basis element by element
69   if (is_transpose) {
70     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
71       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
72         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
73         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
74         CeedInt           pre   = 1;
75         CeedInt           post  = 1;
76 
77         // Clear Chebyshev coeffs
78         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
79           s_chebyshev_coeffs[k] = 0.0;
80         }
81 
82         // Map from point
83         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
84           pre  = 1;
85           post = 1;
86           for (CeedInt d = 0; d < BASIS_DIM; d++) {
87             // Update buffers used
88             pre /= 1;
89             const CeedScalar *in  = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1);
90             CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
91 
92             // Build Chebyshev polynomial values
93             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
94 
95             // Contract along middle index
96             for (CeedInt a = 0; a < pre; a++) {
97               for (CeedInt c = 0; c < post; c++) {
98                 if (d == BASIS_DIM - 1) {
99                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
100                 } else {
101                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
102                 }
103               }
104             }
105             post *= Q;
106           }
107         }
108 
109         // Map from coefficients
110         pre  = BASIS_NUM_QPTS;
111         post = 1;
112         for (CeedInt d = 0; d < BASIS_DIM; d++) {
113           __syncthreads();
114           // Update buffers used
115           pre /= Q;
116           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
117           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
118           const CeedInt     writeLen = pre * post * P;
119 
120           // Contract along middle index
121           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
122             const CeedInt c   = k % post;
123             const CeedInt j   = (k / post) % P;
124             const CeedInt a   = k / (post * P);
125             CeedScalar    v_k = 0;
126 
127             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
128             out[k] = v_k;
129           }
130           post *= P;
131         }
132       }
133     }
134   } else {
135     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
136       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
137         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
138         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
139         CeedInt           pre   = u_size;
140         CeedInt           post  = 1;
141 
142         // Map to coefficients
143         for (CeedInt d = 0; d < BASIS_DIM; d++) {
144           __syncthreads();
145           // Update buffers used
146           pre /= P;
147           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
148           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
149           const CeedInt     writeLen = pre * post * Q;
150 
151           // Contract along middle index
152           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
153             const CeedInt c   = k % post;
154             const CeedInt j   = (k / post) % Q;
155             const CeedInt a   = k / (post * Q);
156             CeedScalar    v_k = 0;
157 
158             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
159             out[k] = v_k;
160           }
161           post *= Q;
162         }
163 
164         // Map to point
165         __syncthreads();
166         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
167           pre  = BASIS_NUM_QPTS;
168           post = 1;
169           for (CeedInt d = 0; d < BASIS_DIM; d++) {
170             // Update buffers used
171             pre /= Q;
172             const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
173             CeedScalar       *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2);
174 
175             // Build Chebyshev polynomial values
176             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
177 
178             // Contract along middle index
179             for (CeedInt a = 0; a < pre; a++) {
180               for (CeedInt c = 0; c < post; c++) {
181                 CeedScalar v_k = 0;
182 
183                 for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
184                 out[a * post + c] = v_k;
185               }
186             }
187             post *= 1;
188           }
189         }
190       }
191     }
192   }
193 }
194 
195 //------------------------------------------------------------------------------
196 // Grad
197 //------------------------------------------------------------------------------
198 extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
199                                         const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
200   const CeedInt i = threadIdx.x;
201 
202   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN];
203   CeedScalar           *s_chebyshev_interp_1d = s_mem;
204   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
205   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
206   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
207   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
208   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
209     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
210   }
211 
212   const CeedInt P             = BASIS_P_1D;
213   const CeedInt Q             = BASIS_Q_1D;
214   const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
215   const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
216   const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
217   const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
218   const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
219   const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
220   const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
221 
222   // Apply basis element by element
223   if (is_transpose) {
224     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
225       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
226         CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
227         CeedInt     pre   = 1;
228         CeedInt     post  = 1;
229 
230         // Clear Chebyshev coeffs
231         for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
232           s_chebyshev_coeffs[k] = 0.0;
233         }
234 
235         // Map from point
236         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
237           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
238             const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
239 
240             pre  = 1;
241             post = 1;
242             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
243               // Update buffers used
244               pre /= 1;
245               const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
246               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
247 
248               // Build Chebyshev polynomial values
249               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
250               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
251 
252               // Contract along middle index
253               for (CeedInt a = 0; a < pre; a++) {
254                 for (CeedInt c = 0; c < post; c++) {
255                   if (dim_2 == BASIS_DIM - 1) {
256                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c];
257                   } else {
258                     for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
259                   }
260                 }
261               }
262               post *= Q;
263             }
264           }
265         }
266 
267         // Map from coefficients
268         pre  = BASIS_NUM_QPTS;
269         post = 1;
270         for (CeedInt d = 0; d < BASIS_DIM; d++) {
271           __syncthreads();
272           // Update buffers used
273           pre /= Q;
274           const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
275           CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
276           const CeedInt     writeLen = pre * post * P;
277 
278           // Contract along middle index
279           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
280             const CeedInt c   = k % post;
281             const CeedInt j   = (k / post) % P;
282             const CeedInt a   = k / (post * P);
283             CeedScalar    v_k = 0;
284 
285             for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
286             out[k] = v_k;
287           }
288           post *= P;
289         }
290       }
291     }
292   } else {
293     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
294       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
295         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
296         CeedInt           pre   = u_size;
297         CeedInt           post  = 1;
298 
299         // Map to coefficients
300         for (CeedInt d = 0; d < BASIS_DIM; d++) {
301           __syncthreads();
302           // Update buffers used
303           pre /= P;
304           const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
305           CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
306           const CeedInt     writeLen = pre * post * Q;
307 
308           // Contract along middle index
309           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
310             const CeedInt c   = k % post;
311             const CeedInt j   = (k / post) % Q;
312             const CeedInt a   = k / (post * Q);
313             CeedScalar    v_k = 0;
314 
315             for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
316             out[k] = v_k;
317           }
318           post *= Q;
319         }
320 
321         // Map to point
322         __syncthreads();
323         for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) {
324           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
325             CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
326 
327             pre  = BASIS_NUM_QPTS;
328             post = 1;
329             for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
330               // Update buffers used
331               pre /= Q;
332               const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
333               CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
334 
335               // Build Chebyshev polynomial values
336               if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
337               else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
338 
339               // Contract along middle index
340               for (CeedInt a = 0; a < pre; a++) {
341                 for (CeedInt c = 0; c < post; c++) {
342                   CeedScalar v_k = 0;
343 
344                   for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
345                   out[a * post + c] = v_k;
346                 }
347               }
348               post *= 1;
349             }
350           }
351         }
352       }
353     }
354   }
355 }
356