xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h (revision 70952158361b9d75c803a361a1c499ca025d805a)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA tensor product basis with AtPoints evaluation
10 
11 #include <ceed.h>
12 
13 //------------------------------------------------------------------------------
14 // Chebyshev values
15 //------------------------------------------------------------------------------
template <int Q_1D>
inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
  // Evaluate the first Q_1D Chebyshev polynomials at x into chebyshev_x[0..Q_1D-1]
  // via the three-term recurrence T_i(x) = 2 x T_{i-1}(x) - T_{i-2}(x).
  // Note: this basis is scaled so the degree-1 entry is 2x (not the classical T_1(x) = x),
  // matching the companion ChebyshevDerivativeAtPoint below.
  chebyshev_x[0] = 1.0;
  // Guard the degenerate Q_1D == 1 case: callers pass a buffer of exactly Q_1D entries,
  // so the original unconditional write to chebyshev_x[1] was out of bounds there
  if (Q_1D > 1) chebyshev_x[1] = 2 * x;
  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
}
22 
template <int Q_1D>
inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
  // Evaluate the derivatives of the first Q_1D (scaled) Chebyshev polynomials at x into
  // chebyshev_dx[0..Q_1D-1], differentiating the recurrence T_i = 2 x T_{i-1} - T_{i-2}:
  //   T'_i = 2 T_{i-1} + 2 x T'_{i-1} - T'_{i-2}
  // Only the last three polynomial values are needed, so they live in a 3-entry ring buffer.
  CeedScalar chebyshev_x[3];

  chebyshev_x[1]  = 1.0;    // T_0; ring slot for index (i - 2) when i == 2
  chebyshev_x[2]  = 2 * x;  // T_1 in the scaled basis; ring slot for index (i - 1) when i == 2
  chebyshev_dx[0] = 0.0;
  // Guard the degenerate Q_1D == 1 case: the caller's buffer holds exactly Q_1D entries,
  // so the original unconditional write to chebyshev_dx[1] was out of bounds there
  if (Q_1D > 1) chebyshev_dx[1] = 2.0;
  for (CeedInt i = 2; i < Q_1D; i++) {
    // Advance the ring buffer: slot (i + 1) % 3 receives T_i before the derivative update,
    // while slot (i + 0) % 3 still holds T_{i-1} for the 2 * T_{i-1} term below
    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
  }
}
36 
37 //------------------------------------------------------------------------------
38 // Tensor Basis Kernels AtPoints
39 //------------------------------------------------------------------------------
40 
41 //------------------------------------------------------------------------------
42 // Interp
43 //------------------------------------------------------------------------------
// Interpolate between element nodal values and arbitrary in-element points.
//
// Launch layout: 1-D grid, 1-D block. Each block walks elements elem = blockIdx.x,
// blockIdx.x + gridDim.x, ...; threads within the block cooperate on one element at a time.
// All BASIS_* / POINTS_BUFF_LEN sizes are macros injected at JIT compile time.
//
// is_transpose == 0: u holds nodal values; v receives values interpolated to the points
//                    whose reference coordinates are in coords (v is overwritten).
// is_transpose != 0: u holds values at points; v receives nodal values and is ACCUMULATED
//                    into with += — the caller must supply v zeroed or holding a running sum.
//
// The interpolation goes through an intermediate Chebyshev coefficient representation:
// nodes <-> Chebyshev coefficients (via chebyshev_interp_1d, one tensor dimension at a
// time) <-> point values (via Chebyshev polynomial evaluation at each point's coordinate).
extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [1-D interp matrix | contraction buffer 1 | contraction buffer 2 | Chebyshev coefficients].
  // NOTE(review): the coefficient slab is POINTS_BUFF_LEN * BASIS_Q_1D entries but is indexed with up to
  // BASIS_NUM_QPTS entries below — assumes POINTS_BUFF_LEN * BASIS_Q_1D >= BASIS_NUM_QPTS; confirm at the macro definition site.
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar           *s_chebyshev_interp_1d = s_mem;
  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch: Chebyshev values for one coordinate, plus two ping-pong buffers
  // used when a single thread owns a whole point's tensor contraction
  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Stage the 1-D interpolation matrix in shared memory; its first read below always
  // follows a __syncthreads(), so no extra barrier is needed here
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  // u/v strides swap roles depending on direction: nodes on one side, points on the other
  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;

  // Apply basis element by element
  if (is_transpose) {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt           pre   = 1;
        CeedInt           post  = 1;

        // Clear Chebyshev coeffs
        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
          s_chebyshev_coeffs[k] = 0.0;
        }

        // Map from point: each thread owns one point at a time and scatters that point's
        // contribution into the shared coefficient array
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          // Points are padded to BASIS_NUM_PTS across elements; skip the padding
          if (p >= points_per_elem[elem]) continue;
          pre  = 1;
          post = 1;
          for (CeedInt d = 0; d < BASIS_DIM; d++) {
            // Update buffers used (pre stays 1 here; kept for symmetry with the contraction pattern)
            pre /= 1;
            // Ping-pong between the two private buffers; first input is the point value,
            // last output scatters into shared coefficients
            const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
            CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values
            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                if (d == BASIS_DIM - 1) {
                  // Rotate the j index by p so concurrent points start their atomic updates at
                  // different addresses; (j + p) % Q sweeps the same index set, so the sum is unchanged
                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                } else {
                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                }
              }
            }
            post *= Q;
          }
        }

        // Map from coefficients: block-cooperative contraction, one tensor dimension per pass,
        // ping-ponging between the two shared buffers
        pre  = BASIS_NUM_QPTS;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier before each pass: all writes from the previous pass (or the atomic
          // scatter above, for d == 0) must be visible before they are read
          __syncthreads();
          // Update buffers used
          pre /= Q;
          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt     writeLen = pre * post * P;

          // Contract along middle index
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c   = k % post;
            const CeedInt j   = (k / post) % P;
            const CeedInt a   = k / (post * P);
            CeedScalar    v_k = 0;

            // Transposed application of the interp matrix: column index j, row index b
            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
            // Final pass accumulates into global v; intermediate passes overwrite scratch
            if (d == BASIS_DIM - 1) out[k] += v_k;
            else out[k] = v_k;
          }
          post *= P;
        }
      }
    }
  } else {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt           pre   = u_size;
        CeedInt           post  = 1;

        // Map to coefficients: block-cooperative contraction of nodal values with the
        // interp matrix, one tensor dimension per pass
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier before each pass so the previous pass's shared writes are visible
          __syncthreads();
          // Update buffers used
          pre /= P;
          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt     writeLen = pre * post * Q;

          // Contract along middle index
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c   = k % post;
            const CeedInt j   = (k / post) % Q;
            const CeedInt a   = k / (post * Q);
            CeedScalar    v_k = 0;

            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
            out[k] = v_k;
          }
          post *= Q;
        }

        // Map to point: each thread evaluates the coefficient tensor at one point
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          pre  = BASIS_NUM_QPTS;
          post = 1;
          for (CeedInt d = 0; d < BASIS_DIM; d++) {
            // Update buffers used
            pre /= Q;
            const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
            CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values
            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);

            // Contract along middle index (each pass consumes one Q-length axis, so the
            // output footprint is pre * post and post never grows)
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                CeedScalar v_k = 0;

                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
                out[a * post + c] = v_k;
              }
            }
            post *= 1;
          }
        }
      }
    }
  }
}
196 
197 //------------------------------------------------------------------------------
198 // Grad
199 //------------------------------------------------------------------------------
// Evaluate gradients between element nodal values and arbitrary in-element points.
//
// Same structure, launch layout, and shared-memory plan as InterpAtPoints above, with one
// extra loop: for each gradient direction dim_1, the Chebyshev DERIVATIVE is used along
// dimension dim_1 and plain Chebyshev VALUES along every other dimension.
//
// is_transpose == 0: u holds nodal values; v receives, per direction dim_1 (strided by
//                    v_dim_stride), the dim_1-derivative at each point (overwritten).
// is_transpose != 0: u holds per-direction point values (strided by u_dim_stride); v
//                    receives nodal values and is ACCUMULATED into with += — the caller
//                    must supply v zeroed or holding a running sum.
extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [1-D interp matrix | contraction buffer 1 | contraction buffer 2 | Chebyshev coefficients].
  // NOTE(review): the coefficient slab is POINTS_BUFF_LEN * BASIS_Q_1D entries but is indexed with up to
  // BASIS_NUM_QPTS entries below — assumes POINTS_BUFF_LEN * BASIS_Q_1D >= BASIS_NUM_QPTS; confirm at the macro definition site.
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar           *s_chebyshev_interp_1d = s_mem;
  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch: Chebyshev values/derivatives for one coordinate, plus two
  // ping-pong buffers for a single thread's per-point tensor contraction
  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Stage the 1-D interpolation matrix in shared memory; its first read below always
  // follows a __syncthreads(), so no extra barrier is needed here
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  // u/v strides swap roles depending on direction: nodes on one side, points on the other
  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  // The gradient-direction axis lives on whichever side holds point data (stride 0 on the nodal side)
  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;

  // Apply basis element by element
  if (is_transpose) {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt     pre   = 1;
        CeedInt     post  = 1;

        // Clear Chebyshev coeffs
        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
          s_chebyshev_coeffs[k] = 0.0;
        }

        // Map from point: each thread owns one point and scatters the contribution of all
        // BASIS_DIM gradient directions into the shared coefficient array
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          // Points are padded to BASIS_NUM_PTS across elements; skip the padding
          if (p >= points_per_elem[elem]) continue;
          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
            const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];

            pre  = 1;
            post = 1;
            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
              // Update buffers used (pre stays 1 here; kept for symmetry with the contraction pattern)
              pre /= 1;
              // Ping-pong between the two private buffers; first input is the point value,
              // last output scatters into shared coefficients
              const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);

              // Build Chebyshev polynomial values: derivative along the gradient direction,
              // plain values along all other dimensions
              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);

              // Contract along middle index
              for (CeedInt a = 0; a < pre; a++) {
                for (CeedInt c = 0; c < post; c++) {
                  if (dim_2 == BASIS_DIM - 1) {
                    // Rotate the j index by p so concurrent points start their atomic updates at
                    // different addresses; (j + p) % Q sweeps the same index set, so the sum is unchanged
                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                  } else {
                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                  }
                }
              }
              post *= Q;
            }
          }
        }

        // Map from coefficients: block-cooperative contraction, one tensor dimension per
        // pass, ping-ponging between the two shared buffers
        pre  = BASIS_NUM_QPTS;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier before each pass: all writes from the previous pass (or the atomic
          // scatter above, for d == 0) must be visible before they are read
          __syncthreads();
          // Update buffers used
          pre /= Q;
          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt     writeLen = pre * post * P;

          // Contract along middle index
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c   = k % post;
            const CeedInt j   = (k / post) % P;
            const CeedInt a   = k / (post * P);
            CeedScalar    v_k = 0;

            // Transposed application of the interp matrix: column index j, row index b
            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
            // Final pass accumulates into global v; intermediate passes overwrite scratch
            if (d == BASIS_DIM - 1) out[k] += v_k;
            else out[k] = v_k;
          }
          post *= P;
        }
      }
    }
  } else {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedInt           pre   = u_size;
        CeedInt           post  = 1;

        // Map to coefficients: block-cooperative contraction of nodal values with the
        // interp matrix, one tensor dimension per pass
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier before each pass so the previous pass's shared writes are visible
          __syncthreads();
          // Update buffers used
          pre /= P;
          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt     writeLen = pre * post * Q;

          // Contract along middle index
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c   = k % post;
            const CeedInt j   = (k / post) % Q;
            const CeedInt a   = k / (post * Q);
            CeedScalar    v_k = 0;

            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
            out[k] = v_k;
          }
          post *= Q;
        }

        // Map to point: each thread evaluates all BASIS_DIM directional derivatives of the
        // coefficient tensor at one point
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
            CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];

            pre  = BASIS_NUM_QPTS;
            post = 1;
            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
              // Update buffers used
              pre /= Q;
              const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);

              // Build Chebyshev polynomial values: derivative along the gradient direction,
              // plain values along all other dimensions
              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);

              // Contract along middle index (each pass consumes one Q-length axis, so the
              // output footprint is pre * post and post never grows)
              for (CeedInt a = 0; a < pre; a++) {
                for (CeedInt c = 0; c < post; c++) {
                  CeedScalar v_k = 0;

                  for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
                  out[a * post + c] = v_k;
                }
              }
              post *= 1;
            }
          }
        }
      }
    }
  }
}
362