xref: /libCEED/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h (revision d15030167e504f21b69a912dc34420b169f681b5)
1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA tensor product basis with AtPoints evaluation
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Chebyshev values
14 //------------------------------------------------------------------------------
template <int Q_1D>
inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
  // Evaluate the first Q_1D Chebyshev polynomials of the second kind at x via
  // the three-term recurrence U_0 = 1, U_1 = 2 x, U_i = 2 x U_{i-1} - U_{i-2}.
  // chebyshev_x must have room for Q_1D values.
  chebyshev_x[0] = 1.0;
  // Guard the second entry so a degenerate Q_1D == 1 basis does not write past
  // the end of a caller-provided array of length Q_1D; the branch is resolved
  // at compile time since Q_1D is a template parameter
  if (Q_1D > 1) chebyshev_x[1] = 2 * x;
  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
}
21 
template <int Q_1D>
inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
  // Evaluate the derivatives of the first Q_1D Chebyshev polynomials (second
  // kind) at x. chebyshev_dx must have room for Q_1D values. Only the last
  // three polynomial values are needed, so they live in a rolling mod-3 buffer.
  CeedScalar chebyshev_x[3];

  // Rolling-buffer seed: at loop entry (i == 2), slot (i + 0) % 3 == 2 holds
  // U_1 = 2 x and slot (i + 2) % 3 == 1 holds U_0 = 1
  chebyshev_x[1]  = 1.0;
  chebyshev_x[2]  = 2 * x;
  // U_0' = 0 and U_1' = 2
  chebyshev_dx[0] = 0.0;
  chebyshev_dx[1] = 2.0;
  // NOTE(review): chebyshev_dx[1] is written unconditionally — assumes
  // Q_1D >= 2 like the value version; confirm callers never instantiate Q_1D == 1
  for (CeedInt i = 2; i < Q_1D; i++) {
    // Value recurrence U_i = 2 x U_{i-1} - U_{i-2}, written into the slot that
    // held U_{i-3}; (i + 0) % 3 holds U_{i-1}, (i + 2) % 3 holds U_{i-2}
    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
    // Differentiated recurrence: U_i' = 2 x U_{i-1}' + 2 U_{i-1} - U_{i-2}'
    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
  }
}
35 
36 //------------------------------------------------------------------------------
37 // Tensor Basis Kernels AtPoints
38 //------------------------------------------------------------------------------
39 
40 //------------------------------------------------------------------------------
41 // Interp
42 //------------------------------------------------------------------------------
// Interpolate nodal values u to arbitrary points inside each element.
//
// Two phases per element/component: (1) a block-cooperative tensor contraction
// mapping nodal values to Chebyshev coefficients in shared memory, then (2) a
// per-thread evaluation of the coefficient expansion at each point's coordinates.
//
// Launch expectations: blockIdx.x strides over elements; threadIdx.x is a flat
// thread index shared by the contraction and per-point loops. All sizes come
// from JIT-time macros (BASIS_P_1D, BASIS_Q_1D, BASIS_DIM, BASIS_NUM_COMP,
// BASIS_NUM_NODES, BASIS_NUM_PTS, BASIS_NUM_QPTS, BASIS_BUF_LEN, POINTS_BUFF_LEN).
//
// points_per_elem is unused here: values are computed for all BASIS_NUM_PTS
// slots, including any padding points. coords is laid out like v (element
// stride BASIS_NUM_PTS, dimension as the component index).
extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared scratch, partitioned as: 1D interp matrix | ping-pong contraction
  // buffer 1 | ping-pong contraction buffer 2 | Chebyshev coefficients
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar           *s_chebyshev_interp_1d = s_mem;
  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread (register/local) scratch for the per-point evaluation phase
  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperatively stage the Q x P 1D interpolation matrix in shared memory;
  // first read is after the __syncthreads() at the top of the d-loop below
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  const CeedInt u_stride      = BASIS_NUM_NODES;
  const CeedInt v_stride      = BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt u_size        = BASIS_NUM_NODES;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
      // pre/post track the tensor shape around the contracted index: the data
      // is viewed as (pre x P x post) and each pass contracts P down to Q
      CeedInt           pre   = u_size;
      CeedInt           post  = 1;

      // Map to coefficients: one pass per dimension, ping-ponging between the
      // shared buffers; the last pass lands in s_chebyshev_coeffs
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();
        // Update buffers used
        pre /= P;
        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt     writeLen = pre * post * Q;

        // Contract along middle index
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c   = k % post;
          const CeedInt j   = (k / post) % Q;
          const CeedInt a   = k / (post * Q);
          CeedScalar    v_k = 0;

          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
          out[k] = v_k;
        }
        post *= Q;
      }

      // Map to point: each thread independently evaluates the coefficient
      // expansion at one point (no further block cooperation, hence the
      // register-local buffers)
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        pre  = BASIS_NUM_QPTS;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Update buffers used
          pre /= Q;
          const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
          CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);

          // Build Chebyshev polynomial values
          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);

          // Contract along middle index
          for (CeedInt a = 0; a < pre; a++) {
            for (CeedInt c = 0; c < post; c++) {
              CeedScalar v_k = 0;

              for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
              out[a * post + c] = v_k;
            }
          }
          // Intentional no-op: each pass collapses the Q values of one
          // dimension to a single scalar at the point, so post stays 1;
          // kept for symmetry with the coefficient-mapping loop above
          post *= 1;
        }
      }
    }
  }
}
125 
// Transpose of InterpAtPoints: scatter values at arbitrary points back to the
// element's nodal degrees of freedom, accumulating into v.
//
// Two phases per element/component: (1) each thread scatters its point's value
// into the shared Chebyshev coefficient array (atomicAdd, since many points
// update the same coefficients), then (2) a block-cooperative contraction with
// the transposed 1D matrix maps coefficients back to nodes, adding into v on
// the final pass — callers must initialize v.
//
// Launch expectations and macro sizing are the same as InterpAtPoints. Unlike
// the forward kernel, padding points are skipped via points_per_elem so they
// contribute nothing. coords is laid out like u (element stride BASIS_NUM_PTS,
// dimension as the component index).
extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared scratch, partitioned as: 1D interp matrix | ping-pong contraction
  // buffer 1 | ping-pong contraction buffer 2 | Chebyshev coefficients
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar           *s_chebyshev_interp_1d = s_mem;
  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread (register/local) scratch for the per-point scatter phase
  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperatively stage the Q x P 1D interpolation matrix in shared memory
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  const CeedInt u_stride      = BASIS_NUM_PTS;
  const CeedInt v_stride      = BASIS_NUM_NODES;
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
  const CeedInt u_size        = BASIS_NUM_PTS;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
      CeedInt           pre   = 1;
      CeedInt           post  = 1;

      // Clear Chebyshev coeffs (accumulated via atomicAdd below)
      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
        s_chebyshev_coeffs[k] = 0.0;
      }

      // Map from point: each thread expands its point's value by the Chebyshev
      // polynomial tensor and accumulates into the shared coefficient array
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        // Skip padding points so they add no spurious contribution
        if (p >= points_per_elem[elem]) continue;
        pre  = 1;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Update buffers used; pre stays 1 — the intentional no-op mirrors
          // the pre /= Q of the coefficient-mapping loop
          pre /= 1;
          const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
          CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);

          // Build Chebyshev polynomial values
          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);

          // Contract along middle index
          for (CeedInt a = 0; a < pre; a++) {
            for (CeedInt c = 0; c < post; c++) {
              if (d == BASIS_DIM - 1) {
                // Final pass writes shared coefficients: atomicAdd because
                // many points hit the same entries; the (j + p) % Q stagger
                // starts each thread at a different index, apparently to
                // spread concurrent atomics across distinct addresses —
                // the full j = 0..Q-1 range is still covered
                for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
              } else {
                // Intermediate passes stay in this thread's private buffers
                for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
              }
            }
          }
          post *= Q;
        }
      }

      // Map from coefficients: block-cooperative transposed contractions,
      // Q down to P per dimension
      pre  = BASIS_NUM_QPTS;
      post = 1;
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();
        // Update buffers used
        pre /= Q;
        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt     writeLen = pre * post * P;

        // Contract along middle index
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c   = k % post;
          const CeedInt j   = (k / post) % P;
          const CeedInt a   = k / (post * P);
          CeedScalar    v_k = 0;

          // Transposed matrix access: j + b * P instead of j * P + b
          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
          // Last pass accumulates into the caller's output; earlier passes
          // overwrite the scratch buffer
          if (d == BASIS_DIM - 1) out[k] += v_k;
          else out[k] = v_k;
        }
        post *= P;
      }
    }
  }
}
218 
219 //------------------------------------------------------------------------------
220 // Grad
221 //------------------------------------------------------------------------------
// Evaluate the gradient of nodal values u at arbitrary points inside each
// element.
//
// Same structure as InterpAtPoints — nodal-to-coefficient contractions in
// shared memory, then per-thread evaluation at each point — but the point
// phase runs once per derivative direction dim_1, substituting the Chebyshev
// derivative values for the polynomial values in that direction only.
//
// Launch expectations and macro sizing match InterpAtPoints. points_per_elem
// is unused here; all BASIS_NUM_PTS slots are evaluated. Output v has an extra
// dimension index with stride v_dim_stride (outermost, over components).
extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared scratch, partitioned as: 1D interp matrix | ping-pong contraction
  // buffer 1 | ping-pong contraction buffer 2 | Chebyshev coefficients
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar           *s_chebyshev_interp_1d = s_mem;
  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread (register/local) scratch for the per-point evaluation phase
  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperatively stage the Q x P 1D interpolation matrix in shared memory
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  const CeedInt u_stride      = BASIS_NUM_NODES;
  const CeedInt v_stride      = BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt u_size        = BASIS_NUM_NODES;
  // NOTE(review): u_dim_stride is declared but never read in this kernel —
  // the input has no dimension index; kept for symmetry with the transpose
  const CeedInt u_dim_stride  = 0;
  const CeedInt v_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
      // pre/post track the tensor shape around the contracted index
      CeedInt           pre   = u_size;
      CeedInt           post  = 1;

      // Map to coefficients: identical to InterpAtPoints — the derivative is
      // taken later, in Chebyshev space, so only one coefficient set is needed
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();
        // Update buffers used
        pre /= P;
        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt     writeLen = pre * post * Q;

        // Contract along middle index
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c   = k % post;
          const CeedInt j   = (k / post) % Q;
          const CeedInt a   = k / (post * Q);
          CeedScalar    v_k = 0;

          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
          out[k] = v_k;
        }
        post *= Q;
      }

      // Map to point: one full tensor evaluation per derivative direction
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
          // dim_1 selects both the output slice and which direction gets the
          // derivative basis
          CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];

          pre  = BASIS_NUM_QPTS;
          post = 1;
          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
            // Update buffers used
            pre /= Q;
            const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values: derivative along the dim_1
            // direction, plain values along the others
            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                CeedScalar v_k = 0;

                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
                out[a * post + c] = v_k;
              }
            }
            // Intentional no-op: each pass collapses Q values to one scalar
            // at the point, so post stays 1
            post *= 1;
          }
        }
      }
    }
  }
}
310 
// Transpose of GradAtPoints: scatter per-direction point values back to the
// element's nodal degrees of freedom, accumulating into v.
//
// Same structure as InterpTransposeAtPoints, but the per-point scatter runs
// once per input derivative direction dim_1 (all directions accumulate into
// one shared coefficient array via atomicAdd), followed by a single
// coefficient-to-node contraction that adds into v on the final pass —
// callers must initialize v. Padding points are skipped via points_per_elem.
// Input u has an extra dimension index with stride u_dim_stride.
extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                                 const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared scratch, partitioned as: 1D interp matrix | ping-pong contraction
  // buffer 1 | ping-pong contraction buffer 2 | Chebyshev coefficients
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar           *s_chebyshev_interp_1d = s_mem;
  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread (register/local) scratch for the per-point scatter phase
  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperatively stage the Q x P 1D interpolation matrix in shared memory
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  const CeedInt u_stride      = BASIS_NUM_PTS;
  const CeedInt v_stride      = BASIS_NUM_NODES;
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
  // NOTE(review): u_size and v_dim_stride are declared but never read in this
  // kernel; kept for symmetry with the forward kernel
  const CeedInt u_size        = BASIS_NUM_PTS;
  const CeedInt u_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
  const CeedInt v_dim_stride  = 0;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
      CeedInt     pre   = 1;
      CeedInt     post  = 1;

      // Clear Chebyshev coeffs (accumulated via atomicAdd below)
      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
        s_chebyshev_coeffs[k] = 0.0;
      }

      // Map from point: each thread expands its point's value, once per input
      // derivative direction, and accumulates into the shared coefficients
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        // Skip padding points so they add no spurious contribution
        if (p >= points_per_elem[elem]) continue;
        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
          // dim_1 selects both the input slice and which direction gets the
          // derivative basis
          const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];

          pre  = 1;
          post = 1;
          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
            // Update buffers used; pre stays 1 — the intentional no-op mirrors
            // the pre /= Q of the coefficient-mapping loop
            pre /= 1;
            const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values: derivative along the dim_1
            // direction, plain values along the others
            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                if (dim_2 == BASIS_DIM - 1) {
                  // Final pass writes shared coefficients: atomicAdd because
                  // many points hit the same entries; the (j + p) % Q stagger
                  // starts each thread at a different index, apparently to
                  // spread concurrent atomics across distinct addresses —
                  // the full j = 0..Q-1 range is still covered
                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                } else {
                  // Intermediate passes stay in this thread's private buffers
                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                }
              }
            }
            post *= Q;
          }
        }
      }

      // Map from coefficients: block-cooperative transposed contractions,
      // Q down to P per dimension
      pre  = BASIS_NUM_QPTS;
      post = 1;
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();
        // Update buffers used
        pre /= Q;
        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt     writeLen = pre * post * P;

        // Contract along middle index
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c   = k % post;
          const CeedInt j   = (k / post) % P;
          const CeedInt a   = k / (post * P);
          CeedScalar    v_k = 0;

          // Transposed matrix access: j + b * P instead of j * P + b
          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
          // Last pass accumulates into the caller's output; earlier passes
          // overwrite the scratch buffer
          if (d == BASIS_DIM - 1) out[k] += v_k;
          else out[k] = v_k;
        }
        post *= P;
      }
    }
  }
}
409