1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 /// @file
9 /// Internal header for CUDA tensor product basis with AtPoints evaluation
10 #include <ceed/types.h>
11
12 //------------------------------------------------------------------------------
13 // Chebyshev values
14 //------------------------------------------------------------------------------
// Evaluate the first Q_1D scaled Chebyshev polynomials at the point x.
//
// Uses the recurrence T_i(x) = 2 x T_{i-1}(x) - T_{i-2}(x) with the scaled
// seed T_1 = 2 x (matching the convention of the chebyshev_interp_1d matrix
// used by the kernels below).
//
// @param[in]  x           Evaluation coordinate
// @param[out] chebyshev_x Array of length Q_1D receiving T_0 .. T_{Q_1D-1}
template <int Q_1D>
inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
  chebyshev_x[0] = 1.0;
  // Guard the degree-1 seed so a degenerate Q_1D == 1 instantiation does not
  // write past the end of the caller's chebyshev_x[Q_1D] buffer.
  // Compile-time constant condition; no-op change for Q_1D >= 2.
  if (Q_1D > 1) chebyshev_x[1] = 2 * x;
  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
}
21
// Evaluate the derivatives of the first Q_1D scaled Chebyshev polynomials at x.
//
// Differentiating the recurrence T_i = 2 x T_{i-1} - T_{i-2} gives
//   T_i' = 2 x T_{i-1}' + 2 T_{i-1} - T_{i-2}'
// so alongside the derivative outputs we only need the two most recent
// polynomial values, tracked here in scalars instead of a rolling array.
//
// @param[in]  x            Evaluation coordinate
// @param[out] chebyshev_dx Array of length Q_1D receiving T_0' .. T_{Q_1D-1}'
template <int Q_1D>
inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
  CeedScalar value_two_back = 1.0;    // T_{i-2}, seeded with T_0
  CeedScalar value_one_back = 2 * x;  // T_{i-1}, seeded with scaled T_1

  chebyshev_dx[0] = 0.0;
  chebyshev_dx[1] = 2.0;
  for (CeedInt i = 2; i < Q_1D; i++) {
    chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * value_one_back - chebyshev_dx[i - 2];
    // Advance the polynomial-value history for the next iteration
    const CeedScalar value_current = 2 * x * value_one_back - value_two_back;

    value_two_back = value_one_back;
    value_one_back = value_current;
  }
}
35
36 //------------------------------------------------------------------------------
37 // Tensor Basis Kernels AtPoints
38 //------------------------------------------------------------------------------
39
40 //------------------------------------------------------------------------------
41 // Interp
42 //------------------------------------------------------------------------------
// Interpolate element nodal values to arbitrary points.
//
// Two phases per (element, component):
//   1. Contract the nodal values against the Q_1D x P_1D matrix
//      chebyshev_interp_1d one dimension at a time, ping-ponging between two
//      shared buffers, to produce Chebyshev expansion coefficients in shared
//      memory.
//   2. One thread per point: evaluate the 1D Chebyshev polynomials at the
//      point's coordinate in each dimension and contract the coefficients
//      down to a single value written to v.
//
// Launch layout: grid-strided loop over elements (blockIdx.x); all threads of
// the block (threadIdx.x) cooperate on the contractions and points.
// points_per_elem is not read here: all BASIS_NUM_PTS slots per element are
// evaluated, including padding points — presumably their coords are
// valid/ignorable; NOTE(review): confirm against the caller.
// BASIS_* and POINTS_BUFF_LEN are compile-time defines (assumed injected when
// this header is JIT-compiled — confirm).
extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [Q x P matrix | ping buffer | pong buffer | Chebyshev coeffs]
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch: Chebyshev values for one coordinate, plus two
  // ping-pong buffers for the per-point contraction
  CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperative load of the 1D node-to-Chebyshev matrix; the __syncthreads()
  // at the top of the first contraction pass below covers this write
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P = BASIS_P_1D;
  const CeedInt Q = BASIS_Q_1D;
  const CeedInt u_stride = BASIS_NUM_NODES;  // per-element stride in u (nodes)
  const CeedInt v_stride = BASIS_NUM_PTS;    // per-element stride in v (points)
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt u_size = BASIS_NUM_NODES;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
      // The tensor is viewed as (pre x P x post); each pass contracts the
      // middle extent P down to Q and folds it into post
      CeedInt pre = u_size;
      CeedInt post = 1;

      // Map to coefficients
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();
        // Update buffers used: first pass reads cur_u, last writes the
        // coefficient buffer, intermediate passes ping-pong
        pre /= P;
        const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt writeLen = pre * post * Q;

        // Contract along middle index
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c = k % post;            // already-contracted (fast) index
          const CeedInt j = (k / post) % Q;      // output index of this pass
          const CeedInt a = k / (post * Q);      // not-yet-contracted (slow) index
          CeedScalar v_k = 0;

          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
          out[k] = v_k;
        }
        post *= Q;
      }

      // Map to point: each thread handles whole points independently using its
      // local ping-pong buffers
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        pre = BASIS_NUM_QPTS;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Update buffers used
          pre /= Q;
          const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
          CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);

          // Build Chebyshev polynomial values at this point's d-th coordinate
          // (coords shares v's point layout, with dimension as component stride)
          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);

          // Contract along middle index
          for (CeedInt a = 0; a < pre; a++) {
            for (CeedInt c = 0; c < post; c++) {
              CeedScalar v_k = 0;

              for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
              out[a * post + c] = v_k;
            }
          }
          post *= 1;  // contraction fully collapses this dimension at a point, so post stays 1
        }
      }
    }
  }
}
125
// Transpose interpolation: accumulate point values back to element nodes.
//
// Two phases per (element, component):
//   1. One thread per (real) point: expand the point value by the Chebyshev
//      polynomial values in each dimension and atomically accumulate into the
//      shared Chebyshev coefficient buffer.
//   2. Cooperatively contract the coefficients against the transposed
//      Q_1D x P_1D matrix, dimension by dimension, accumulating (+=) into v.
//
// Launch layout: grid-strided loop over elements (blockIdx.x); block threads
// (threadIdx.x) cooperate. Unlike the forward kernel, padding points are
// skipped via points_per_elem.
extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [Q x P matrix | ping buffer | pong buffer | Chebyshev coeffs]
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch for the per-point expansion
  CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperative load of the 1D matrix; covered by the __syncthreads() before
  // the first coefficient-contraction pass below
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P = BASIS_P_1D;
  const CeedInt Q = BASIS_Q_1D;
  const CeedInt u_stride = BASIS_NUM_PTS;    // per-element stride in u (points)
  const CeedInt v_stride = BASIS_NUM_NODES;  // per-element stride in v (nodes)
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
  const CeedInt u_size = BASIS_NUM_PTS;  // note: currently unused below

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
      CeedInt pre = 1;
      CeedInt post = 1;

      // Clear Chebyshev coeffs before accumulation.
      // NOTE(review): for BASIS_DIM == 1 with multiple components, the
      // previous component's final pass reads s_chebyshev_coeffs and the only
      // barrier between that read and this clear is the one at the top of the
      // point-scatter loop below — verify the ordering is sufficient.
      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
        s_chebyshev_coeffs[k] = 0.0;
      }

      // Map from point: each real point expands its value into Q^dim
      // coefficient contributions
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        if (p >= points_per_elem[elem]) continue;  // skip padding points
        pre = 1;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Update buffers used: first pass reads the point value, last pass
          // scatters into the shared coefficient buffer
          pre /= 1;
          const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
          CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);

          // Build Chebyshev polynomial values at this point's d-th coordinate
          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);

          // Contract along middle index
          for (CeedInt a = 0; a < pre; a++) {
            for (CeedInt c = 0; c < post; c++) {
              if (d == BASIS_DIM - 1) {
                // Final pass: atomic accumulate into shared coeffs; the
                // (j + p) % Q rotation staggers each thread's sweep start,
                // presumably to reduce atomic collisions between threads
                for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
              } else {
                for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
              }
            }
          }
          post *= Q;
        }
      }

      // Map from coefficients back to nodes, contracting with the transposed
      // interpolation matrix one dimension at a time
      pre = BASIS_NUM_QPTS;
      post = 1;
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();
        // Update buffers used
        pre /= Q;
        const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt writeLen = pre * post * P;

        // Contract along middle index
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c = k % post;
          const CeedInt j = (k / post) % P;
          const CeedInt a = k / (post * P);
          CeedScalar v_k = 0;

          // Transposed access: [j + b * P] instead of [j * P + b]
          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
          // Final pass accumulates into v (transpose application adds into output)
          if (d == BASIS_DIM - 1) out[k] += v_k;
          else out[k] = v_k;
        }
        post *= P;
      }
    }
  }
}
218
219 //------------------------------------------------------------------------------
220 // Grad
221 //------------------------------------------------------------------------------
// Evaluate gradients of element nodal values at arbitrary points.
//
// Same structure as InterpAtPoints: nodal values are first mapped to a shared
// Chebyshev coefficient buffer, then each point contracts the coefficients
// once per output direction dim_1. For the pass whose contraction dimension
// dim_2 equals dim_1 the Chebyshev *derivative* values are used; otherwise the
// polynomial values are used.
//
// Launch layout: grid-strided loop over elements (blockIdx.x); block threads
// (threadIdx.x) cooperate on contractions and points. points_per_elem is not
// read here (all BASIS_NUM_PTS slots are evaluated).
extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [Q x P matrix | ping buffer | pong buffer | Chebyshev coeffs]
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch for the per-point contraction
  CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperative load of the 1D matrix; covered by the __syncthreads() at the
  // top of the first contraction pass below
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P = BASIS_P_1D;
  const CeedInt Q = BASIS_Q_1D;
  const CeedInt u_stride = BASIS_NUM_NODES;  // per-element stride in u (nodes)
  const CeedInt v_stride = BASIS_NUM_PTS;    // per-element stride in v (points)
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt u_size = BASIS_NUM_NODES;
  const CeedInt u_dim_stride = 0;  // note: unused here (u has no dimension axis on input)
  const CeedInt v_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;  // output direction stride

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
      CeedInt pre = u_size;
      CeedInt post = 1;

      // Map to coefficients (identical to InterpAtPoints phase 1)
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();
        // Update buffers used
        pre /= P;
        const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt writeLen = pre * post * Q;

        // Contract along middle index
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c = k % post;
          const CeedInt j = (k / post) % Q;
          const CeedInt a = k / (post * Q);
          CeedScalar v_k = 0;

          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
          out[k] = v_k;
        }
        post *= Q;
      }

      // Map to point: one output per derivative direction dim_1
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
          CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];

          pre = BASIS_NUM_QPTS;
          post = 1;
          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
            // Update buffers used
            pre /= Q;
            const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
            CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev values: derivative along the output direction,
            // plain polynomial values along the other directions
            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                CeedScalar v_k = 0;

                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
                out[a * post + c] = v_k;
              }
            }
            post *= 1;  // contraction fully collapses this dimension at a point
          }
        }
      }
    }
  }
}
310
// Transpose gradient: accumulate per-direction point values back to nodes.
//
// Mirrors InterpTransposeAtPoints, but each real point scatters one
// contribution per input direction dim_1, using Chebyshev derivative values
// for the pass where the contraction dimension dim_2 equals dim_1 and plain
// polynomial values otherwise. All directions accumulate into the same shared
// coefficient buffer, which is then contracted back to nodes with the
// transposed matrix, accumulating (+=) into v.
//
// Launch layout: grid-strided loop over elements (blockIdx.x); block threads
// (threadIdx.x) cooperate. Padding points are skipped via points_per_elem.
extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                                 const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [Q x P matrix | ping buffer | pong buffer | Chebyshev coeffs]
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch for the per-point expansion
  CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperative load of the 1D matrix; covered by the __syncthreads() before
  // the first coefficient-contraction pass below
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P = BASIS_P_1D;
  const CeedInt Q = BASIS_Q_1D;
  const CeedInt u_stride = BASIS_NUM_PTS;    // per-element stride in u (points)
  const CeedInt v_stride = BASIS_NUM_NODES;  // per-element stride in v (nodes)
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
  const CeedInt u_size = BASIS_NUM_PTS;  // note: currently unused below
  const CeedInt u_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;  // input direction stride
  const CeedInt v_dim_stride = 0;  // note: unused here (v has no dimension axis on output)

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
      CeedInt pre = 1;
      CeedInt post = 1;

      // Clear Chebyshev coeffs before accumulation
      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
        s_chebyshev_coeffs[k] = 0.0;
      }

      // Map from point: each real point scatters one contribution per
      // derivative direction dim_1
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        if (p >= points_per_elem[elem]) continue;  // skip padding points
        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
          const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];

          pre = 1;
          post = 1;
          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
            // Update buffers used: first pass reads the point value, last pass
            // scatters into the shared coefficient buffer
            pre /= 1;
            const CeedScalar *in = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
            CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev values: derivative along the input direction,
            // plain polynomial values along the other directions
            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                if (dim_2 == BASIS_DIM - 1) {
                  // Final pass: atomic accumulate; the (j + p) % Q rotation
                  // staggers each thread's sweep start, presumably to reduce
                  // atomic collisions between threads
                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                } else {
                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                }
              }
            }
            post *= Q;
          }
        }
      }

      // Map from coefficients back to nodes with the transposed matrix
      pre = BASIS_NUM_QPTS;
      post = 1;
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();
        // Update buffers used
        pre /= Q;
        const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt writeLen = pre * post * P;

        // Contract along middle index
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c = k % post;
          const CeedInt j = (k / post) % P;
          const CeedInt a = k / (post * P);
          CeedScalar v_k = 0;

          // Transposed access: [j + b * P] instead of [j * P + b]
          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
          // Final pass accumulates into v (transpose application adds into output)
          if (d == BASIS_DIM - 1) out[k] += v_k;
          else out[k] = v_k;
        }
        post *= P;
      }
    }
  }
}
409