1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 /// @file
9 /// Internal header for CUDA tensor product basis with AtPoints evaluation
10 #include <ceed/types.h>
11
12 #include "cuda-shared-basis-read-write-templates.h"
13 #include "cuda-shared-basis-tensor-at-points-templates.h"
14 #include "cuda-shared-basis-tensor-templates.h"
15
16 //------------------------------------------------------------------------------
17 // Tensor Basis Kernels AtPoints
18 //------------------------------------------------------------------------------
19
20 //------------------------------------------------------------------------------
21 // Interp
22 //------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Interpolate nodal coefficients to arbitrary evaluation points.
//
// Launch layout: threadIdx.x/y span the 1D tensor directions of one element;
// threadIdx.z selects an element "slab" within the block, and elements are
// distributed grid-stride style over blockIdx.x * blockDim.z + threadIdx.z.
// Dynamic shared memory must hold blockDim.z slices of
// BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1) scalars.
//
// num_elem        - number of elements to process
// c_B             - interp_1d basis matrix (BASIS_P_1D x BASIS_Q_1D), global memory
// points_per_elem - points per element (unused here; kept so all AtPoints
//                   kernels share one launch signature)
// d_X             - reference coordinates of the evaluation points
// d_U             - input nodal values, strided per element
// d_V             - output values at the points
extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
                                          const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Per-thread bookkeeping plus a pointer to this z-slab's shared scratch slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];                                        // coordinates of one point
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];  // nodal input (z-line in registers for 3D)
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // intermediate coefficients
  CeedScalar r_V[BASIS_NUM_COMP];                                   // interpolated values at one point

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();  // s_B must be fully populated before any thread uses it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Map to coefficients (nodal values -> coefficient basis via interp_1d)
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    }

    // Map to points
    // Round the trip count up to a multiple of the x-y thread count so every
    // thread in the slab executes the same number of iterations; padded
    // iterations re-process point p = i % BASIS_NUM_PTS, and rewriting the same
    // value is idempotent so the extra trips are harmless.
    // NOTE(review): presumably required because the AtPoints templates
    // synchronize internally — confirm in cuda-shared-basis-tensor-at-points-templates.h
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;  // wrapped point index for the global reads/writes

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      if (BASIS_DIM == 1) {
        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 2) {
        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 3) {
        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      }
      WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
    }
  }
}
77
//------------------------------------------------------------------------------
// Transpose interpolation: values at arbitrary points -> nodal coefficients.
// Clears the element output in d_V, then accumulates into it.
//
// Launch layout matches InterpAtPoints: threadIdx.x/y span the tensor
// directions, threadIdx.z is the element slab, elements stride over
// blockIdx.x * blockDim.z + threadIdx.z.  Dynamic shared memory must hold
// blockDim.z slices of BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1) scalars.
//
// num_elem        - number of elements to process
// c_B             - interp_1d basis matrix (BASIS_P_1D x BASIS_Q_1D)
// points_per_elem - actual number of points in each element; bounds the d_U reads
// d_X             - reference coordinates of the points
// d_U             - input values at the points
// d_V             - output nodal values (zeroed here, then summed into)
extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
                                                   const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Per-thread bookkeeping plus a pointer to this z-slab's shared scratch slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];       // coordinates of one point
  CeedScalar r_U[BASIS_NUM_COMP];  // input values at one point
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // coefficient accumulator
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // nodal output

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();  // s_B must be fully populated before any thread uses it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear register
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Clear output vector (write zeros so the SumElementStrided below
    // accumulates into a clean E-vector; the Add variant skips this step)
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
    if (BASIS_DIM == 1) {
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }

    // Map from points
    // Trip count rounded up to a multiple of the x-y thread count so every
    // thread makes the same number of iterations.
    // NOTE(review): presumably required because the AtPoints templates
    // synchronize internally — confirm in cuda-shared-basis-tensor-at-points-templates.h
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;  // wrapped index, used only for the coordinate read

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // Note the UNWRAPPED index i with bound points_per_elem[elem]: padded or
      // out-of-range iterations load zeros into r_U, so they add nothing to the
      // accumulator (wrapping to p here would double-count points)
      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
      if (BASIS_DIM == 1) {
        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }

    // Map from coefficients back to nodes and sum into the cleared output
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}
147
//------------------------------------------------------------------------------
// Transpose interpolation with accumulation: like InterpTransposeAtPoints, but
// does NOT clear d_V first — results are summed into the existing output.
//
// Launch layout matches InterpAtPoints: threadIdx.x/y span the tensor
// directions, threadIdx.z is the element slab, elements stride over
// blockIdx.x * blockDim.z + threadIdx.z.  Dynamic shared memory must hold
// blockDim.z slices of BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1) scalars.
//
// num_elem        - number of elements to process
// c_B             - interp_1d basis matrix (BASIS_P_1D x BASIS_Q_1D)
// points_per_elem - actual number of points in each element; bounds the d_U reads
// d_X             - reference coordinates of the points
// d_U             - input values at the points
// d_V             - output nodal values (accumulated into, not cleared)
extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
                                                      const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
                                                      const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Per-thread bookkeeping plus a pointer to this z-slab's shared scratch slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];       // coordinates of one point
  CeedScalar r_U[BASIS_NUM_COMP];  // input values at one point
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // coefficient accumulator
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // nodal output

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();  // s_B must be fully populated before any thread uses it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear register (no output clear here — this is the accumulating variant)
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Map from points
    // Trip count rounded up to a multiple of the x-y thread count so every
    // thread makes the same number of iterations.
    // NOTE(review): presumably required because the AtPoints templates
    // synchronize internally — confirm in cuda-shared-basis-tensor-at-points-templates.h
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;  // wrapped index, used only for the coordinate read

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // UNWRAPPED index i with bound points_per_elem[elem]: padded or
      // out-of-range iterations load zeros into r_U and contribute nothing
      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
      if (BASIS_DIM == 1) {
        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }

    // Map from coefficients back to nodes and sum into the existing output
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}
206
207 //------------------------------------------------------------------------------
208 // Grad
209 //------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Evaluate gradients of nodal coefficients at arbitrary points.
// Output holds BASIS_DIM derivative components per basis component.
//
// Launch layout matches InterpAtPoints: threadIdx.x/y span the tensor
// directions, threadIdx.z is the element slab, elements stride over
// blockIdx.x * blockDim.z + threadIdx.z.  Dynamic shared memory must hold
// blockDim.z slices of BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1) scalars.
//
// num_elem        - number of elements to process
// c_B             - interp_1d basis matrix (BASIS_P_1D x BASIS_Q_1D); the
//                   differentiation itself happens in the GradAtPoints*d templates
// points_per_elem - points per element (unused here; shared launch signature)
// d_X             - reference coordinates of the evaluation points
// d_U             - input nodal values, strided per element
// d_V             - output gradients at the points (BASIS_NUM_COMP * BASIS_DIM per point)
extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
                                        const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Per-thread bookkeeping plus a pointer to this z-slab's shared scratch slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];                                          // coordinates of one point
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];  // nodal input
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // intermediate coefficients
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];                         // gradient at one point

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();  // s_B must be fully populated before any thread uses it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Map to coefficients (interp only; derivatives are taken per point below)
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    }

    // Map to points
    // Trip count rounded up to a multiple of the x-y thread count so every
    // thread makes the same number of iterations; padded iterations re-process
    // p = i % BASIS_NUM_PTS, which is an idempotent rewrite.
    // NOTE(review): presumably required because the AtPoints templates
    // synchronize internally — confirm in cuda-shared-basis-tensor-at-points-templates.h
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;  // wrapped point index for the global reads/writes

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      if (BASIS_DIM == 1) {
        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 2) {
        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 3) {
        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      }
      WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
    }
  }
}
264
//------------------------------------------------------------------------------
// Transpose gradient: gradient values at arbitrary points -> nodal
// coefficients.  Clears the element output in d_V, then accumulates into it.
//
// Launch layout matches InterpAtPoints: threadIdx.x/y span the tensor
// directions, threadIdx.z is the element slab, elements stride over
// blockIdx.x * blockDim.z + threadIdx.z.  Dynamic shared memory must hold
// blockDim.z slices of BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1) scalars.
//
// num_elem        - number of elements to process
// c_B             - interp_1d basis matrix (BASIS_P_1D x BASIS_Q_1D); the
//                   transpose differentiation happens in GradTransposeAtPoints*d
// points_per_elem - actual number of points in each element; bounds the d_U reads
// d_X             - reference coordinates of the points
// d_U             - input gradients at the points (BASIS_NUM_COMP * BASIS_DIM per point)
// d_V             - output nodal values (zeroed here, then summed into)
extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
                                                 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Per-thread bookkeeping plus a pointer to this z-slab's shared scratch slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];                   // coordinates of one point
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];  // gradient input at one point
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // coefficient accumulator
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // nodal output

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();  // s_B must be fully populated before any thread uses it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear register
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Clear output vector (write zeros so the SumElementStrided below
    // accumulates into a clean E-vector; the Add variant skips this step)
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
    if (BASIS_DIM == 1) {
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }

    // Map from points
    // Trip count rounded up to a multiple of the x-y thread count so every
    // thread makes the same number of iterations.
    // NOTE(review): presumably required because the AtPoints templates
    // synchronize internally — confirm in cuda-shared-basis-tensor-at-points-templates.h
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;  // wrapped index, used only for the coordinate read

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // UNWRAPPED index i with bound points_per_elem[elem]: padded or
      // out-of-range iterations load zeros into r_U and contribute nothing
      // (wrapping to p here would double-count points)
      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
                                                           r_U);
      if (BASIS_DIM == 1) {
        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }

    // Map from coefficients back to nodes (interp transpose only — the
    // derivative part was already applied per point above) and sum into output
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}
335
//------------------------------------------------------------------------------
// Transpose gradient with accumulation: like GradTransposeAtPoints, but does
// NOT clear d_V first — results are summed into the existing output.
//
// Launch layout matches InterpAtPoints: threadIdx.x/y span the tensor
// directions, threadIdx.z is the element slab, elements stride over
// blockIdx.x * blockDim.z + threadIdx.z.  Dynamic shared memory must hold
// blockDim.z slices of BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1) scalars.
//
// num_elem        - number of elements to process
// c_B             - interp_1d basis matrix (BASIS_P_1D x BASIS_Q_1D); the
//                   transpose differentiation happens in GradTransposeAtPoints*d
// points_per_elem - actual number of points in each element; bounds the d_U reads
// d_X             - reference coordinates of the points
// d_U             - input gradients at the points (BASIS_NUM_COMP * BASIS_DIM per point)
// d_V             - output nodal values (accumulated into, not cleared)
extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
                                                    const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
                                                    const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Per-thread bookkeeping plus a pointer to this z-slab's shared scratch slice
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];                   // coordinates of one point
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];  // gradient input at one point
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // coefficient accumulator
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // nodal output

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();  // s_B must be fully populated before any thread uses it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear register (no output clear here — this is the accumulating variant)
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Map from points
    // Trip count rounded up to a multiple of the x-y thread count so every
    // thread makes the same number of iterations.
    // NOTE(review): presumably required because the AtPoints templates
    // synchronize internally — confirm in cuda-shared-basis-tensor-at-points-templates.h
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;  // wrapped index, used only for the coordinate read

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // UNWRAPPED index i with bound points_per_elem[elem]: padded or
      // out-of-range iterations load zeros into r_U and contribute nothing
      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
                                                           r_U);
      if (BASIS_DIM == 1) {
        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }

    // Map from coefficients back to nodes (interp transpose only — the
    // derivative part was already applied per point above) and sum into output
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}
395