1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 /// @file
9 /// Internal header for HIP tensor product basis with AtPoints evaluation
10 #include <ceed/types.h>
11
12 #include "hip-shared-basis-read-write-templates.h"
13 #include "hip-shared-basis-tensor-at-points-templates.h"
14 #include "hip-shared-basis-tensor-templates.h"
15
16 //------------------------------------------------------------------------------
17 // Tensor Basis Kernels AtPoints
18 //------------------------------------------------------------------------------
19
20 //------------------------------------------------------------------------------
21 // Interp
22 //------------------------------------------------------------------------------
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)23 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
24 void InterpAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
25 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
26 extern __shared__ CeedScalar slice[];
27
28 SharedData_Hip data;
29 data.t_id_x = threadIdx.x;
30 data.t_id_y = threadIdx.y;
31 data.t_id_z = threadIdx.z;
32 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
33 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
34
35 CeedScalar r_X[BASIS_DIM];
36 CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
37 CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
38 CeedScalar r_V[BASIS_NUM_COMP];
39
40 // load chebyshev_interp_1d into shared memory
41 __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
42 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
43 __syncthreads();
44
45 // Apply basis element by element
46 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
47 // Map to coefficients
48 if (BASIS_DIM == 1) {
49 ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
50 Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
51 } else if (BASIS_DIM == 2) {
52 ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
53 InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
54 } else if (BASIS_DIM == 3) {
55 ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
56 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
57 InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
58 }
59
60 // Map to points
61 const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
62
63 for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
64 const CeedInt p = i % BASIS_NUM_PTS;
65
66 ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
67 if (BASIS_DIM == 1) {
68 InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
69 } else if (BASIS_DIM == 2) {
70 InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
71 } else if (BASIS_DIM == 3) {
72 InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
73 }
74 WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
75 }
76 }
77 }
78
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)79 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
80 void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
81 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
82 extern __shared__ CeedScalar slice[];
83
84 SharedData_Hip data;
85 data.t_id_x = threadIdx.x;
86 data.t_id_y = threadIdx.y;
87 data.t_id_z = threadIdx.z;
88 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
89 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
90
91 CeedScalar r_X[BASIS_DIM];
92 CeedScalar r_U[BASIS_NUM_COMP];
93 CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
94 CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
95
96 // load chebyshev_interp_1d into shared memory
97 __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
98 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
99 __syncthreads();
100
101 // Apply basis element by element
102 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
103 // Clear register
104 for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
105
106 // Clear output vector
107 for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
108 if (BASIS_DIM == 1) {
109 WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
110 } else if (BASIS_DIM == 2) {
111 WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
112 } else if (BASIS_DIM == 3) {
113 WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
114 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
115 }
116
117 // Map from points
118 const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
119
120 for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
121 const CeedInt p = i % BASIS_NUM_PTS;
122
123 ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
124 ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
125 if (BASIS_DIM == 1) {
126 InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
127 } else if (BASIS_DIM == 2) {
128 InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
129 } else if (BASIS_DIM == 3) {
130 InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
131 }
132 }
133
134 // Map from coefficients
135 if (BASIS_DIM == 1) {
136 InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
137 SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
138 } else if (BASIS_DIM == 2) {
139 InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
140 SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
141 } else if (BASIS_DIM == 3) {
142 InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
143 SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
144 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
145 }
146 }
147 }
148
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)149 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
150 void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
151 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
152 extern __shared__ CeedScalar slice[];
153
154 SharedData_Hip data;
155 data.t_id_x = threadIdx.x;
156 data.t_id_y = threadIdx.y;
157 data.t_id_z = threadIdx.z;
158 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
159 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
160
161 CeedScalar r_X[BASIS_DIM];
162 CeedScalar r_U[BASIS_NUM_COMP];
163 CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
164 CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
165
166 // load chebyshev_interp_1d into shared memory
167 __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
168 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
169 __syncthreads();
170
171 // Apply basis element by element
172 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
173 // Clear register
174 for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
175
176 // Map from points
177 const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
178
179 for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
180 const CeedInt p = i % BASIS_NUM_PTS;
181
182 ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
183 ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
184 if (BASIS_DIM == 1) {
185 InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
186 } else if (BASIS_DIM == 2) {
187 InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
188 } else if (BASIS_DIM == 3) {
189 InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
190 }
191 }
192
193 // Map from coefficients
194 if (BASIS_DIM == 1) {
195 InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
196 SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
197 } else if (BASIS_DIM == 2) {
198 InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
199 SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
200 } else if (BASIS_DIM == 3) {
201 InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
202 SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
203 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
204 }
205 }
206 }
207
208 //------------------------------------------------------------------------------
209 // Grad
210 //------------------------------------------------------------------------------
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)211 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
212 void GradAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
213 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
214 extern __shared__ CeedScalar slice[];
215
216 SharedData_Hip data;
217 data.t_id_x = threadIdx.x;
218 data.t_id_y = threadIdx.y;
219 data.t_id_z = threadIdx.z;
220 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
221 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
222
223 CeedScalar r_X[BASIS_DIM];
224 CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
225 CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
226 CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
227
228 // load chebyshev_interp_1d into shared memory
229 __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
230 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
231 __syncthreads();
232
233 // Apply basis element by element
234 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
235 // Map to coefficients
236 if (BASIS_DIM == 1) {
237 ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
238 Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
239 } else if (BASIS_DIM == 2) {
240 ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
241 InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
242 } else if (BASIS_DIM == 3) {
243 ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
244 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
245 InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
246 }
247
248 // Map to points
249 const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
250
251 for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
252 const CeedInt p = i % BASIS_NUM_PTS;
253
254 ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
255 if (BASIS_DIM == 1) {
256 GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
257 } else if (BASIS_DIM == 2) {
258 GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
259 } else if (BASIS_DIM == 3) {
260 GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
261 }
262 WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
263 }
264 }
265 }
266
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)267 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
268 void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
269 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
270 extern __shared__ CeedScalar slice[];
271
272 SharedData_Hip data;
273 data.t_id_x = threadIdx.x;
274 data.t_id_y = threadIdx.y;
275 data.t_id_z = threadIdx.z;
276 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
277 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
278
279 CeedScalar r_X[BASIS_DIM];
280 CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
281 CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
282 CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
283
284 // load chebyshev_interp_1d into shared memory
285 __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
286 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
287 __syncthreads();
288
289 // Apply basis element by element
290 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
291 // Clear register
292 for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
293
294 // Clear output vector
295 for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
296 if (BASIS_DIM == 1) {
297 WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
298 } else if (BASIS_DIM == 2) {
299 WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
300 } else if (BASIS_DIM == 3) {
301 WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
302 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
303 }
304
305 // Map from points
306 const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
307
308 for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
309 const CeedInt p = i % BASIS_NUM_PTS;
310
311 ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
312 ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
313 r_U);
314 if (BASIS_DIM == 1) {
315 GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
316 } else if (BASIS_DIM == 2) {
317 GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
318 } else if (BASIS_DIM == 3) {
319 GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
320 }
321 }
322
323 // Map from coefficients
324 if (BASIS_DIM == 1) {
325 InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
326 SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
327 } else if (BASIS_DIM == 2) {
328 InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
329 SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
330 } else if (BASIS_DIM == 3) {
331 InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
332 SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
333 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
334 }
335 }
336 }
337
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)338 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
339 void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
340 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
341 extern __shared__ CeedScalar slice[];
342
343 SharedData_Hip data;
344 data.t_id_x = threadIdx.x;
345 data.t_id_y = threadIdx.y;
346 data.t_id_z = threadIdx.z;
347 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
348 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
349
350 CeedScalar r_X[BASIS_DIM];
351 CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
352 CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
353 CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
354
355 // load chebyshev_interp_1d into shared memory
356 __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
357 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
358 __syncthreads();
359
360 // Apply basis element by element
361 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
362 // Clear register
363 for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
364
365 // Map from points
366 const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
367
368 for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
369 const CeedInt p = i % BASIS_NUM_PTS;
370
371 ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
372 ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
373 r_U);
374 if (BASIS_DIM == 1) {
375 GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
376 } else if (BASIS_DIM == 2) {
377 GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
378 } else if (BASIS_DIM == 3) {
379 GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
380 }
381 }
382
383 // Map from coefficients
384 if (BASIS_DIM == 1) {
385 InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
386 SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
387 } else if (BASIS_DIM == 2) {
388 InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
389 SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
390 } else if (BASIS_DIM == 3) {
391 InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
392 SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
393 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
394 }
395 }
396 }
397