1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis templates
10 #include <ceed/types.h>
11
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D>
ContractX1d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)20 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21 __syncthreads();
22 data.slice[data.t_id_x] = *U;
23 __syncthreads();
24 *V = 0.0;
25 if (data.t_id_x < Q_1D) {
26 for (CeedInt i = 0; i < P_1D; i++) {
27 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction
28 }
29 }
30 }
31
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
35 template <int NUM_COMP, int P_1D, int Q_1D>
ContractTransposeX1d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)36 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37 __syncthreads();
38 data.slice[data.t_id_x] = *U;
39 __syncthreads();
40 *V = 0.0;
41 if (data.t_id_x < P_1D) {
42 for (CeedInt i = 0; i < Q_1D; i++) {
43 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction
44 }
45 }
46 }
47
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
Interp1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)52 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
53 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
55 }
56 }
57
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTranspose1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)62 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
63 CeedScalar *__restrict__ r_V) {
64 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
66 }
67 }
68
69 //------------------------------------------------------------------------------
70 // 1D interpolate to quadrature points, nodes and quadrature points collocated
71 //------------------------------------------------------------------------------
72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpCollocatedNodes1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)73 inline __device__ void InterpCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
74 CeedScalar *__restrict__ r_V) {
75 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76 r_V[comp] = r_U[comp];
77 }
78 }
79
80 //------------------------------------------------------------------------------
81 // 1D interpolate transpose, nodes and quadrature points collocated
82 //------------------------------------------------------------------------------
83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeCollocatedNodes1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)84 inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
85 CeedScalar *__restrict__ r_V) {
86 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87 r_V[comp] = r_U[comp];
88 }
89 }
90
91 //------------------------------------------------------------------------------
92 // 1D derivatives at quadrature points
93 //------------------------------------------------------------------------------
94 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
Grad1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)95 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
96 CeedScalar *__restrict__ r_V) {
97 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
98 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
99 }
100 }
101
102 //------------------------------------------------------------------------------
103 // 1D derivatives transpose
104 //------------------------------------------------------------------------------
105 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTranspose1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)106 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
107 CeedScalar *__restrict__ r_V) {
108 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
109 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
110 }
111 }
112
113 //------------------------------------------------------------------------------
114 // 1D quadrature weights
115 //------------------------------------------------------------------------------
116 template <int P_1D, int Q_1D>
Weight1d(SharedData_Hip & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)117 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
118 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
119 }
120
121 //------------------------------------------------------------------------------
122 // 2D
123 //------------------------------------------------------------------------------
124
125 //------------------------------------------------------------------------------
126 // 2D tensor contraction x
127 //------------------------------------------------------------------------------
128 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractX2d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)129 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
130 __syncthreads();
131 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
132 __syncthreads();
133 *V = 0.0;
134 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
135 for (CeedInt i = 0; i < P_1D; i++) {
136 *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
137 }
138 }
139 }
140
141 //------------------------------------------------------------------------------
142 // 2D tensor contract y
143 //------------------------------------------------------------------------------
144 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractY2d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)145 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
146 __syncthreads();
147 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
148 __syncthreads();
149 *V = 0.0;
150 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
151 for (CeedInt i = 0; i < P_1D; i++) {
152 *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
153 }
154 }
155 }
156
157 //------------------------------------------------------------------------------
158 // 2D transpose tensor contract y
159 //------------------------------------------------------------------------------
160 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeY2d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)161 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
162 __syncthreads();
163 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
164 __syncthreads();
165 *V = 0.0;
166 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
167 for (CeedInt i = 0; i < Q_1D; i++) {
168 *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
169 }
170 }
171 }
172
173 //------------------------------------------------------------------------------
174 // 2D transpose tensor contract x
175 //------------------------------------------------------------------------------
176 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeX2d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)177 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
178 __syncthreads();
179 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
180 __syncthreads();
181 *V = 0.0;
182 if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
183 for (CeedInt i = 0; i < Q_1D; i++) {
184 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
185 }
186 }
187 }
188
189 //------------------------------------------------------------------------------
190 // 2D transpose tensor contract and add x
191 //------------------------------------------------------------------------------
192 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddX2d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)193 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
194 __syncthreads();
195 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
196 __syncthreads();
197 if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
198 for (CeedInt i = 0; i < Q_1D; i++) {
199 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
200 }
201 }
202 }
203
204 //------------------------------------------------------------------------------
205 // 2D interpolate to quadrature points
206 //------------------------------------------------------------------------------
207 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensor2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)208 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
209 CeedScalar r_t[1];
210 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
211 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
212 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
213 }
214 }
215
216 //------------------------------------------------------------------------------
217 // 2D interpolate transpose
218 //------------------------------------------------------------------------------
219 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensor2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)220 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
221 CeedScalar *__restrict__ r_V) {
222 CeedScalar r_t[1];
223 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
224 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
225 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
226 }
227 }
228
229 //------------------------------------------------------------------------------
230 // 2D interpolate to quadrature points, nodes and quadrature points collocated
231 //------------------------------------------------------------------------------
232 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensorCollocatedNodes2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)233 inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
234 CeedScalar *__restrict__ r_V) {
235 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
236 r_V[comp] = r_U[comp];
237 }
238 }
239
240 //------------------------------------------------------------------------------
241 // 2D interpolate transpose, nodes and quadrature points collocated
242 //------------------------------------------------------------------------------
243 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensorCollocatedNodes2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)244 inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
245 CeedScalar *__restrict__ r_V) {
246 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
247 r_V[comp] = r_U[comp];
248 }
249 }
250
251 //------------------------------------------------------------------------------
252 // 2D derivatives at quadrature points
253 //------------------------------------------------------------------------------
254 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensor2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)255 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
256 CeedScalar *__restrict__ r_V) {
257 CeedScalar r_t[1];
258 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
259 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
260 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
261 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
262 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
263 }
264 }
265
266 //------------------------------------------------------------------------------
267 // 2D derivatives transpose
268 //------------------------------------------------------------------------------
269 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensor2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)270 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
271 CeedScalar *__restrict__ r_V) {
272 CeedScalar r_t[1];
273 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
274 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
275 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
276 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
277 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
278 }
279 }
280
281 //------------------------------------------------------------------------------
282 // 2D derivatives at quadrature points, nodes and quadrature points collocated
283 //------------------------------------------------------------------------------
284 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocatedNodes2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)285 inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
286 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
287 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
288 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
289 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
290 }
291 }
292
293 //------------------------------------------------------------------------------
294 // 2D derivatives transpose, nodes and quadrature points collocated
295 //------------------------------------------------------------------------------
296 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocatedNodes2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)297 inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
298 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
299 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
300 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
301 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
302 }
303 }
304
305 //------------------------------------------------------------------------------
306 // 2D quadrature weights
307 //------------------------------------------------------------------------------
308 template <int P_1D, int Q_1D>
WeightTensor2d(SharedData_Hip & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)309 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
310 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
311 }
312
313 //------------------------------------------------------------------------------
314 // 3D
315 //------------------------------------------------------------------------------
316
317 //------------------------------------------------------------------------------
318 // 3D tensor contract x
319 //------------------------------------------------------------------------------
320 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractX3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)321 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
322 CeedScalar r_B[P_1D];
323 for (CeedInt i = 0; i < P_1D; i++) {
324 r_B[i] = B[i + data.t_id_x * P_1D];
325 }
326
327 for (CeedInt k = 0; k < P_1D; k++) {
328 __syncthreads();
329 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
330 __syncthreads();
331 V[k] = 0.0;
332 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
333 for (CeedInt i = 0; i < P_1D; i++) {
334 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
335 }
336 }
337 }
338 }
339
340 //------------------------------------------------------------------------------
341 // 3D tensor contract y
342 //------------------------------------------------------------------------------
343 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractY3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)344 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
345 CeedScalar r_B[P_1D];
346 for (CeedInt i = 0; i < P_1D; i++) {
347 r_B[i] = B[i + data.t_id_y * P_1D];
348 }
349
350 for (CeedInt k = 0; k < P_1D; k++) {
351 __syncthreads();
352 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
353 __syncthreads();
354 V[k] = 0.0;
355 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
356 for (CeedInt i = 0; i < P_1D; i++) {
357 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
358 }
359 }
360 }
361 }
362
363 //------------------------------------------------------------------------------
364 // 3D tensor contract z
365 //------------------------------------------------------------------------------
366 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractZ3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)367 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
368 for (CeedInt k = 0; k < Q_1D; k++) {
369 V[k] = 0.0;
370 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
371 for (CeedInt i = 0; i < P_1D; i++) {
372 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction
373 }
374 }
375 }
376 }
377
378 //------------------------------------------------------------------------------
379 // 3D transpose tensor contract z
380 //------------------------------------------------------------------------------
381 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeZ3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)382 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
383 for (CeedInt k = 0; k < P_1D; k++) {
384 V[k] = 0.0;
385 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
386 for (CeedInt i = 0; i < Q_1D; i++) {
387 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction
388 }
389 }
390 }
391 }
392
393 //------------------------------------------------------------------------------
394 // 3D transpose tensor contract y
395 //------------------------------------------------------------------------------
396 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeY3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)397 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
398 CeedScalar r_B[Q_1D];
399 for (CeedInt i = 0; i < Q_1D; i++) {
400 r_B[i] = B[data.t_id_y + i * P_1D];
401 }
402
403 for (CeedInt k = 0; k < P_1D; k++) {
404 __syncthreads();
405 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
406 __syncthreads();
407 V[k] = 0.0;
408 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
409 for (CeedInt i = 0; i < Q_1D; i++) {
410 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
411 }
412 }
413 }
414 }
415
416 //------------------------------------------------------------------------------
417 // 3D transpose tensor contract y
418 //------------------------------------------------------------------------------
419 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddY3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)420 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
421 CeedScalar r_B[Q_1D];
422 for (CeedInt i = 0; i < Q_1D; i++) {
423 r_B[i] = B[data.t_id_y + i * P_1D];
424 }
425
426 for (CeedInt k = 0; k < P_1D; k++) {
427 __syncthreads();
428 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
429 __syncthreads();
430 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
431 for (CeedInt i = 0; i < Q_1D; i++) {
432 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
433 }
434 }
435 }
436 }
437
438 //------------------------------------------------------------------------------
439 // 3D transpose tensor contract x
440 //------------------------------------------------------------------------------
441 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeX3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)442 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
443 CeedScalar r_B[Q_1D];
444 for (CeedInt i = 0; i < Q_1D; i++) {
445 r_B[i] = B[data.t_id_x + i * P_1D];
446 }
447
448 for (CeedInt k = 0; k < P_1D; k++) {
449 __syncthreads();
450 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
451 __syncthreads();
452 V[k] = 0.0;
453 if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
454 for (CeedInt i = 0; i < Q_1D; i++) {
455 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
456 }
457 }
458 }
459 }
460
461 //------------------------------------------------------------------------------
462 // 3D transpose tensor contract add x
463 //------------------------------------------------------------------------------
464 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddX3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)465 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
466 CeedScalar r_B[Q_1D];
467 for (CeedInt i = 0; i < Q_1D; i++) {
468 r_B[i] = B[data.t_id_x + i * P_1D];
469 }
470
471 for (CeedInt k = 0; k < P_1D; k++) {
472 __syncthreads();
473 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
474 __syncthreads();
475 if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
476 for (CeedInt i = 0; i < Q_1D; i++) {
477 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
478 }
479 }
480 }
481 }
482
483 //------------------------------------------------------------------------------
484 // 3D interpolate to quadrature points
485 //------------------------------------------------------------------------------
486 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensor3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)487 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
488 CeedScalar r_t1[T_1D];
489 CeedScalar r_t2[T_1D];
490 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
491 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
492 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
493 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
494 }
495 }
496
497 //------------------------------------------------------------------------------
498 // 3D interpolate transpose
499 //------------------------------------------------------------------------------
500 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensor3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)501 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
502 CeedScalar *__restrict__ r_V) {
503 CeedScalar r_t1[T_1D];
504 CeedScalar r_t2[T_1D];
505 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
506 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
507 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
508 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
509 }
510 }
511
512 //------------------------------------------------------------------------------
513 // 3D interpolate to quadrature points, nodes and quadrature points collocated
514 //------------------------------------------------------------------------------
515 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensorCollocatedNodes3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)516 inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
517 CeedScalar *__restrict__ r_V) {
518 for (CeedInt i = 0; i < Q_1D; i++) {
519 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
520 r_V[i + comp * Q_1D] = r_U[i + comp * P_1D];
521 }
522 }
523 }
524
525 //------------------------------------------------------------------------------
526 // 3D interpolate transpose, nodes and quadrature points collocated
527 //------------------------------------------------------------------------------
528 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensorCollocatedNodes3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)529 inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
530 CeedScalar *__restrict__ r_V) {
531 for (CeedInt i = 0; i < Q_1D; i++) {
532 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
533 r_V[i + comp * P_1D] = r_U[i + comp * Q_1D];
534 }
535 }
536 }
537
538 //------------------------------------------------------------------------------
539 // 3D derivatives at quadrature points
540 //------------------------------------------------------------------------------
541 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensor3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)542 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
543 CeedScalar *__restrict__ r_V) {
544 CeedScalar r_t1[T_1D];
545 CeedScalar r_t2[T_1D];
546 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
547 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
548 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
549 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
550 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
551 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
552 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
553 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
554 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
555 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
556 }
557 }
558
559 //------------------------------------------------------------------------------
560 // 3D derivatives transpose
561 //------------------------------------------------------------------------------
562 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensor3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)563 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
564 CeedScalar *__restrict__ r_V) {
565 CeedScalar r_t1[T_1D];
566 CeedScalar r_t2[T_1D];
567 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
568 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
569 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
570 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
571 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
572 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
573 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
574 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
575 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
576 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
577 }
578 }
579
580 //------------------------------------------------------------------------------
581 // 3D derivatives at quadrature points
582 //------------------------------------------------------------------------------
583 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocated3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)584 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
585 CeedScalar *__restrict__ r_V) {
586 CeedScalar r_t1[T_1D];
587 CeedScalar r_t2[T_1D];
588 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
589 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
590 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
591 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
592 ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
593 ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
594 ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
595 }
596 }
597
598 //------------------------------------------------------------------------------
599 // 3D derivatives transpose
600 //------------------------------------------------------------------------------
601 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocated3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)602 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
603 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
604 CeedScalar r_t1[T_1D];
605 CeedScalar r_t2[T_1D];
606 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
607 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
608 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
609 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
610 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
611 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
612 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
613 }
614 }
615
616 //------------------------------------------------------------------------------
617 // 3D derivatives at quadrature points, nodes and quadrature points collocated
618 //------------------------------------------------------------------------------
619 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocatedNodes3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)620 inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
621 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
622 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
623 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
624 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
625 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
626 }
627 }
628
629 //------------------------------------------------------------------------------
630 // 3D derivatives transpose, nodes and quadrature points collocated
631 //------------------------------------------------------------------------------
632 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocatedNodes3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)633 inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
634 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
635 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
636 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
637 ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
638 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
639 }
640 }
641
642 //------------------------------------------------------------------------------
643 // 3D quadrature weights
644 //------------------------------------------------------------------------------
645 template <int P_1D, int Q_1D>
WeightTensor3d(SharedData_Hip & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)646 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
647 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
648 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
649 for (CeedInt q = 0; q < Q_1D; q++) {
650 w[q] = quad ? pw * q_weight_1d[q] : 0.0;
651 }
652 }
653