// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory tensor product basis templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Stage each thread's value *U into the shared slice, then contract along the
// x direction with basis matrix B (indexed B[i + q * P_1D]); result in *V.
// Threads outside the Q_1D x P_1D output range leave *V = 0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Stage each thread's value *U into the shared slice, then contract along the
// y direction with basis matrix B (indexed B[i + q * P_1D]); result in *V.
// Threads outside the Q_1D x Q_1D output range leave *V = 0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contraction along y: applies B^T (indexed B[p + i * P_1D]) to the
// staged values, mapping quadrature values back toward nodes; result in *V.
// Threads outside the Q_1D x P_1D output range leave *V = 0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose contraction along x: applies B^T (indexed B[p + i * P_1D]) to the
// staged values; result in *V. Threads outside P_1D x P_1D leave *V = 0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX2dFlattened but accumulates into *V instead of
// overwriting it (used to sum contributions from multiple derivative terms).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                        const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D pack/unpack quadrature values
//------------------------------------------------------------------------------
// Pack per-thread values from the T_1D x T_1D slice layout into a contiguous
// Q_1D * Q_1D ordering over the flattened thread index. Threads beyond
// Q_1D * Q_1D receive 0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  // Destination coordinates in the compact Q_1D x Q_1D ordering
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
  }
}

// Inverse of QPack2d: spread values from the contiguous Q_1D * Q_1D thread
// ordering back into the T_1D x T_1D slice layout. Threads outside the
// Q_1D x Q_1D tile receive 0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  // Source coordinates in the compact Q_1D x Q_1D ordering
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
  }
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// r_V[comp] = (B x B) r_U[comp] for each component, via successive x and y
// contractions. r_U/r_V hold NUM_COMP per-thread values; c_B is the 1D interp
// matrix. Packed orderings are restored on exit when P_1D/Q_1D differ from T_1D.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Recover 2D thread coordinates from the flattened thread index
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // Temporary between the x and y contractions

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  __syncthreads();
  // Repack input and output into their compact orderings
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// r_V[comp] = (B^T x B^T) r_U[comp] for each component: transpose y then
// transpose x contractions map quadrature values back to nodes.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Recover 2D thread coordinates from the flattened thread index
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // Temporary between the y and x contractions

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// With nodes collocated at quadrature points interpolation is the identity, so
// values are copied straight through; only the pack/unpack remapping is done.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                              CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_V[comp] = r_U[comp];  // Identity interpolation
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate transpose, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// Transpose of the collocated interpolation: identity copy with the inverse
// pack/unpack remapping.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                       CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_V[comp] = r_U[comp];  // Identity interpolation
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Computes both partial derivatives at quadrature points:
//   r_V[comp + 0*NUM_COMP] = (B x G) r_U[comp]  (d/dx)
//   r_V[comp + 1*NUM_COMP] = (G x B) r_U[comp]  (d/dy)
// c_B is the interpolation matrix, c_G the 1D derivative matrix.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // Temporary between contractions

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);  // Two derivative components per field component
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradTensor2dFlattened: sums the two derivative contributions
//   r_V[comp] = (B^T x G^T) r_U[comp + 0*NUM_COMP] + (G^T x B^T) r_U[comp + 1*NUM_COMP]
// using the add-variant x contraction to accumulate the second term.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // Temporary between contractions

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// Collocated gradient: interpolation is identity, so each derivative is a
// single 1D contraction with the collocated derivative matrix c_G.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);  // Two derivative components per field component
}

//------------------------------------------------------------------------------
// 2D derivatives transpose, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// Transpose of the collocated gradient: accumulate G^T applied along y and x
// into a single nodal value per component.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// *w = q_weight_1d[x] * q_weight_1d[y] for this thread's quadrature point;
// threads outside the Q_1D x Q_1D tile get 0.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Note: thread coordinates are derived with stride Q_1D (compact ordering)
  const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;

  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Stage each thread's value *U into the shared slice, then contract along the
// x direction with B (indexed B[i + q * P_1D]); result in *V. Threads outside
// the Q_1D x P_1D x P_1D output range leave *V = 0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// Contract along the y direction with B; result in *V. Threads outside the
// Q_1D x Q_1D x P_1D output range leave *V = 0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// Contract along the z direction with B; result in *V. Threads outside the
// Q_1D x Q_1D x Q_1D output range leave *V = 0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Transpose contraction along z: applies B^T (indexed B[p + i * P_1D]);
// result in *V. Threads outside Q_1D x Q_1D x P_1D leave *V = 0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add z
//------------------------------------------------------------------------------
// Same as ContractTransposeZ3dFlattened but accumulates into *V instead of
// overwriting it.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contraction along y: applies B^T; result in *V. Threads outside
// Q_1D x P_1D x P_1D leave *V = 0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3dFlattened but accumulates into *V instead of
// overwriting it.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose contraction along x: applies B^T; result in *V. Threads outside
// P_1D x P_1D x P_1D leave *V = 0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX3dFlattened but accumulates into *V instead of
// overwriting it (used to sum contributions from multiple derivative terms).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D pack/unpack quadrature values
//------------------------------------------------------------------------------
// Pack per-thread values from the T_1D^3 slice layout into a contiguous
// Q_1D^3 ordering over the flattened thread index. Threads beyond Q_1D^3
// receive 0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  // Destination coordinates in the compact Q_1D^3 ordering
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
  }
}

// Inverse of QPack3d: spread values from the contiguous Q_1D^3 thread ordering
// back into the T_1D^3 slice layout. Threads outside the Q_1D x Q_1D tile
// receive 0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  // Source coordinates in the compact Q_1D^3 ordering
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// r_V[comp] = (B x B x B) r_U[comp] for each component, via successive x, y,
// and z contractions. Packed orderings are restored on exit when P_1D/Q_1D
// differ from T_1D.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Recover 3D thread coordinates from the flattened thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // Temporaries between contractions

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// r_V[comp] = (B^T x B^T x B^T) r_U[comp]: transpose z, y, then x
// contractions map quadrature values back to nodes.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // Temporaries between contractions

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// With nodes collocated at quadrature points interpolation is the identity, so
// values are copied straight through; only the pack/unpack remapping is done.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                              CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_V[comp] = r_U[comp];  // Identity interpolation
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D interpolate transpose, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// Transpose of the collocated interpolation: identity copy with the inverse
// pack/unpack remapping.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                       CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_V[comp] = r_U[comp];  // Identity interpolation
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

536 //------------------------------------------------------------------------------
537 // 3D derivatives at quadrature points
538 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Recover 3D thread coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_a[1], r_b[1];  // per-thread scratch passed between 1D contractions

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // dU/dx: differentiate in x, interpolate in y and z
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U + c, c_G, r_a);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_a, c_B, r_b);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_b, c_B, r_V + c + 0 * NUM_COMP);
    // dU/dy: differentiate in y, interpolate in x and z
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U + c, c_B, r_a);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_a, c_G, r_b);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_b, c_B, r_V + c + 1 * NUM_COMP);
    // dU/dz: differentiate in z, interpolate in x and y
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U + c, c_B, r_a);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_a, c_B, r_b);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_b, c_G, r_V + c + 2 * NUM_COMP);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
561
562 //------------------------------------------------------------------------------
563 // 3D derivatives transpose
564 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover 3D thread coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];

  // Fix: the thread indices are runtime values and must be passed as function arguments,
  // not template arguments (previous code used <NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>,
  // which is ill-formed C++ and does not match the Contract*3dFlattened signatures used elsewhere).
  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // x-derivative contribution: B^T_z B^T_y G^T_x
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);
    // y-derivative contribution: B^T_z G^T_y B^T_x (accumulated)
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
    // z-derivative contribution: G^T_z B^T_y B^T_x (accumulated)
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
586
587 //------------------------------------------------------------------------------
588 // 3D derivatives at quadrature points
589 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover 3D thread coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_a[1], r_b[1];  // per-thread scratch passed between 1D contractions

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Interpolate to quadrature points (B_z B_y B_x), result ends up in r_a
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U + c, c_B, r_a);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_a, c_B, r_b);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_b, c_B, r_a);
    // Apply the collocated differentiation operator in each direction at the quadrature points
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_a, c_G, r_V + c + 0 * NUM_COMP);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_a, c_G, r_V + c + 1 * NUM_COMP);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_a, c_G, r_V + c + 2 * NUM_COMP);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
609
610 //------------------------------------------------------------------------------
611 // 3D derivatives transpose
612 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover 3D thread coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];

  // Fix: this slot previously held an exact duplicate of GradTransposeTensor3dFlattened (a redefinition,
  // also with runtime thread indices wrongly used as template arguments). It is the transpose pair of
  // GradTensorCollocated3dFlattened: accumulate G^T contributions at the quadrature points, then apply
  // the transpose interpolation B^T_z B^T_y B^T_x back to the nodes.
  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Sum the three collocated directional derivative transposes at the quadrature points
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t1);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t1);
    // Transpose interpolation back to the nodes
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
634
635 //------------------------------------------------------------------------------
636 // 3D derivatives at quadrature points, nodes and quadrature points collocated
637 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover 3D thread coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);

  // Nodes and quadrature points are collocated: no interpolation step (c_B unused),
  // only the collocated differentiation operator in each direction.
  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Fix: pass &r_U[comp] (pointer), not r_U[comp] (scalar value) — the contraction
    // helpers take const CeedScalar *U, as at every other call site in this file.
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
653
654 //------------------------------------------------------------------------------
655 // 3D derivatives transpose, nodes and quadrature points collocated
656 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover 3D thread coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);

  // Collocated nodes/quadrature points: no transpose interpolation (c_B unused);
  // sum the three directional derivative transposes into r_V (z first, then y and x accumulated)
  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U + c + 2 * NUM_COMP, c_G, r_V + c);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U + c + 1 * NUM_COMP, c_G, r_V + c);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U + c, c_G, r_V + c);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
671
672 //------------------------------------------------------------------------------
673 // 3D quadrature weights
674 //------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Recover 3D quadrature indices from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);

  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
    // Tensor-product quadrature weight: product of the three 1D weights
    *w = q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z];
  } else {
    // Thread outside the Q_1D^3 quadrature grid contributes nothing
    *w = 0.0;
  }
}
681