xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h (revision 4b6745b1d129af063418b6e3473d154613a9a699)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 2D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
// 2D tensor contract x
18 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    // Contract x direction: row t_id_x of B against the x-line of the slice
    for (CeedInt k = 0; k < P_1D; k++) sum += B[k + t_id_x * P_1D] * data.slice[k + t_id_y * T_1D];
  }
  *V = sum;  // threads outside the output range write 0
  __syncthreads();
}
32 
33 //------------------------------------------------------------------------------
34 // 2D tensor contract y
35 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    // Contract y direction: row t_id_y of B against the y-line of the slice
    for (CeedInt k = 0; k < P_1D; k++) sum += B[k + t_id_y * P_1D] * data.slice[t_id_x + k * T_1D];
  }
  *V = sum;  // threads outside the output range write 0
  __syncthreads();
}
49 
50 //------------------------------------------------------------------------------
51 // 2D transpose tensor contract y
52 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    // Contract y direction, B applied transposed (column t_id_y read pattern)
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_y + k * P_1D] * data.slice[t_id_x + k * T_1D];
  }
  *V = sum;  // threads outside the output range write 0
  __syncthreads();
}
66 
67 //------------------------------------------------------------------------------
68 // 2D transpose tensor contract x
69 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    // Contract x direction, B applied transposed (column t_id_x read pattern)
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_x + k * P_1D] * data.slice[k + t_id_y * T_1D];
  }
  *V = sum;  // threads outside the output range write 0
  __syncthreads();
}
83 
84 //------------------------------------------------------------------------------
85 // 2D transpose tensor contract and add x
86 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                        const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    // Contract x direction, B applied transposed (column t_id_x read pattern)
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_x + k * P_1D] * data.slice[k + t_id_y * T_1D];
  }
  *V += sum;  // add variant: accumulate into the existing output value
  __syncthreads();
}
99 
100 //------------------------------------------------------------------------------
101 // 2D pack/unpack quadrature values
102 //------------------------------------------------------------------------------
// Repack per-thread values from the T_1D-strided thread layout into a
// contiguous Q_1D x Q_1D layout (thread data.t_id_x owns packed entry data.t_id_x).
// Threads past Q_1D*Q_1D receive 0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  // Source indices of this thread's value in the packed (contiguous) numbering
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Write in the T_1D-strided layout, then read back in the packed layout
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
    __syncthreads();
  }
}
114 
// Inverse of QPack2d: expand values from the contiguous Q_1D x Q_1D layout
// back into the T_1D-strided thread layout. Threads outside the Q_1D x Q_1D
// grid receive 0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  // Indices of this thread's value in the packed (contiguous) numbering
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Write in the packed layout, then read back in the T_1D-strided layout
    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
    __syncthreads();
  }
}
126 
127 //------------------------------------------------------------------------------
128 // 2D interpolate to quadrature points
129 //------------------------------------------------------------------------------
// Interpolate nodal values r_U to quadrature values r_V: contract with c_B
// in x then y for each component. Thread indices are recovered from the
// flattened data.t_id_x assuming a T_1D x T_1D logical layout.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // register temporary between the x and y contractions

  // If P_1D != T_1D the input arrives packed; expand to the strided layout first
  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  // Restore the packed layouts (input unchanged for the caller; output packed)
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
144 
145 //------------------------------------------------------------------------------
146 // 2D interpolate transpose
147 //------------------------------------------------------------------------------
// Transpose interpolation: map quadrature values r_U back to nodal values
// r_V by applying c_B transposed in y then x for each component.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // register temporary between the y and x contractions

  // If Q_1D != T_1D the input arrives packed; expand to the strided layout first
  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  // Output is nodal (P_1D-sized); pack it if the layouts differ
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
161 
162 //------------------------------------------------------------------------------
163 // 2D derivatives at quadrature points
164 //------------------------------------------------------------------------------
// Gradient at quadrature points: for each component, r_V[comp + 0*NUM_COMP]
// holds the x-derivative (c_G in x, c_B in y) and r_V[comp + 1*NUM_COMP] the
// y-derivative (c_B in x, c_G in y).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // register temporary between the x and y contractions

  // If P_1D != T_1D the input arrives packed; expand to the strided layout first
  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
  // Restore packed layouts; output has 2 derivative blocks of NUM_COMP each
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
181 
182 //------------------------------------------------------------------------------
183 // 2D derivatives transpose
184 //------------------------------------------------------------------------------
// Transpose gradient: accumulate the adjoint of both derivative directions
// from r_U (2 blocks of NUM_COMP quadrature values) into nodal values r_V.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // register temporary between the y and x contractions

  // If Q_1D != T_1D the input arrives packed; expand to the strided layout first
  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Adjoint of the x-derivative: B^T in y, G^T in x (overwrites r_V[comp])
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    // Adjoint of the y-derivative: G^T in y, B^T in x (adds into r_V[comp])
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  // Output is nodal (P_1D-sized); pack it if the layouts differ
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
200 
201 //------------------------------------------------------------------------------
202 // 2D quadrature weights
203 //------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Recover the 2D quadrature indices from the flattened thread index
  const int q_x = data.t_id_x % Q_1D, q_y = data.t_id_x / Q_1D;

  // Tensor-product weight; threads outside the quadrature grid get 0
  *w = (q_x < Q_1D && q_y < Q_1D) ? q_weight_1d[q_x] * q_weight_1d[q_y] : 0.0;
}
210 
211 //------------------------------------------------------------------------------
212 // 3D
213 //------------------------------------------------------------------------------
214 
215 //------------------------------------------------------------------------------
216 // 3D tensor contract x
217 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    // Contract x direction: row t_id_x of B against the x-line of the slice
    for (CeedInt k = 0; k < P_1D; k++) sum += B[k + t_id_x * P_1D] * data.slice[k + t_id_y * T_1D + t_id_z * T_1D * T_1D];
  }
  *V = sum;  // threads outside the output range write 0
  __syncthreads();
}
231 
232 //------------------------------------------------------------------------------
233 // 3D tensor contract y
234 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    // Contract y direction: row t_id_y of B against the y-line of the slice
    for (CeedInt k = 0; k < P_1D; k++) sum += B[k + t_id_y * P_1D] * data.slice[t_id_x + k * T_1D + t_id_z * T_1D * T_1D];
  }
  *V = sum;  // threads outside the output range write 0
  __syncthreads();
}
248 
249 //------------------------------------------------------------------------------
250 // 3D tensor contract z
251 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
    // Contract z direction: row t_id_z of B against the z-line of the slice
    for (CeedInt k = 0; k < P_1D; k++) sum += B[k + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + k * T_1D * T_1D];
  }
  *V = sum;  // threads outside the output range write 0
  __syncthreads();
}
265 
266 //------------------------------------------------------------------------------
// 3D transpose tensor contract z
268 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    // Contract z direction, B applied transposed (column t_id_z read pattern)
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_z + k * P_1D] * data.slice[t_id_x + t_id_y * T_1D + k * T_1D * T_1D];
  }
  *V = sum;  // threads outside the output range write 0
  __syncthreads();
}
282 
283 //------------------------------------------------------------------------------
// 3D transpose tensor contract and add z
285 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    // Contract z direction, B applied transposed (column t_id_z read pattern)
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_z + k * P_1D] * data.slice[t_id_x + t_id_y * T_1D + k * T_1D * T_1D];
  }
  *V += sum;  // add variant: accumulate into the existing output value
  __syncthreads();
}
298 
299 //------------------------------------------------------------------------------
300 // 3D transpose tensor contract y
301 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    // Contract y direction, B applied transposed (column t_id_y read pattern)
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_y + k * P_1D] * data.slice[t_id_x + k * T_1D + t_id_z * T_1D * T_1D];
  }
  *V = sum;  // threads outside the output range write 0
  __syncthreads();
}
315 
316 //------------------------------------------------------------------------------
// 3D transpose tensor contract and add y
318 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    // Contract y direction, B applied transposed (column t_id_y read pattern)
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_y + k * P_1D] * data.slice[t_id_x + k * T_1D + t_id_z * T_1D * T_1D];
  }
  *V += sum;  // add variant: accumulate into the existing output value
  __syncthreads();
}
331 
332 //------------------------------------------------------------------------------
333 // 3D transpose tensor contract x
334 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    // Contract x direction, B applied transposed (column t_id_x read pattern)
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_x + k * P_1D] * data.slice[k + t_id_y * T_1D + t_id_z * T_1D * T_1D];
  }
  *V = sum;  // threads outside the output range write 0
  __syncthreads();
}
348 
349 //------------------------------------------------------------------------------
350 // 3D transpose tensor contract add x
351 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so the full slice is visible for the contraction
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    // Contract x direction, B applied transposed (column t_id_x read pattern)
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_x + k * P_1D] * data.slice[k + t_id_y * T_1D + t_id_z * T_1D * T_1D];
  }
  *V += sum;  // add variant: accumulate into the existing output value
  __syncthreads();
}
364 
365 //------------------------------------------------------------------------------
366 // 3D pack/unpack quadrature values
367 //------------------------------------------------------------------------------
// Repack per-thread values from the T_1D-strided thread layout into a
// contiguous Q_1D^3 layout (thread data.t_id_x owns packed entry data.t_id_x).
// Threads past Q_1D^3 receive 0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  // Source indices of this thread's value in the packed (contiguous) numbering
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // NOTE(review): the write guard does not check t_id_z < Q_1D; writes with
    // t_id_z >= Q_1D land in z-planes that the read below (new_t_id_z < Q_1D)
    // never touches — presumably intentional, but worth confirming.
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
    __syncthreads();
  }
}
379 
// Inverse of QPack3d: expand values from the contiguous Q_1D^3 layout back
// into the T_1D-strided thread layout. Threads outside the Q_1D x Q_1D x-y
// footprint receive 0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  // Indices of this thread's value in the packed (contiguous) numbering
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Write in the packed layout, then read back in the T_1D-strided layout
    if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
    __syncthreads();
  }
}
391 
392 //------------------------------------------------------------------------------
393 // 3D interpolate to quadrature points
394 //------------------------------------------------------------------------------
// Interpolate nodal values r_U to quadrature values r_V: contract with c_B
// in x, y, then z for each component. Thread indices are recovered from the
// flattened data.t_id_x assuming a T_1D^3 logical layout.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // register temporaries between contractions

  // If P_1D != T_1D the input arrives packed; expand to the strided layout first
  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  // Restore the packed layouts (input unchanged for the caller; output packed)
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
410 
411 //------------------------------------------------------------------------------
412 // 3D interpolate transpose
413 //------------------------------------------------------------------------------
// Transpose interpolation: map quadrature values r_U back to nodal values
// r_V by applying c_B transposed in z, y, then x for each component.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // register temporaries between contractions

  // If Q_1D != T_1D the input arrives packed; expand to the strided layout first
  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  // Output is nodal (P_1D-sized); pack it if the layouts differ
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
428 
429 //------------------------------------------------------------------------------
430 // 3D derivatives at quadrature points
431 //------------------------------------------------------------------------------
// Gradient at quadrature points: for each component, r_V[comp + dim*NUM_COMP]
// holds the dim-derivative, computed by substituting c_G for c_B in the
// corresponding direction (dim 0 = x, 1 = y, 2 = z).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // register temporaries between contractions

  // If P_1D != T_1D the input arrives packed; expand to the strided layout first
  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // x-derivative: G in x, B in y and z
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
    // y-derivative: G in y, B in x and z
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
    // z-derivative: G in z, B in x and y
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  // Restore packed layouts; output has 3 derivative blocks of NUM_COMP each
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
453 
454 //------------------------------------------------------------------------------
455 // 3D derivatives transpose
456 //------------------------------------------------------------------------------
// Transpose gradient: accumulate the adjoint of all three derivative
// directions from r_U (3 blocks of NUM_COMP quadrature values) into nodal
// values r_V. The dim-th block applies c_G transposed in direction dim and
// c_B transposed in the other two; the first chain overwrites r_V[comp] and
// the remaining two accumulate via the Add x-contraction.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // register temporaries between contractions

  // If Q_1D != T_1D the input arrives packed; expand to the strided layout first
  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Fix: the thread indices are runtime values and must be passed as function
    // arguments, not template arguments (the contraction templates take exactly
    // <NUM_COMP, P_1D, Q_1D, T_1D>), matching every sibling function in this file.
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  // Output is nodal (P_1D-sized); pack it if the layouts differ
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
477 
478 //------------------------------------------------------------------------------
479 // 3D derivatives at quadrature points
480 //------------------------------------------------------------------------------
// Collocated gradient: first interpolate nodal values to quadrature points
// with c_B (x, y, z), then apply the collocated derivative matrix c_G
// independently in each direction to produce the three derivative blocks
// r_V[comp + dim * NUM_COMP].
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // register temporaries between contractions

  // If P_1D != T_1D the input arrives packed; expand to the strided layout first
  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Interpolate to quadrature points; the result ends up in r_t1
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    // Collocated derivatives: Q_1D -> Q_1D contractions with c_G per direction
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  // Restore packed layouts; output has 3 derivative blocks of NUM_COMP each
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
499 
500 //------------------------------------------------------------------------------
501 // 3D derivatives transpose
502 //------------------------------------------------------------------------------
// Transpose collocated gradient: accumulate the adjoint of the collocated
// derivative matrix c_G over all three directions at the quadrature points,
// then apply the transpose interpolation c_B (z, y, x) back to nodal values.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // register temporaries between contractions

  // If Q_1D != T_1D the input arrives packed; expand to the strided layout first
  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Adjoint collocated derivatives: z-block overwrites r_t2, y and x blocks add
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
    // Transpose interpolation back to nodal values
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  // Output is nodal (P_1D-sized); pack it if the layouts differ
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
520 
521 //------------------------------------------------------------------------------
522 // 3D quadrature weights
523 //------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Recover the 3D quadrature indices from the flattened thread index
  const CeedInt q_x = data.t_id_x % Q_1D, q_y = (data.t_id_x / Q_1D) % Q_1D, q_z = data.t_id_x / (Q_1D * Q_1D);

  // Tensor-product weight; threads outside the quadrature grid get 0
  *w = (q_x < Q_1D && q_y < Q_1D && q_z < Q_1D) ? q_weight_1d[q_x] * q_weight_1d[q_y] * q_weight_1d[q_z] : 0.0;
}
530