xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h (revision c8e372f0bd7b7998d8acb114f0b7d6adab16132f)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 2D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
// 2D tensor contract x
18 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  // Stage this thread's input value in the shared slice, laid out T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  *V = 0.0;
  // Output threads map to (quad point in x, node in y); B is indexed B[p + q * P_1D]
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
32 
33 //------------------------------------------------------------------------------
34 // 2D tensor contract y
35 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  // Stage this thread's input value in the shared slice, laid out T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  *V = 0.0;
  // Output threads map to (quad x, quad y); B is indexed B[p + q * P_1D]
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
49 
50 //------------------------------------------------------------------------------
51 // 2D transpose tensor contract y
52 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value in the shared slice, laid out T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  *V = 0.0;
  // Transpose contraction: y goes from quad points back to nodes, so sum over Q_1D
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
66 
67 //------------------------------------------------------------------------------
68 // 2D transpose tensor contract x
69 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value in the shared slice, laid out T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  *V = 0.0;
  // Transpose contraction: x goes from quad points back to nodes, so sum over Q_1D
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
83 
84 //------------------------------------------------------------------------------
85 // 2D transpose tensor contract and add x
86 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                        const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value in the shared slice, laid out T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  // Unlike ContractTransposeX2dFlattened, *V is NOT zeroed: the result is
  // accumulated so callers can sum multiple derivative contributions
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
99 
100 //------------------------------------------------------------------------------
101 // 2D pack/unpack quadrature values
102 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  // Destination coordinates in the packed (contiguous Q_1D^2) ordering
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Stage the component at its tensor (t_id_x, t_id_y) slot in the T_1D x T_1D slice
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
    __syncthreads();
    // Re-read in packed order; threads beyond the Q_1D^2 range hold zero
    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
    __syncthreads();  // slice is reused for the next component
  }
}
114 
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  // Source coordinates in the packed (contiguous Q_1D^2) ordering
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Stage the packed component into the T_1D x T_1D slice
    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
    __syncthreads();
    // Re-read at the tensor (t_id_x, t_id_y) slot; out-of-range threads hold zero
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
    __syncthreads();  // slice is reused for the next component
  }
}
126 
127 //------------------------------------------------------------------------------
128 // 2D interpolate to quadrature points
129 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Recover 2D tensor coordinates from the flattened 1D thread index
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register between the x and y contractions

  // Unpack nodal values (P_1D ordering), apply B in x then y per component,
  // then repack the quadrature values (Q_1D ordering)
  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
143 
144 //------------------------------------------------------------------------------
145 // 2D interpolate transpose
146 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Recover 2D tensor coordinates from the flattened 1D thread index
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register between the y and x contractions

  // Unpack quadrature values (Q_1D ordering), apply B^T in y then x per
  // component, then repack the nodal values (P_1D ordering)
  QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
160 
161 //------------------------------------------------------------------------------
162 // 2D derivatives at quadrature points
163 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Recover 2D tensor coordinates from the flattened 1D thread index
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register between contraction passes

  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // dU/dx: derivative matrix G in x, interpolation B in y
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    // dU/dy: interpolation B in x, derivative matrix G in y
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
  // Two derivative components per field component are packed
  QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
179 
180 //------------------------------------------------------------------------------
181 // 2D derivatives transpose
182 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover 2D tensor coordinates from the flattened 1D thread index
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register between contraction passes

  QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Transpose of the dU/dx term: B^T in y, then G^T in x
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    // Transpose of the dU/dy term, accumulated into r_V[comp] via the Add variant
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
198 
199 //------------------------------------------------------------------------------
200 // 2D quadrature weights
201 //------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Recover (i, j) quadrature coordinates from the flat thread index
  const int i = data.t_id_x % Q_1D;
  const int j = data.t_id_x / Q_1D;

  // Threads outside the Q_1D^2 quadrature range contribute zero weight
  if (i < Q_1D && j < Q_1D) {
    *w = q_weight_1d[i] * q_weight_1d[j];
  } else {
    *w = 0.0;
  }
}
208 
209 //------------------------------------------------------------------------------
210 // 3D
211 //------------------------------------------------------------------------------
212 
213 //------------------------------------------------------------------------------
214 // 3D tensor contract x
215 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's value in the shared slice, laid out T_1D x T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  *V = 0.0;
  // Output threads map to (quad x, node y, node z); B is indexed B[p + q * P_1D]
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
229 
230 //------------------------------------------------------------------------------
231 // 3D tensor contract y
232 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's value in the shared slice, laid out T_1D x T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  *V = 0.0;
  // Output threads map to (quad x, quad y, node z); B is indexed B[p + q * P_1D]
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
246 
247 //------------------------------------------------------------------------------
248 // 3D tensor contract z
249 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's value in the shared slice, laid out T_1D x T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  *V = 0.0;
  // Output threads map to (quad x, quad y, quad z); B is indexed B[p + q * P_1D]
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
263 
264 //------------------------------------------------------------------------------
// 3D transpose tensor contract z
266 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's value in the shared slice, laid out T_1D x T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  *V = 0.0;
  // Transpose contraction: z goes from quad points back to nodes, so sum over Q_1D
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
280 
281 //------------------------------------------------------------------------------
// 3D transpose tensor contract and add z
283 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's value in the shared slice, laid out T_1D x T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  // Unlike ContractTransposeZ3dFlattened, *V is NOT zeroed: the result is
  // accumulated so callers can sum multiple derivative contributions
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
296 
297 //------------------------------------------------------------------------------
298 // 3D transpose tensor contract y
299 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's value in the shared slice, laid out T_1D x T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  *V = 0.0;
  // Transpose contraction: y goes from quad points back to nodes, so sum over Q_1D
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
313 
314 //------------------------------------------------------------------------------
// 3D transpose tensor contract and add y
316 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's value in the shared slice, laid out T_1D x T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  // Unlike ContractTransposeY3dFlattened, *V is NOT zeroed: the result is
  // accumulated so callers can sum multiple derivative contributions
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
329 
330 //------------------------------------------------------------------------------
331 // 3D transpose tensor contract x
332 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's value in the shared slice, laid out T_1D x T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  *V = 0.0;
  // Transpose contraction: x goes from quad points back to nodes, so sum over Q_1D
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
346 
347 //------------------------------------------------------------------------------
348 // 3D transpose tensor contract add x
349 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's value in the shared slice, laid out T_1D x T_1D x T_1D
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();  // all slice writes must land before any thread reads
  // Unlike ContractTransposeX3dFlattened, *V is NOT zeroed: the result is
  // accumulated so callers can sum multiple derivative contributions
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
  __syncthreads();  // keep slice intact until every thread is done reading
}
362 
363 //------------------------------------------------------------------------------
364 // 3D pack/unpack quadrature values
365 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  // Destination coordinates in the packed (contiguous Q_1D^3) ordering
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Guard all three tensor directions (z was previously unguarded)
    if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    // A 3D field holds Q_1D^3 values, so the packed range is Q_1D^3 (the old
    // Q_1D^2 bound made new_t_id_z always 0 and only packed the z = 0 plane)
    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
    __syncthreads();  // slice is reused for the next component
  }
}
377 
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  // Source coordinates in the packed (contiguous Q_1D^3) ordering
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // A 3D field holds Q_1D^3 values, so the packed range is Q_1D^3 (the old
    // Q_1D^2 bound made old_t_id_z always 0 and only staged the z = 0 plane)
    if (data.t_id_x < (Q_1D * Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    // Guard all three tensor directions so threads with t_id_z >= Q_1D hold
    // zero instead of reading unwritten slice entries
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
    __syncthreads();  // slice is reused for the next component
  }
}
389 
390 //------------------------------------------------------------------------------
391 // 3D interpolate to quadrature points
392 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Recover 3D tensor coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // scratch registers between contraction passes

  // Unpack nodal values (P_1D ordering), apply B in x, y, z per component,
  // then repack the quadrature values (Q_1D ordering)
  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
407 
408 //------------------------------------------------------------------------------
409 // 3D interpolate transpose
410 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Recover 3D tensor coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // scratch registers between contraction passes

  // Unpack quadrature values (Q_1D ordering), apply B^T in z, y, x per
  // component, then repack the nodal values (P_1D ordering)
  QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
425 
426 //------------------------------------------------------------------------------
427 // 3D derivatives at quadrature points
428 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Recover 3D tensor coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // scratch registers between contraction passes

  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // dU/dx: G in x, B in y, B in z
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
    // dU/dy: B in x, G in y, B in z
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
    // dU/dz: B in x, B in y, G in z
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  // Three derivative components per field component are packed
  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
449 
450 //------------------------------------------------------------------------------
451 // 3D derivatives transpose
452 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover 3D tensor coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // scratch registers between contraction passes

  // Fix: the contraction helpers take <NUM_COMP, P_1D, Q_1D, T_1D> as template
  // arguments and the thread indices as runtime arguments; the previous code
  // passed the runtime t_id_* values in the template argument lists
  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Transpose of the dU/dx term: B^T in z and y, G^T in x
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);
    // Transpose of the dU/dy term, accumulated into r_V[comp] via the Add variant
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
    // Transpose of the dU/dz term, accumulated into r_V[comp] via the Add variant
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
473 
474 //------------------------------------------------------------------------------
475 // 3D derivatives at quadrature points
476 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover 3D tensor coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // scratch registers between contraction passes

  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // First interpolate to quadrature points with B in x, y, z (result in r_t1)
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    // Then apply the collocated derivative matrix G (Q_1D is passed for both
    // P and Q template parameters) independently in each direction
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  // Three derivative components per field component are packed
  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
494 
495 //------------------------------------------------------------------------------
496 // 3D derivatives transpose
497 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover 3D tensor coordinates from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // scratch registers between contraction passes

  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Apply the transpose of the collocated derivative matrix G (Q_1D passed
    // for both P and Q), accumulating z, y, and x contributions into r_t2
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
    // Then transpose-interpolate back to the nodes with B^T in z, y, x
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
515 
516 //------------------------------------------------------------------------------
517 // 3D quadrature weights
518 //------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Recover (i, j, k) quadrature coordinates from the flat thread index
  const CeedInt i = data.t_id_x % Q_1D;
  const CeedInt j = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D;
  const CeedInt k = data.t_id_x / (Q_1D * Q_1D);

  // Threads outside the Q_1D^3 quadrature range contribute zero weight
  if (i < Q_1D && j < Q_1D && k < Q_1D) {
    *w = q_weight_1d[i] * q_weight_1d[j] * q_weight_1d[k];
  } else {
    *w = 0.0;
  }
}
525