xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h (revision 0ccda8ebe42db3fb90cdb724a58e4e5d2aedf1a1)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 2D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
// 2D tensor contract x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
20 inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
21                                             CeedScalar *V) {
22   __syncthreads();
23   data.slice[t_id_x + t_id_y * T_1D] = *U;
24   __syncthreads();
25   *V = 0.0;
26   if (t_id_x < Q_1D && t_id_y < P_1D) {
27     for (CeedInt i = 0; i < P_1D; i++) {
28       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
29     }
30   }
31 }
32 
33 //------------------------------------------------------------------------------
34 // 2D tensor contract y
35 //------------------------------------------------------------------------------
36 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
37 inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
38                                             CeedScalar *V) {
39   __syncthreads();
40   data.slice[t_id_x + t_id_y * T_1D] = *U;
41   __syncthreads();
42   *V = 0.0;
43   if (t_id_x < Q_1D && t_id_y < Q_1D) {
44     for (CeedInt i = 0; i < P_1D; i++) {
45       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
46     }
47   }
48 }
49 
50 //------------------------------------------------------------------------------
51 // 2D transpose tensor contract y
52 //------------------------------------------------------------------------------
53 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
54 inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
55                                                      const CeedScalar *B, CeedScalar *V) {
56   __syncthreads();
57   data.slice[t_id_x + t_id_y * T_1D] = *U;
58   __syncthreads();
59   *V = 0.0;
60   if (t_id_x < Q_1D && t_id_y < P_1D) {
61     for (CeedInt i = 0; i < Q_1D; i++) {
62       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
63     }
64   }
65 }
66 
67 //------------------------------------------------------------------------------
68 // 2D transpose tensor contract x
69 //------------------------------------------------------------------------------
70 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
71 inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
72                                                      const CeedScalar *B, CeedScalar *V) {
73   __syncthreads();
74   data.slice[t_id_x + t_id_y * T_1D] = *U;
75   __syncthreads();
76   *V = 0.0;
77   if (t_id_x < P_1D && t_id_y < P_1D) {
78     for (CeedInt i = 0; i < Q_1D; i++) {
79       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
80     }
81   }
82 }
83 
84 //------------------------------------------------------------------------------
85 // 2D transpose tensor contract and add x
86 //------------------------------------------------------------------------------
87 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
88 inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
89                                                         const CeedScalar *B, CeedScalar *V) {
90   __syncthreads();
91   data.slice[t_id_x + t_id_y * T_1D] = *U;
92   __syncthreads();
93   if (t_id_x < P_1D && t_id_y < P_1D) {
94     for (CeedInt i = 0; i < Q_1D; i++) {
95       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
96     }
97   }
98 }
99 
100 //------------------------------------------------------------------------------
101 // 2D pack/unpack quadrature values
102 //------------------------------------------------------------------------------
103 template <int NUM_COMP, int Q_1D, int T_1D>
104 inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
105   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
106 
107   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108     __syncthreads();
109     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
110     __syncthreads();
111     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
112   }
113 }
114 
115 template <int NUM_COMP, int Q_1D, int T_1D>
116 inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
117   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
118 
119   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120     __syncthreads();
121     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
122     __syncthreads();
123     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
124   }
125 }
126 
127 //------------------------------------------------------------------------------
128 // 2D interpolate to quadrature points
129 //------------------------------------------------------------------------------
130 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
131 inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
132                                                CeedScalar *__restrict__ r_V) {
133   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
134   CeedScalar r_t[1];
135 
136   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
137   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
139     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
140   }
141   __syncthreads();
142   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
143   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
144 }
145 
146 //------------------------------------------------------------------------------
147 // 2D interpolate transpose
148 //------------------------------------------------------------------------------
149 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
150 inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
151                                                         CeedScalar *__restrict__ r_V) {
152   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
153   CeedScalar r_t[1];
154 
155   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
156   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
157     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
158     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
159   }
160   __syncthreads();
161   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
162 }
163 
164 //------------------------------------------------------------------------------
165 // 2D interpolate to quadrature points, nodes and quadrature points collocated
166 //------------------------------------------------------------------------------
167 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
168 inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
169                                                               CeedScalar *__restrict__ r_V) {
170   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
171 
172   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
173   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
174     r_V[comp] = r_U[comp];
175   }
176   __syncthreads();
177   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
178   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
179 }
180 
181 //------------------------------------------------------------------------------
182 // 2D interpolate transpose, nodes and quadrature points collocated
183 //------------------------------------------------------------------------------
184 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
185 inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
186                                                                        CeedScalar *__restrict__ r_V) {
187   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
188 
189   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
190   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
191     r_V[comp] = r_U[comp];
192   }
193   __syncthreads();
194   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
195 }
196 
197 //------------------------------------------------------------------------------
198 // 2D derivatives at quadrature points
199 //------------------------------------------------------------------------------
200 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
201 inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
202                                              CeedScalar *__restrict__ r_V) {
203   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
204   CeedScalar r_t[1];
205 
206   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
207   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
208     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
209     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
210     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
211     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
212   }
213   __syncthreads();
214   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
215   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
216 }
217 
218 //------------------------------------------------------------------------------
219 // 2D derivatives transpose
220 //------------------------------------------------------------------------------
221 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
222 inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
223                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
224   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
225   CeedScalar r_t[1];
226 
227   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
228   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
229     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
230     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
231     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
232     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
233   }
234   __syncthreads();
235   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
236 }
237 
238 //------------------------------------------------------------------------------
239 // 2D derivatives at quadrature points, nodes and quadrature points collocated
240 //------------------------------------------------------------------------------
241 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
242 inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
243                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
244   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
245 
246   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
247   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
248     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
249     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
250   }
251   __syncthreads();
252   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
253   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
254 }
255 
256 //------------------------------------------------------------------------------
257 // 2D derivatives transpose, nodes and quadrature points collocated
258 //------------------------------------------------------------------------------
259 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
260 inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
261                                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
262   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
263 
264   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
265   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
266     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
267     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
268   }
269   __syncthreads();
270   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
271 }
272 
273 //------------------------------------------------------------------------------
274 // 2D quadrature weights
275 //------------------------------------------------------------------------------
276 template <int P_1D, int Q_1D>
277 inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
278   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
279 
280   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
281 }
282 
283 //------------------------------------------------------------------------------
284 // 3D
285 //------------------------------------------------------------------------------
286 
287 //------------------------------------------------------------------------------
288 // 3D tensor contract x
289 //------------------------------------------------------------------------------
290 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
291 inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
292                                             const CeedScalar *B, CeedScalar *V) {
293   __syncthreads();
294   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
295   __syncthreads();
296   *V = 0.0;
297   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
298     for (CeedInt i = 0; i < P_1D; i++) {
299       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
300     }
301   }
302 }
303 
304 //------------------------------------------------------------------------------
305 // 3D tensor contract y
306 //------------------------------------------------------------------------------
307 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
308 inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
309                                             const CeedScalar *B, CeedScalar *V) {
310   __syncthreads();
311   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
312   __syncthreads();
313   *V = 0.0;
314   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
315     for (CeedInt i = 0; i < P_1D; i++) {
316       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
317     }
318   }
319 }
320 
321 //------------------------------------------------------------------------------
322 // 3D tensor contract z
323 //------------------------------------------------------------------------------
324 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
325 inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
326                                             const CeedScalar *B, CeedScalar *V) {
327   __syncthreads();
328   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
329   __syncthreads();
330   *V = 0.0;
331   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
332     for (CeedInt i = 0; i < P_1D; i++) {
333       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
334     }
335   }
336 }
337 
338 //------------------------------------------------------------------------------
// 3D transpose tensor contract z
340 //------------------------------------------------------------------------------
341 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
342 inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
343                                                      const CeedScalar *B, CeedScalar *V) {
344   __syncthreads();
345   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
346   __syncthreads();
347   *V = 0.0;
348   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
349     for (CeedInt i = 0; i < Q_1D; i++) {
350       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
351     }
352   }
353 }
354 
355 //------------------------------------------------------------------------------
// 3D transpose tensor contract and add z
357 //------------------------------------------------------------------------------
358 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
359 inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
360                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
361   __syncthreads();
362   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
363   __syncthreads();
364   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
365     for (CeedInt i = 0; i < Q_1D; i++) {
366       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
367     }
368   }
369 }
370 
371 //------------------------------------------------------------------------------
372 // 3D transpose tensor contract y
373 //------------------------------------------------------------------------------
374 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
375 inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
376                                                      const CeedScalar *B, CeedScalar *V) {
377   __syncthreads();
378   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
379   __syncthreads();
380   *V = 0.0;
381   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
382     for (CeedInt i = 0; i < Q_1D; i++) {
383       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
384     }
385   }
386 }
387 
388 //------------------------------------------------------------------------------
// 3D transpose tensor contract and add y
390 //------------------------------------------------------------------------------
391 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
392 inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
393                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
394   __syncthreads();
395   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
396   __syncthreads();
397   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
398     for (CeedInt i = 0; i < Q_1D; i++) {
399       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
400     }
401   }
402 }
403 
404 //------------------------------------------------------------------------------
405 // 3D transpose tensor contract x
406 //------------------------------------------------------------------------------
407 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
408 inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
409                                                      const CeedScalar *B, CeedScalar *V) {
410   __syncthreads();
411   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
412   __syncthreads();
413   *V = 0.0;
414   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
415     for (CeedInt i = 0; i < Q_1D; i++) {
416       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
417     }
418   }
419 }
420 
421 //------------------------------------------------------------------------------
// 3D transpose tensor contract and add x
423 //------------------------------------------------------------------------------
424 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
425 inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
426                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
427   __syncthreads();
428   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
429   __syncthreads();
430   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
431     for (CeedInt i = 0; i < Q_1D; i++) {
432       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
433     }
434   }
435 }
436 
437 //------------------------------------------------------------------------------
438 // 3D pack/unpack quadrature values
439 //------------------------------------------------------------------------------
440 template <int NUM_COMP, int Q_1D, int T_1D>
441 inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
442   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
443 
444   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
445     __syncthreads();
446     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
447     __syncthreads();
448     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
449   }
450 }
451 
452 template <int NUM_COMP, int Q_1D, int T_1D>
453 inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
454   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
455 
456   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
457     __syncthreads();
458     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
459     __syncthreads();
460     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
461   }
462 }
463 
464 //------------------------------------------------------------------------------
465 // 3D interpolate to quadrature points
466 //------------------------------------------------------------------------------
467 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
468 inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
469                                                CeedScalar *__restrict__ r_V) {
470   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
471   CeedScalar    r_t1[1], r_t2[1];
472 
473   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
474   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
475     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
476     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
477     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
478   }
479   __syncthreads();
480   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
481   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
482 }
483 
484 //------------------------------------------------------------------------------
485 // 3D interpolate transpose
486 //------------------------------------------------------------------------------
487 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
488 inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
489                                                         CeedScalar *__restrict__ r_V) {
490   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
491   CeedScalar    r_t1[1], r_t2[1];
492 
493   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
494   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
495     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
496     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
497     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
498   }
499   __syncthreads();
500   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
501 }
502 
503 //------------------------------------------------------------------------------
504 // 3D interpolate to quadrature points, nodes and quadrature points collocated
505 //------------------------------------------------------------------------------
506 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
507 inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
508                                                               CeedScalar *__restrict__ r_V) {
509   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
510 
511   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
512   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
513     r_V[comp] = r_U[comp];
514   }
515   __syncthreads();
516   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
517   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
518 }
519 
520 //------------------------------------------------------------------------------
521 // 3D interpolate transpose, nodes and quadrature points collocated
522 //------------------------------------------------------------------------------
523 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
524 inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
525                                                                        CeedScalar *__restrict__ r_V) {
526   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
527 
528   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
529   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
530     r_V[comp] = r_U[comp];
531   }
532   __syncthreads();
533   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
534 }
535 
536 //------------------------------------------------------------------------------
537 // 3D derivatives at quadrature points
538 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Recover the (x, y, z) tensor coordinates of this thread from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  // Single-value scratch registers ping-ponged between successive contractions
  CeedScalar    r_scratch_a[1], r_scratch_b[1];

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // d/dx: contract with c_G in x, c_B in y and z
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_scratch_a);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_scratch_a, c_B, r_scratch_b);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_scratch_b, c_B, &r_V[comp + 0 * NUM_COMP]);
    // d/dy: contract with c_G in y, c_B in x and z
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_scratch_a);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_scratch_a, c_G, r_scratch_b);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_scratch_b, c_B, &r_V[comp + 1 * NUM_COMP]);
    // d/dz: contract with c_G in z, c_B in x and y
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_scratch_a);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_scratch_a, c_B, r_scratch_b);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_scratch_b, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  __syncthreads();
  // Restore the packed register layouts expected by the caller
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
561 
562 //------------------------------------------------------------------------------
563 // 3D derivatives transpose
564 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover the (x, y, z) tensor coordinates of this thread from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Fix: thread indices are runtime values and must be passed as function arguments,
    // not template arguments (non-type template parameters require constant expressions);
    // this now matches every other contraction call site in this file.
    // Transpose of the x-derivative: c_G^T in x, c_B^T in y and z
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);
    // Transpose of the y-derivative, accumulated into r_V[comp] via the Add variant
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
    // Transpose of the z-derivative, accumulated into r_V[comp]
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  // Restore the packed register layout expected by the caller
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
586 
587 //------------------------------------------------------------------------------
// 3D derivatives at quadrature points, collocated derivative operator
589 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover the (x, y, z) tensor coordinates of this thread from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  // Single-value scratch registers swapped between successive contractions
  CeedScalar    r_a[1], r_b[1];

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Step 1: contract with c_B in all three directions; result lands in r_a
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_a);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_a, c_B, r_b);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_b, c_B, r_a);
    // Step 2: apply the collocated c_G operator (Q_1D x Q_1D) once per direction
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_a, c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_a, c_G, &r_V[comp + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_a, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  __syncthreads();
  // Restore the packed register layouts expected by the caller
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
609 
610 //------------------------------------------------------------------------------
611 // 3D derivatives transpose
612 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover the (x, y, z) tensor coordinates of this thread from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  // Single-value scratch registers; r_t2 also acts as the accumulator for the three derivative transposes
  CeedScalar    r_t1[1], r_t2[1];

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Accumulate the transposed z-, y-, and x-derivative contributions into r_t2
    // (note the Add variants: the second and third calls sum into the first result)
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
    // Then contract the accumulated result with c_B^T in all three directions into r_V[comp]
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  // Restore the packed register layout expected by the caller
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
631 
632 //------------------------------------------------------------------------------
633 // 3D derivatives at quadrature points, nodes and quadrature points collocated
634 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover the (x, y, z) tensor coordinates of this thread from the flattened 1D thread index
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Fix: the contraction helpers take a const CeedScalar *U argument, so the address of
    // the component value must be passed (&r_U[comp]), matching every sibling call site;
    // passing the value r_U[comp] does not convert to a pointer and fails to compile.
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  __syncthreads();
  // Restore the packed register layouts expected by the caller
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
650 
651 //------------------------------------------------------------------------------
652 // 3D derivatives transpose, nodes and quadrature points collocated
653 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Decompose the flattened 1D thread index into per-direction coordinates
  const CeedInt tx = data.t_id_x % T_1D, ty = (data.t_id_x / T_1D) % T_1D, tz = data.t_id_x / (T_1D * T_1D);

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, tx, ty, tz, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Sum the transposed z-, y-, and x-derivative contributions directly into r_V[comp]
    // (the Add variants accumulate onto the first contraction's result)
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, tx, ty, tz, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, tx, ty, tz, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, tx, ty, tz, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
  }
  __syncthreads();
  // Restore the packed register layout expected by the caller
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, tx, ty, tz, r_V);
}
668 
669 //------------------------------------------------------------------------------
670 // 3D quadrature weights
671 //------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Map the flattened 1D thread index onto (x, y, z) quadrature-point coordinates
  const CeedInt q_x = data.t_id_x % Q_1D;
  const CeedInt q_y = (data.t_id_x / Q_1D) % Q_1D;
  const CeedInt q_z = data.t_id_x / (Q_1D * Q_1D);

  // Tensor-product weight for in-range threads; excess threads write zero
  if (q_x < Q_1D && q_y < Q_1D && q_z < Q_1D) {
    *w = q_weight_1d[q_x] * q_weight_1d[q_y] * q_weight_1d[q_z];
  } else {
    *w = 0.0;
  }
}
678