xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h (revision 0ccda8ebe42db3fb90cdb724a58e4e5d2aedf1a1)
19b91271bSJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
29b91271bSJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39b91271bSJeremy L Thompson //
49b91271bSJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
59b91271bSJeremy L Thompson //
69b91271bSJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
79b91271bSJeremy L Thompson 
89b91271bSJeremy L Thompson /// @file
99b91271bSJeremy L Thompson /// Internal header for HIP shared memory tensor product basis templates
109b91271bSJeremy L Thompson #include <ceed/types.h>
119b91271bSJeremy L Thompson 
129b91271bSJeremy L Thompson //------------------------------------------------------------------------------
139b91271bSJeremy L Thompson // 2D
149b91271bSJeremy L Thompson //------------------------------------------------------------------------------
159b91271bSJeremy L Thompson 
169b91271bSJeremy L Thompson //------------------------------------------------------------------------------
179b91271bSJeremy L Thompson // 2D tensor contraction x
189b91271bSJeremy L Thompson //------------------------------------------------------------------------------
199b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
209b91271bSJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
219b91271bSJeremy L Thompson                                             CeedScalar *V) {
22d6c19ee8SJeremy L Thompson   __syncthreads();
239b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
249b91271bSJeremy L Thompson   __syncthreads();
259b91271bSJeremy L Thompson   *V = 0.0;
269b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
279b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
289b91271bSJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
299b91271bSJeremy L Thompson     }
309b91271bSJeremy L Thompson   }
319b91271bSJeremy L Thompson }
329b91271bSJeremy L Thompson 
339b91271bSJeremy L Thompson //------------------------------------------------------------------------------
349b91271bSJeremy L Thompson // 2D tensor contract y
359b91271bSJeremy L Thompson //------------------------------------------------------------------------------
369b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
379b91271bSJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
389b91271bSJeremy L Thompson                                             CeedScalar *V) {
39d6c19ee8SJeremy L Thompson   __syncthreads();
409b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
419b91271bSJeremy L Thompson   __syncthreads();
429b91271bSJeremy L Thompson   *V = 0.0;
439b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
449b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
459b91271bSJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
469b91271bSJeremy L Thompson     }
479b91271bSJeremy L Thompson   }
489b91271bSJeremy L Thompson }
499b91271bSJeremy L Thompson 
509b91271bSJeremy L Thompson //------------------------------------------------------------------------------
519b91271bSJeremy L Thompson // 2D transpose tensor contract y
529b91271bSJeremy L Thompson //------------------------------------------------------------------------------
539b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
549b91271bSJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
559b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
56d6c19ee8SJeremy L Thompson   __syncthreads();
579b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
589b91271bSJeremy L Thompson   __syncthreads();
599b91271bSJeremy L Thompson   *V = 0.0;
609b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
619b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
629b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
639b91271bSJeremy L Thompson     }
649b91271bSJeremy L Thompson   }
659b91271bSJeremy L Thompson }
669b91271bSJeremy L Thompson 
679b91271bSJeremy L Thompson //------------------------------------------------------------------------------
689b91271bSJeremy L Thompson // 2D transpose tensor contract x
699b91271bSJeremy L Thompson //------------------------------------------------------------------------------
709b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
719b91271bSJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
729b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
73d6c19ee8SJeremy L Thompson   __syncthreads();
749b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
759b91271bSJeremy L Thompson   __syncthreads();
769b91271bSJeremy L Thompson   *V = 0.0;
779b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
789b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
799b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
809b91271bSJeremy L Thompson     }
819b91271bSJeremy L Thompson   }
829b91271bSJeremy L Thompson }
839b91271bSJeremy L Thompson 
849b91271bSJeremy L Thompson //------------------------------------------------------------------------------
859b91271bSJeremy L Thompson // 2D transpose tensor contract and add x
869b91271bSJeremy L Thompson //------------------------------------------------------------------------------
879b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
889b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
899b91271bSJeremy L Thompson                                                         const CeedScalar *B, CeedScalar *V) {
90d6c19ee8SJeremy L Thompson   __syncthreads();
919b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
929b91271bSJeremy L Thompson   __syncthreads();
939b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
949b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
959b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
969b91271bSJeremy L Thompson     }
979b91271bSJeremy L Thompson   }
989b91271bSJeremy L Thompson }
999b91271bSJeremy L Thompson 
1009b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1019b91271bSJeremy L Thompson // 2D pack/unpack quadrature values
1029b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1039b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
1049b91271bSJeremy L Thompson inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
1059b91271bSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
1069b91271bSJeremy L Thompson 
1079b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108d6c19ee8SJeremy L Thompson     __syncthreads();
1099b91271bSJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
1109b91271bSJeremy L Thompson     __syncthreads();
1119b91271bSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
1129b91271bSJeremy L Thompson   }
1139b91271bSJeremy L Thompson }
1149b91271bSJeremy L Thompson 
1159b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
1169b91271bSJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
1179b91271bSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
1189b91271bSJeremy L Thompson 
1199b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120d6c19ee8SJeremy L Thompson     __syncthreads();
1219b91271bSJeremy L Thompson     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
1229b91271bSJeremy L Thompson     __syncthreads();
1239b91271bSJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
1249b91271bSJeremy L Thompson   }
1259b91271bSJeremy L Thompson }
1269b91271bSJeremy L Thompson 
1279b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1289b91271bSJeremy L Thompson // 2D interpolate to quadrature points
1299b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1309b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1319b91271bSJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1329b91271bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
1339b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1349b91271bSJeremy L Thompson   CeedScalar r_t[1];
1359b91271bSJeremy L Thompson 
136ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1379b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1389b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
1399b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
1409b91271bSJeremy L Thompson   }
1413e2e790dSJeremy L Thompson   __syncthreads();
142ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
143ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1449b91271bSJeremy L Thompson }
1459b91271bSJeremy L Thompson 
1469b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1479b91271bSJeremy L Thompson // 2D interpolate transpose
1489b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1499b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1509b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1519b91271bSJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
1529b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1539b91271bSJeremy L Thompson   CeedScalar r_t[1];
1549b91271bSJeremy L Thompson 
155ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1569b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1579b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
1589b91271bSJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
1599b91271bSJeremy L Thompson   }
1603e2e790dSJeremy L Thompson   __syncthreads();
161ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1629b91271bSJeremy L Thompson }
1639b91271bSJeremy L Thompson 
1649b91271bSJeremy L Thompson //------------------------------------------------------------------------------
165*0ccda8ebSJeremy L Thompson // 2D interpolate to quadrature points, nodes and quadrature points collocated
166*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
167*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
168*0ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
169*0ccda8ebSJeremy L Thompson                                                               CeedScalar *__restrict__ r_V) {
170*0ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
171*0ccda8ebSJeremy L Thompson 
172*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
173*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
174*0ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
175*0ccda8ebSJeremy L Thompson   }
176*0ccda8ebSJeremy L Thompson   __syncthreads();
177*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
178*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
179*0ccda8ebSJeremy L Thompson }
180*0ccda8ebSJeremy L Thompson 
181*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
182*0ccda8ebSJeremy L Thompson // 2D interpolate transpose, nodes and quadrature points collocated
183*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
184*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
185*0ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
186*0ccda8ebSJeremy L Thompson                                                                        CeedScalar *__restrict__ r_V) {
187*0ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
188*0ccda8ebSJeremy L Thompson 
189*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
190*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
191*0ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
192*0ccda8ebSJeremy L Thompson   }
193*0ccda8ebSJeremy L Thompson   __syncthreads();
194*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
195*0ccda8ebSJeremy L Thompson }
196*0ccda8ebSJeremy L Thompson 
197*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
1989b91271bSJeremy L Thompson // 2D derivatives at quadrature points
1999b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2009b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2019b91271bSJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
2029b91271bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
2039b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
2049b91271bSJeremy L Thompson   CeedScalar r_t[1];
2059b91271bSJeremy L Thompson 
206ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
2079b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2089b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
2099b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
2109b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
2119b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
2129b91271bSJeremy L Thompson   }
2133e2e790dSJeremy L Thompson   __syncthreads();
214ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
215ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
2169b91271bSJeremy L Thompson }
2179b91271bSJeremy L Thompson 
2189b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2199b91271bSJeremy L Thompson // 2D derivatives transpose
2209b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2219b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2229b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2239b91271bSJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
2249b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
2259b91271bSJeremy L Thompson   CeedScalar r_t[1];
2269b91271bSJeremy L Thompson 
227ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
2289b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2299b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
2309b91271bSJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
2319b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
2329b91271bSJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
2339b91271bSJeremy L Thompson   }
2343e2e790dSJeremy L Thompson   __syncthreads();
235ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
2369b91271bSJeremy L Thompson }
2379b91271bSJeremy L Thompson 
2389b91271bSJeremy L Thompson //------------------------------------------------------------------------------
239*0ccda8ebSJeremy L Thompson // 2D derivatives at quadrature points, nodes and quadrature points collocated
240*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
241*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
242*0ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
243*0ccda8ebSJeremy L Thompson                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
244*0ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
245*0ccda8ebSJeremy L Thompson 
246*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
247*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
248*0ccda8ebSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
249*0ccda8ebSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
250*0ccda8ebSJeremy L Thompson   }
251*0ccda8ebSJeremy L Thompson   __syncthreads();
252*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
253*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
254*0ccda8ebSJeremy L Thompson }
255*0ccda8ebSJeremy L Thompson 
256*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
257*0ccda8ebSJeremy L Thompson // 2D derivatives transpose, nodes and quadrature points collocated
258*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
259*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
260*0ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
261*0ccda8ebSJeremy L Thompson                                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
262*0ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
263*0ccda8ebSJeremy L Thompson 
264*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
265*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
266*0ccda8ebSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
267*0ccda8ebSJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
268*0ccda8ebSJeremy L Thompson   }
269*0ccda8ebSJeremy L Thompson   __syncthreads();
270*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
271*0ccda8ebSJeremy L Thompson }
272*0ccda8ebSJeremy L Thompson 
273*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2749b91271bSJeremy L Thompson // 2D quadrature weights
2759b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2769b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
2779b91271bSJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
2789b91271bSJeremy L Thompson   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
2799b91271bSJeremy L Thompson 
2809b91271bSJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
2819b91271bSJeremy L Thompson }
2829b91271bSJeremy L Thompson 
2839b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2849b91271bSJeremy L Thompson // 3D
2859b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2869b91271bSJeremy L Thompson 
2879b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2889b91271bSJeremy L Thompson // 3D tensor contract x
2899b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2909b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2919b91271bSJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
2929b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
293d6c19ee8SJeremy L Thompson   __syncthreads();
2949b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2959b91271bSJeremy L Thompson   __syncthreads();
2969b91271bSJeremy L Thompson   *V = 0.0;
2979b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
2989b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
2999b91271bSJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
3009b91271bSJeremy L Thompson     }
3019b91271bSJeremy L Thompson   }
3029b91271bSJeremy L Thompson }
3039b91271bSJeremy L Thompson 
3049b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3059b91271bSJeremy L Thompson // 3D tensor contract y
3069b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3079b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3089b91271bSJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3099b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
310d6c19ee8SJeremy L Thompson   __syncthreads();
3119b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3129b91271bSJeremy L Thompson   __syncthreads();
3139b91271bSJeremy L Thompson   *V = 0.0;
3149b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
3159b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
3169b91271bSJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
3179b91271bSJeremy L Thompson     }
3189b91271bSJeremy L Thompson   }
3199b91271bSJeremy L Thompson }
3209b91271bSJeremy L Thompson 
3219b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3229b91271bSJeremy L Thompson // 3D tensor contract z
3239b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3249b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3259b91271bSJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3269b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
327d6c19ee8SJeremy L Thompson   __syncthreads();
3289b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3299b91271bSJeremy L Thompson   __syncthreads();
3309b91271bSJeremy L Thompson   *V = 0.0;
3319b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
3329b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
3339b91271bSJeremy L Thompson       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
3349b91271bSJeremy L Thompson     }
3359b91271bSJeremy L Thompson   }
3369b91271bSJeremy L Thompson }
3379b91271bSJeremy L Thompson 
3389b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3399b91271bSJeremy L Thompson // 3D tensor contract z
3409b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3419b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3429b91271bSJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3439b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
344d6c19ee8SJeremy L Thompson   __syncthreads();
3459b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3469b91271bSJeremy L Thompson   __syncthreads();
3479b91271bSJeremy L Thompson   *V = 0.0;
3489b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
3499b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3509b91271bSJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
3519b91271bSJeremy L Thompson     }
3529b91271bSJeremy L Thompson   }
3539b91271bSJeremy L Thompson }
3549b91271bSJeremy L Thompson 
3559b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3569b91271bSJeremy L Thompson // 3D transpose tensor contract z
3579b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3589b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3599b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
3609b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
361d6c19ee8SJeremy L Thompson   __syncthreads();
3629b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3639b91271bSJeremy L Thompson   __syncthreads();
3649b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
3659b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3669b91271bSJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
3679b91271bSJeremy L Thompson     }
3689b91271bSJeremy L Thompson   }
3699b91271bSJeremy L Thompson }
3709b91271bSJeremy L Thompson 
3719b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3729b91271bSJeremy L Thompson // 3D transpose tensor contract y
3739b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3749b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3759b91271bSJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3769b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
377d6c19ee8SJeremy L Thompson   __syncthreads();
3789b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3799b91271bSJeremy L Thompson   __syncthreads();
3809b91271bSJeremy L Thompson   *V = 0.0;
3819b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
3829b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3839b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
3849b91271bSJeremy L Thompson     }
3859b91271bSJeremy L Thompson   }
3869b91271bSJeremy L Thompson }
3879b91271bSJeremy L Thompson 
3889b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3899b91271bSJeremy L Thompson // 3D transpose tensor contract y
3909b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3919b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3929b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
3939b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
394d6c19ee8SJeremy L Thompson   __syncthreads();
3959b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3969b91271bSJeremy L Thompson   __syncthreads();
3979b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
3989b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3999b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
4009b91271bSJeremy L Thompson     }
4019b91271bSJeremy L Thompson   }
4029b91271bSJeremy L Thompson }
4039b91271bSJeremy L Thompson 
4049b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4059b91271bSJeremy L Thompson // 3D transpose tensor contract x
4069b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4079b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4089b91271bSJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
4099b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
410d6c19ee8SJeremy L Thompson   __syncthreads();
4119b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
4129b91271bSJeremy L Thompson   __syncthreads();
4139b91271bSJeremy L Thompson   *V = 0.0;
4149b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
4159b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
4169b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
4179b91271bSJeremy L Thompson     }
4189b91271bSJeremy L Thompson   }
4199b91271bSJeremy L Thompson }
4209b91271bSJeremy L Thompson 
4219b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4229b91271bSJeremy L Thompson // 3D transpose tensor contract add x
4239b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4249b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4259b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
4269b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
427d6c19ee8SJeremy L Thompson   __syncthreads();
4289b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
4299b91271bSJeremy L Thompson   __syncthreads();
4309b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
4319b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
4329b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
4339b91271bSJeremy L Thompson     }
4349b91271bSJeremy L Thompson   }
4359b91271bSJeremy L Thompson }
4369b91271bSJeremy L Thompson 
4379b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4389b91271bSJeremy L Thompson // 3D pack/unpack quadrature values
4399b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4409b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
4419b91271bSJeremy L Thompson inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
4429b91271bSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
4439b91271bSJeremy L Thompson 
4449b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
445d6c19ee8SJeremy L Thompson     __syncthreads();
4469b91271bSJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
4479b91271bSJeremy L Thompson     __syncthreads();
4489b91271bSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
4499b91271bSJeremy L Thompson   }
4509b91271bSJeremy L Thompson }
4519b91271bSJeremy L Thompson 
4529b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
4539b91271bSJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
4549b91271bSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
4559b91271bSJeremy L Thompson 
4569b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
457d6c19ee8SJeremy L Thompson     __syncthreads();
4589b91271bSJeremy L Thompson     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
4599b91271bSJeremy L Thompson     __syncthreads();
4609b91271bSJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
4619b91271bSJeremy L Thompson   }
4629b91271bSJeremy L Thompson }
4639b91271bSJeremy L Thompson 
4649b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4659b91271bSJeremy L Thompson // 3D interpolate to quadrature points
4669b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4679b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4689b91271bSJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4699b91271bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
4709b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4719b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4729b91271bSJeremy L Thompson 
473ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4749b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4759b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4769b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4779b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
4789b91271bSJeremy L Thompson   }
4793e2e790dSJeremy L Thompson   __syncthreads();
480ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
481ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
4829b91271bSJeremy L Thompson }
4839b91271bSJeremy L Thompson 
4849b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4859b91271bSJeremy L Thompson // 3D interpolate transpose
4869b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4879b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4889b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4899b91271bSJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
4909b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4919b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4929b91271bSJeremy L Thompson 
493ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4949b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4959b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4969b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4979b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
4989b91271bSJeremy L Thompson   }
4993e2e790dSJeremy L Thompson   __syncthreads();
500ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5019b91271bSJeremy L Thompson }
5029b91271bSJeremy L Thompson 
5039b91271bSJeremy L Thompson //------------------------------------------------------------------------------
504*0ccda8ebSJeremy L Thompson // 3D interpolate to quadrature points, nodes and quadrature points collocated
505*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
506*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
507*0ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
508*0ccda8ebSJeremy L Thompson                                                               CeedScalar *__restrict__ r_V) {
509*0ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
510*0ccda8ebSJeremy L Thompson 
511*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
512*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
513*0ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
514*0ccda8ebSJeremy L Thompson   }
515*0ccda8ebSJeremy L Thompson   __syncthreads();
516*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
517*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
518*0ccda8ebSJeremy L Thompson }
519*0ccda8ebSJeremy L Thompson 
520*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
521*0ccda8ebSJeremy L Thompson // 3D interpolate transpose, nodes and quadrature points collocated
522*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
523*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
524*0ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
525*0ccda8ebSJeremy L Thompson                                                                        CeedScalar *__restrict__ r_V) {
526*0ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
527*0ccda8ebSJeremy L Thompson 
528*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
529*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
530*0ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
531*0ccda8ebSJeremy L Thompson   }
532*0ccda8ebSJeremy L Thompson   __syncthreads();
533*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
534*0ccda8ebSJeremy L Thompson }
535*0ccda8ebSJeremy L Thompson 
536*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5379b91271bSJeremy L Thompson // 3D derivatives at quadrature points
5389b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5399b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5409b91271bSJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5419b91271bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
5429b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5439b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
5449b91271bSJeremy L Thompson 
545ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5469b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5479b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
5489b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
5499b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
5509b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
5519b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
5529b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
5539b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
5549b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
5559b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
5569b91271bSJeremy L Thompson   }
5573e2e790dSJeremy L Thompson   __syncthreads();
558ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
559ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5609b91271bSJeremy L Thompson }
5619b91271bSJeremy L Thompson 
5629b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5639b91271bSJeremy L Thompson // 3D derivatives transpose
5649b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5659b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5669b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5679b91271bSJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
5689b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5699b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
5709b91271bSJeremy L Thompson 
571ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5729b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5739b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
5749b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5759b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
5769b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
5779b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
5789b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
5799b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
5809b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5819b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
5829b91271bSJeremy L Thompson   }
5833e2e790dSJeremy L Thompson   __syncthreads();
584ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5859b91271bSJeremy L Thompson }
5869b91271bSJeremy L Thompson 
5879b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5889b91271bSJeremy L Thompson // 3D derivatives at quadrature points
5899b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5909b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5919b91271bSJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5929b91271bSJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
5939b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5949b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
5959b91271bSJeremy L Thompson 
596ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5979b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5989b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
5999b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
6009b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
6019b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
6029b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
6039b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
6049b91271bSJeremy L Thompson   }
6053e2e790dSJeremy L Thompson   __syncthreads();
606ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
607ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
6089b91271bSJeremy L Thompson }
6099b91271bSJeremy L Thompson 
6109b91271bSJeremy L Thompson //------------------------------------------------------------------------------
6119b91271bSJeremy L Thompson // 3D derivatives transpose
6129b91271bSJeremy L Thompson //------------------------------------------------------------------------------
6139b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
6149b91271bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6159b91271bSJeremy L Thompson                                                                 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6169b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
6179b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
6189b91271bSJeremy L Thompson 
619ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
6209b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
6219b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
6229b91271bSJeremy L Thompson     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
6239b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
6249b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
6259b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
6269b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
6279b91271bSJeremy L Thompson   }
6283e2e790dSJeremy L Thompson   __syncthreads();
629ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
6309b91271bSJeremy L Thompson }
6319b91271bSJeremy L Thompson 
6329b91271bSJeremy L Thompson //------------------------------------------------------------------------------
633*0ccda8ebSJeremy L Thompson // 3D derivatives at quadrature points, nodes and quadrature points collocated
634*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
635*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
636*0ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
637*0ccda8ebSJeremy L Thompson                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
638*0ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
639*0ccda8ebSJeremy L Thompson 
640*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
641*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
642*0ccda8ebSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
643*0ccda8ebSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
644*0ccda8ebSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
645*0ccda8ebSJeremy L Thompson   }
646*0ccda8ebSJeremy L Thompson   __syncthreads();
647*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
648*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
649*0ccda8ebSJeremy L Thompson }
650*0ccda8ebSJeremy L Thompson 
651*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
652*0ccda8ebSJeremy L Thompson // 3D derivatives transpose, nodes and quadrature points collocated
653*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
654*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
655*0ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
656*0ccda8ebSJeremy L Thompson                                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
657*0ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
658*0ccda8ebSJeremy L Thompson 
659*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
660*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
661*0ccda8ebSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]);
662*0ccda8ebSJeremy L Thompson     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
663*0ccda8ebSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
664*0ccda8ebSJeremy L Thompson   }
665*0ccda8ebSJeremy L Thompson   __syncthreads();
666*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
667*0ccda8ebSJeremy L Thompson }
668*0ccda8ebSJeremy L Thompson 
669*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
6709b91271bSJeremy L Thompson // 3D quadrature weights
6719b91271bSJeremy L Thompson //------------------------------------------------------------------------------
6729b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
6739b91271bSJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
6749b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
6759b91271bSJeremy L Thompson 
6769b91271bSJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
6779b91271bSJeremy L Thompson }
678