xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h (revision 3e2e790d55559d3802e94c4d2ca1c8066babe5db)
19b91271bSJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
29b91271bSJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39b91271bSJeremy L Thompson //
49b91271bSJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
59b91271bSJeremy L Thompson //
69b91271bSJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
79b91271bSJeremy L Thompson 
89b91271bSJeremy L Thompson /// @file
99b91271bSJeremy L Thompson /// Internal header for HIP shared memory tensor product basis templates
109b91271bSJeremy L Thompson #include <ceed/types.h>
119b91271bSJeremy L Thompson 
129b91271bSJeremy L Thompson //------------------------------------------------------------------------------
139b91271bSJeremy L Thompson // 2D
149b91271bSJeremy L Thompson //------------------------------------------------------------------------------
159b91271bSJeremy L Thompson 
169b91271bSJeremy L Thompson //------------------------------------------------------------------------------
179b91271bSJeremy L Thompson // 2D tensor contraction x
189b91271bSJeremy L Thompson //------------------------------------------------------------------------------
199b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
209b91271bSJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
219b91271bSJeremy L Thompson                                             CeedScalar *V) {
22d6c19ee8SJeremy L Thompson   __syncthreads();
239b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
249b91271bSJeremy L Thompson   __syncthreads();
259b91271bSJeremy L Thompson   *V = 0.0;
269b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
279b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
289b91271bSJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
299b91271bSJeremy L Thompson     }
309b91271bSJeremy L Thompson   }
319b91271bSJeremy L Thompson }
329b91271bSJeremy L Thompson 
339b91271bSJeremy L Thompson //------------------------------------------------------------------------------
349b91271bSJeremy L Thompson // 2D tensor contract y
359b91271bSJeremy L Thompson //------------------------------------------------------------------------------
369b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
379b91271bSJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
389b91271bSJeremy L Thompson                                             CeedScalar *V) {
39d6c19ee8SJeremy L Thompson   __syncthreads();
409b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
419b91271bSJeremy L Thompson   __syncthreads();
429b91271bSJeremy L Thompson   *V = 0.0;
439b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
449b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
459b91271bSJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
469b91271bSJeremy L Thompson     }
479b91271bSJeremy L Thompson   }
489b91271bSJeremy L Thompson }
499b91271bSJeremy L Thompson 
509b91271bSJeremy L Thompson //------------------------------------------------------------------------------
519b91271bSJeremy L Thompson // 2D transpose tensor contract y
529b91271bSJeremy L Thompson //------------------------------------------------------------------------------
539b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
549b91271bSJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
559b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
56d6c19ee8SJeremy L Thompson   __syncthreads();
579b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
589b91271bSJeremy L Thompson   __syncthreads();
599b91271bSJeremy L Thompson   *V = 0.0;
609b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
619b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
629b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
639b91271bSJeremy L Thompson     }
649b91271bSJeremy L Thompson   }
659b91271bSJeremy L Thompson }
669b91271bSJeremy L Thompson 
679b91271bSJeremy L Thompson //------------------------------------------------------------------------------
689b91271bSJeremy L Thompson // 2D transpose tensor contract x
699b91271bSJeremy L Thompson //------------------------------------------------------------------------------
709b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
719b91271bSJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
729b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
73d6c19ee8SJeremy L Thompson   __syncthreads();
749b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
759b91271bSJeremy L Thompson   __syncthreads();
769b91271bSJeremy L Thompson   *V = 0.0;
779b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
789b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
799b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
809b91271bSJeremy L Thompson     }
819b91271bSJeremy L Thompson   }
829b91271bSJeremy L Thompson }
839b91271bSJeremy L Thompson 
849b91271bSJeremy L Thompson //------------------------------------------------------------------------------
859b91271bSJeremy L Thompson // 2D transpose tensor contract and add x
869b91271bSJeremy L Thompson //------------------------------------------------------------------------------
879b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
889b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
899b91271bSJeremy L Thompson                                                         const CeedScalar *B, CeedScalar *V) {
90d6c19ee8SJeremy L Thompson   __syncthreads();
919b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
929b91271bSJeremy L Thompson   __syncthreads();
939b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
949b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
959b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
969b91271bSJeremy L Thompson     }
979b91271bSJeremy L Thompson   }
989b91271bSJeremy L Thompson }
999b91271bSJeremy L Thompson 
1009b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1019b91271bSJeremy L Thompson // 2D pack/unpack quadrature values
1029b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1039b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
1049b91271bSJeremy L Thompson inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
1059b91271bSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
1069b91271bSJeremy L Thompson 
1079b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108d6c19ee8SJeremy L Thompson     __syncthreads();
1099b91271bSJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
1109b91271bSJeremy L Thompson     __syncthreads();
1119b91271bSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
1129b91271bSJeremy L Thompson   }
1139b91271bSJeremy L Thompson }
1149b91271bSJeremy L Thompson 
1159b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
1169b91271bSJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
1179b91271bSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
1189b91271bSJeremy L Thompson 
1199b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120d6c19ee8SJeremy L Thompson     __syncthreads();
1219b91271bSJeremy L Thompson     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
1229b91271bSJeremy L Thompson     __syncthreads();
1239b91271bSJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
1249b91271bSJeremy L Thompson   }
1259b91271bSJeremy L Thompson }
1269b91271bSJeremy L Thompson 
1279b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1289b91271bSJeremy L Thompson // 2D interpolate to quadrature points
1299b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1309b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1319b91271bSJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1329b91271bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
1339b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1349b91271bSJeremy L Thompson   CeedScalar r_t[1];
1359b91271bSJeremy L Thompson 
136ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1379b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1389b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
1399b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
1409b91271bSJeremy L Thompson   }
141*3e2e790dSJeremy L Thompson   __syncthreads();
142ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
143ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1449b91271bSJeremy L Thompson }
1459b91271bSJeremy L Thompson 
1469b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1479b91271bSJeremy L Thompson // 2D interpolate transpose
1489b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1499b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1509b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1519b91271bSJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
1529b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1539b91271bSJeremy L Thompson   CeedScalar r_t[1];
1549b91271bSJeremy L Thompson 
155ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1569b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1579b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
1589b91271bSJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
1599b91271bSJeremy L Thompson   }
160*3e2e790dSJeremy L Thompson   __syncthreads();
161ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1629b91271bSJeremy L Thompson }
1639b91271bSJeremy L Thompson 
1649b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1659b91271bSJeremy L Thompson // 2D derivatives at quadrature points
1669b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1679b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1689b91271bSJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
1699b91271bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
1709b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1719b91271bSJeremy L Thompson   CeedScalar r_t[1];
1729b91271bSJeremy L Thompson 
173ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1749b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1759b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
1769b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
1779b91271bSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
1789b91271bSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
1799b91271bSJeremy L Thompson   }
180*3e2e790dSJeremy L Thompson   __syncthreads();
181ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
182ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1839b91271bSJeremy L Thompson }
1849b91271bSJeremy L Thompson 
1859b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1869b91271bSJeremy L Thompson // 2D derivatives transpose
1879b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1889b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1899b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1909b91271bSJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
1919b91271bSJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1929b91271bSJeremy L Thompson   CeedScalar r_t[1];
1939b91271bSJeremy L Thompson 
194ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1959b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1969b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
1979b91271bSJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
1989b91271bSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
1999b91271bSJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
2009b91271bSJeremy L Thompson   }
201*3e2e790dSJeremy L Thompson   __syncthreads();
202ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
2039b91271bSJeremy L Thompson }
2049b91271bSJeremy L Thompson 
2059b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2069b91271bSJeremy L Thompson // 2D quadrature weights
2079b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2089b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
2099b91271bSJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
2109b91271bSJeremy L Thompson   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
2119b91271bSJeremy L Thompson 
2129b91271bSJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
2139b91271bSJeremy L Thompson }
2149b91271bSJeremy L Thompson 
2159b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2169b91271bSJeremy L Thompson // 3D
2179b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2189b91271bSJeremy L Thompson 
2199b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2209b91271bSJeremy L Thompson // 3D tensor contract x
2219b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2229b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2239b91271bSJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
2249b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
225d6c19ee8SJeremy L Thompson   __syncthreads();
2269b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2279b91271bSJeremy L Thompson   __syncthreads();
2289b91271bSJeremy L Thompson   *V = 0.0;
2299b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
2309b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
2319b91271bSJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
2329b91271bSJeremy L Thompson     }
2339b91271bSJeremy L Thompson   }
2349b91271bSJeremy L Thompson }
2359b91271bSJeremy L Thompson 
2369b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2379b91271bSJeremy L Thompson // 3D tensor contract y
2389b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2399b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2409b91271bSJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
2419b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
242d6c19ee8SJeremy L Thompson   __syncthreads();
2439b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2449b91271bSJeremy L Thompson   __syncthreads();
2459b91271bSJeremy L Thompson   *V = 0.0;
2469b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
2479b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
2489b91271bSJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
2499b91271bSJeremy L Thompson     }
2509b91271bSJeremy L Thompson   }
2519b91271bSJeremy L Thompson }
2529b91271bSJeremy L Thompson 
2539b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2549b91271bSJeremy L Thompson // 3D tensor contract z
2559b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2569b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2579b91271bSJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
2589b91271bSJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
259d6c19ee8SJeremy L Thompson   __syncthreads();
2609b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2619b91271bSJeremy L Thompson   __syncthreads();
2629b91271bSJeremy L Thompson   *V = 0.0;
2639b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
2649b91271bSJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
2659b91271bSJeremy L Thompson       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
2669b91271bSJeremy L Thompson     }
2679b91271bSJeremy L Thompson   }
2689b91271bSJeremy L Thompson }
2699b91271bSJeremy L Thompson 
2709b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2719b91271bSJeremy L Thompson // 3D tensor contract z
2729b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2739b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2749b91271bSJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
2759b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
276d6c19ee8SJeremy L Thompson   __syncthreads();
2779b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2789b91271bSJeremy L Thompson   __syncthreads();
2799b91271bSJeremy L Thompson   *V = 0.0;
2809b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
2819b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
2829b91271bSJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
2839b91271bSJeremy L Thompson     }
2849b91271bSJeremy L Thompson   }
2859b91271bSJeremy L Thompson }
2869b91271bSJeremy L Thompson 
2879b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2889b91271bSJeremy L Thompson // 3D transpose tensor contract z
2899b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2909b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2919b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
2929b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
293d6c19ee8SJeremy L Thompson   __syncthreads();
2949b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2959b91271bSJeremy L Thompson   __syncthreads();
2969b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
2979b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
2989b91271bSJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
2999b91271bSJeremy L Thompson     }
3009b91271bSJeremy L Thompson   }
3019b91271bSJeremy L Thompson }
3029b91271bSJeremy L Thompson 
3039b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3049b91271bSJeremy L Thompson // 3D transpose tensor contract y
3059b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3069b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3079b91271bSJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3089b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
309d6c19ee8SJeremy L Thompson   __syncthreads();
3109b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3119b91271bSJeremy L Thompson   __syncthreads();
3129b91271bSJeremy L Thompson   *V = 0.0;
3139b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
3149b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3159b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
3169b91271bSJeremy L Thompson     }
3179b91271bSJeremy L Thompson   }
3189b91271bSJeremy L Thompson }
3199b91271bSJeremy L Thompson 
3209b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3219b91271bSJeremy L Thompson // 3D transpose tensor contract y
3229b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3239b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3249b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
3259b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
326d6c19ee8SJeremy L Thompson   __syncthreads();
3279b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3289b91271bSJeremy L Thompson   __syncthreads();
3299b91271bSJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
3309b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3319b91271bSJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
3329b91271bSJeremy L Thompson     }
3339b91271bSJeremy L Thompson   }
3349b91271bSJeremy L Thompson }
3359b91271bSJeremy L Thompson 
3369b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3379b91271bSJeremy L Thompson // 3D transpose tensor contract x
3389b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3399b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3409b91271bSJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3419b91271bSJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
342d6c19ee8SJeremy L Thompson   __syncthreads();
3439b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3449b91271bSJeremy L Thompson   __syncthreads();
3459b91271bSJeremy L Thompson   *V = 0.0;
3469b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
3479b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3489b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
3499b91271bSJeremy L Thompson     }
3509b91271bSJeremy L Thompson   }
3519b91271bSJeremy L Thompson }
3529b91271bSJeremy L Thompson 
3539b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3549b91271bSJeremy L Thompson // 3D transpose tensor contract add x
3559b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3569b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3579b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
3589b91271bSJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
359d6c19ee8SJeremy L Thompson   __syncthreads();
3609b91271bSJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3619b91271bSJeremy L Thompson   __syncthreads();
3629b91271bSJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
3639b91271bSJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
3649b91271bSJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
3659b91271bSJeremy L Thompson     }
3669b91271bSJeremy L Thompson   }
3679b91271bSJeremy L Thompson }
3689b91271bSJeremy L Thompson 
3699b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3709b91271bSJeremy L Thompson // 3D pack/unpack quadrature values
3719b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3729b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
3739b91271bSJeremy L Thompson inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
3749b91271bSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
3759b91271bSJeremy L Thompson 
3769b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
377d6c19ee8SJeremy L Thompson     __syncthreads();
3789b91271bSJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
3799b91271bSJeremy L Thompson     __syncthreads();
3809b91271bSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
3819b91271bSJeremy L Thompson   }
3829b91271bSJeremy L Thompson }
3839b91271bSJeremy L Thompson 
3849b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
3859b91271bSJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
3869b91271bSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
3879b91271bSJeremy L Thompson 
3889b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
389d6c19ee8SJeremy L Thompson     __syncthreads();
3909b91271bSJeremy L Thompson     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
3919b91271bSJeremy L Thompson     __syncthreads();
3929b91271bSJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
3939b91271bSJeremy L Thompson   }
3949b91271bSJeremy L Thompson }
3959b91271bSJeremy L Thompson 
3969b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3979b91271bSJeremy L Thompson // 3D interpolate to quadrature points
3989b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3999b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4009b91271bSJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4019b91271bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
4029b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4039b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4049b91271bSJeremy L Thompson 
405ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4069b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4079b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4089b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4099b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
4109b91271bSJeremy L Thompson   }
411*3e2e790dSJeremy L Thompson   __syncthreads();
412ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
413ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
4149b91271bSJeremy L Thompson }
4159b91271bSJeremy L Thompson 
4169b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4179b91271bSJeremy L Thompson // 3D interpolate transpose
4189b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4199b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4209b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4219b91271bSJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
4229b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4239b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4249b91271bSJeremy L Thompson 
425ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4269b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4279b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4289b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4299b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
4309b91271bSJeremy L Thompson   }
431*3e2e790dSJeremy L Thompson   __syncthreads();
432ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
4339b91271bSJeremy L Thompson }
4349b91271bSJeremy L Thompson 
4359b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4369b91271bSJeremy L Thompson // 3D derivatives at quadrature points
4379b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4389b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4399b91271bSJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
4409b91271bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
4419b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4429b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4439b91271bSJeremy L Thompson 
444ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4459b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4469b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
4479b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4489b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
4499b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4509b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
4519b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
4529b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4539b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4549b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
4559b91271bSJeremy L Thompson   }
456*3e2e790dSJeremy L Thompson   __syncthreads();
457ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
458ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
4599b91271bSJeremy L Thompson }
4609b91271bSJeremy L Thompson 
4619b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4629b91271bSJeremy L Thompson // 3D derivatives transpose
4639b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4649b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4659b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4669b91271bSJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
4679b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4689b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4699b91271bSJeremy L Thompson 
470ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4719b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4729b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
4739b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
4749b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
4759b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
4769b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
4779b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
4789b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
4799b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
4809b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
4819b91271bSJeremy L Thompson   }
482*3e2e790dSJeremy L Thompson   __syncthreads();
483ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
4849b91271bSJeremy L Thompson }
4859b91271bSJeremy L Thompson 
4869b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4879b91271bSJeremy L Thompson // 3D derivatives at quadrature points
4889b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4899b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4909b91271bSJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4919b91271bSJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
4929b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4939b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
4949b91271bSJeremy L Thompson 
495ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4969b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4979b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4989b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4999b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
5009b91271bSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
5019b91271bSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
5029b91271bSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
5039b91271bSJeremy L Thompson   }
504*3e2e790dSJeremy L Thompson   __syncthreads();
505ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
506ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5079b91271bSJeremy L Thompson }
5089b91271bSJeremy L Thompson 
5099b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5109b91271bSJeremy L Thompson // 3D derivatives transpose
5119b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5129b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5139b91271bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5149b91271bSJeremy L Thompson                                                                 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
5159b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5169b91271bSJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
5179b91271bSJeremy L Thompson 
518ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5199b91271bSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5209b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
5219b91271bSJeremy L Thompson     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
5229b91271bSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
5239b91271bSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
5249b91271bSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
5259b91271bSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
5269b91271bSJeremy L Thompson   }
527*3e2e790dSJeremy L Thompson   __syncthreads();
528ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5299b91271bSJeremy L Thompson }
5309b91271bSJeremy L Thompson 
5319b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5329b91271bSJeremy L Thompson // 3D quadrature weights
5339b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5349b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
5359b91271bSJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
5369b91271bSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
5379b91271bSJeremy L Thompson 
5389b91271bSJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
5399b91271bSJeremy L Thompson }
540