xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h (revision ca595be6df907a4366bcc1f56f7a62068f97f05f)
15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for HIP shared memory tensor product basis templates
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
119e201c85SYohann 
129e201c85SYohann //------------------------------------------------------------------------------
139e201c85SYohann // 1D
149e201c85SYohann //------------------------------------------------------------------------------
159e201c85SYohann 
169e201c85SYohann //------------------------------------------------------------------------------
179e201c85SYohann // 1D tensor contraction x
189e201c85SYohann //------------------------------------------------------------------------------
199e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
209e201c85SYohann inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
219e201c85SYohann   data.slice[data.t_id_x] = *U;
229e201c85SYohann   __syncthreads();
239e201c85SYohann   *V = 0.0;
249e201c85SYohann   if (data.t_id_x < Q_1D) {
259e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
269e201c85SYohann       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
279e201c85SYohann     }
289e201c85SYohann   }
299e201c85SYohann   __syncthreads();
309e201c85SYohann }
319e201c85SYohann 
329e201c85SYohann //------------------------------------------------------------------------------
339e201c85SYohann // 1D transpose tensor contraction x
349e201c85SYohann //------------------------------------------------------------------------------
359e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
369e201c85SYohann inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
379e201c85SYohann   data.slice[data.t_id_x] = *U;
389e201c85SYohann   __syncthreads();
399e201c85SYohann   *V = 0.0;
409e201c85SYohann   if (data.t_id_x < P_1D) {
419e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
429e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
439e201c85SYohann     }
449e201c85SYohann   }
459e201c85SYohann   __syncthreads();
469e201c85SYohann }
479e201c85SYohann 
489e201c85SYohann //------------------------------------------------------------------------------
499e201c85SYohann // 1D interpolate to quadrature points
509e201c85SYohann //------------------------------------------------------------------------------
516b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
529e201c85SYohann inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
539e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54db2becc9SJeremy L Thompson     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
559e201c85SYohann   }
569e201c85SYohann }
579e201c85SYohann 
589e201c85SYohann //------------------------------------------------------------------------------
599e201c85SYohann // 1D interpolate transpose
609e201c85SYohann //------------------------------------------------------------------------------
616b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
622b730f8bSJeremy L Thompson inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
632b730f8bSJeremy L Thompson                                          CeedScalar *__restrict__ r_V) {
649e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65db2becc9SJeremy L Thompson     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
669e201c85SYohann   }
679e201c85SYohann }
689e201c85SYohann 
699e201c85SYohann //------------------------------------------------------------------------------
709e201c85SYohann // 1D derivatives at quadrature points
719e201c85SYohann //------------------------------------------------------------------------------
726b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
732b730f8bSJeremy L Thompson inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
742b730f8bSJeremy L Thompson                               CeedScalar *__restrict__ r_V) {
759e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76db2becc9SJeremy L Thompson     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
779e201c85SYohann   }
789e201c85SYohann }
799e201c85SYohann 
809e201c85SYohann //------------------------------------------------------------------------------
819e201c85SYohann // 1D derivatives transpose
829e201c85SYohann //------------------------------------------------------------------------------
836b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
842b730f8bSJeremy L Thompson inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
852b730f8bSJeremy L Thompson                                        CeedScalar *__restrict__ r_V) {
869e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87db2becc9SJeremy L Thompson     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
889e201c85SYohann   }
899e201c85SYohann }
909e201c85SYohann 
919e201c85SYohann //------------------------------------------------------------------------------
929e201c85SYohann // 1D quadrature weights
939e201c85SYohann //------------------------------------------------------------------------------
949e201c85SYohann template <int Q_1D>
959e201c85SYohann inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
969e201c85SYohann   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
979e201c85SYohann }
989e201c85SYohann 
999e201c85SYohann //------------------------------------------------------------------------------
1009e201c85SYohann // 2D
1019e201c85SYohann //------------------------------------------------------------------------------
1029e201c85SYohann 
1039e201c85SYohann //------------------------------------------------------------------------------
1049e201c85SYohann // 2D tensor contraction x
1059e201c85SYohann //------------------------------------------------------------------------------
1066b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
107343e3094SJeremy L Thompson inline __device__ void ContractX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
108343e3094SJeremy L Thompson                                    CeedScalar *V) {
109343e3094SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
1109e201c85SYohann   __syncthreads();
1119e201c85SYohann   *V = 0.0;
112343e3094SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
1139e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
114343e3094SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
1159e201c85SYohann     }
1169e201c85SYohann   }
1179e201c85SYohann   __syncthreads();
1189e201c85SYohann }
1199e201c85SYohann 
1209e201c85SYohann //------------------------------------------------------------------------------
1219e201c85SYohann // 2D tensor contract y
1229e201c85SYohann //------------------------------------------------------------------------------
1236b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
124343e3094SJeremy L Thompson inline __device__ void ContractY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
125343e3094SJeremy L Thompson                                    CeedScalar *V) {
126343e3094SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
127343e3094SJeremy L Thompson >>>>>>> b855402d (gpu - isolate core 2D tensor logic to allow flat version)
1289e201c85SYohann   __syncthreads();
1299e201c85SYohann   *V = 0.0;
130343e3094SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
1319e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
132343e3094SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
1339e201c85SYohann     }
1349e201c85SYohann   }
1359e201c85SYohann   __syncthreads();
1369e201c85SYohann }
1379e201c85SYohann 
1389e201c85SYohann //------------------------------------------------------------------------------
1399e201c85SYohann // 2D transpose tensor contract y
1409e201c85SYohann //------------------------------------------------------------------------------
1416b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
142343e3094SJeremy L Thompson inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
143343e3094SJeremy L Thompson                                             CeedScalar *V) {
144343e3094SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
1459e201c85SYohann   __syncthreads();
1469e201c85SYohann   *V = 0.0;
147343e3094SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
1489e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
149343e3094SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
1509e201c85SYohann     }
1519e201c85SYohann   }
1529e201c85SYohann   __syncthreads();
1539e201c85SYohann }
1549e201c85SYohann 
1559e201c85SYohann //------------------------------------------------------------------------------
1569e201c85SYohann // 2D transpose tensor contract x
1579e201c85SYohann //------------------------------------------------------------------------------
1586b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
159343e3094SJeremy L Thompson inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
160343e3094SJeremy L Thompson                                             CeedScalar *V) {
161343e3094SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
1629e201c85SYohann   __syncthreads();
1639e201c85SYohann   *V = 0.0;
164343e3094SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
1659e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
166343e3094SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
1679e201c85SYohann     }
1689e201c85SYohann   }
1699e201c85SYohann   __syncthreads();
1709e201c85SYohann }
1719e201c85SYohann 
1729e201c85SYohann //------------------------------------------------------------------------------
1739e201c85SYohann // 2D transpose tensor contract and add x
1749e201c85SYohann //------------------------------------------------------------------------------
1756b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
176343e3094SJeremy L Thompson inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
177343e3094SJeremy L Thompson                                                CeedScalar *V) {
178343e3094SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
1799e201c85SYohann   __syncthreads();
180343e3094SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
1819e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
182343e3094SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
1839e201c85SYohann     }
1849e201c85SYohann   }
1859e201c85SYohann   __syncthreads();
1869e201c85SYohann }
1879e201c85SYohann 
1889e201c85SYohann //------------------------------------------------------------------------------
1899e201c85SYohann // 2D interpolate to quadrature points
1909e201c85SYohann //------------------------------------------------------------------------------
1916b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
192343e3094SJeremy L Thompson inline __device__ void InterpTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
193343e3094SJeremy L Thompson                                            const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
1949e201c85SYohann   CeedScalar r_t[1];
1959e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
196343e3094SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
197343e3094SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
1989e201c85SYohann   }
1999e201c85SYohann }
2009e201c85SYohann 
201343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
202343e3094SJeremy L Thompson inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
203343e3094SJeremy L Thompson   InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
204343e3094SJeremy L Thompson }
205343e3094SJeremy L Thompson 
206*ca595be6SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D>
207*ca595be6SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
208*ca595be6SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
209*ca595be6SJeremy L Thompson   const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
210*ca595be6SJeremy L Thompson 
211*ca595be6SJeremy L Thompson   InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
212*ca595be6SJeremy L Thompson }
213*ca595be6SJeremy L Thompson 
2149e201c85SYohann //------------------------------------------------------------------------------
2159e201c85SYohann // 2D interpolate transpose
2169e201c85SYohann //------------------------------------------------------------------------------
2176b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
218343e3094SJeremy L Thompson inline __device__ void InterpTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
219343e3094SJeremy L Thompson                                                     const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
2209e201c85SYohann   CeedScalar r_t[1];
2219e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
222343e3094SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
223343e3094SJeremy L Thompson     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
2249e201c85SYohann   }
2259e201c85SYohann }
2269e201c85SYohann 
227343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
228343e3094SJeremy L Thompson inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
229343e3094SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
230343e3094SJeremy L Thompson   InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
231343e3094SJeremy L Thompson }
232343e3094SJeremy L Thompson 
233*ca595be6SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D>
234*ca595be6SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
235*ca595be6SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
236*ca595be6SJeremy L Thompson   const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
237*ca595be6SJeremy L Thompson 
238*ca595be6SJeremy L Thompson   InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
239*ca595be6SJeremy L Thompson }
240*ca595be6SJeremy L Thompson 
2419e201c85SYohann //------------------------------------------------------------------------------
2429e201c85SYohann // 2D derivatives at quadrature points
2439e201c85SYohann //------------------------------------------------------------------------------
2446b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
245343e3094SJeremy L Thompson inline __device__ void GradTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
246343e3094SJeremy L Thompson                                          const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
2479e201c85SYohann   CeedScalar r_t[1];
2489e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
249343e3094SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
250343e3094SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
251343e3094SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
252343e3094SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
2539e201c85SYohann   }
2549e201c85SYohann }
2559e201c85SYohann 
256343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
257343e3094SJeremy L Thompson inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
258343e3094SJeremy L Thompson                                     CeedScalar *__restrict__ r_V) {
259343e3094SJeremy L Thompson   GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
260343e3094SJeremy L Thompson }
261343e3094SJeremy L Thompson 
262*ca595be6SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D>
263*ca595be6SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
264*ca595be6SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
265*ca595be6SJeremy L Thompson   const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
266*ca595be6SJeremy L Thompson 
267*ca595be6SJeremy L Thompson   GradTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
268*ca595be6SJeremy L Thompson }
269*ca595be6SJeremy L Thompson 
2709e201c85SYohann //------------------------------------------------------------------------------
2719e201c85SYohann // 2D derivatives transpose
2729e201c85SYohann //------------------------------------------------------------------------------
2736b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
274343e3094SJeremy L Thompson inline __device__ void GradTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
275343e3094SJeremy L Thompson                                                   const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
2769e201c85SYohann   CeedScalar r_t[1];
2779e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
278343e3094SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
279343e3094SJeremy L Thompson     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
280343e3094SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
281343e3094SJeremy L Thompson     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
2829e201c85SYohann   }
2839e201c85SYohann }
2849e201c85SYohann 
285343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
286343e3094SJeremy L Thompson inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
287343e3094SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
288343e3094SJeremy L Thompson   GradTansposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
289343e3094SJeremy L Thompson }
290343e3094SJeremy L Thompson 
291*ca595be6SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D>
292*ca595be6SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
293*ca595be6SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
294*ca595be6SJeremy L Thompson   const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
295*ca595be6SJeremy L Thompson 
296*ca595be6SJeremy L Thompson   GradTansposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
297*ca595be6SJeremy L Thompson }
298*ca595be6SJeremy L Thompson 
2999e201c85SYohann //------------------------------------------------------------------------------
3009e201c85SYohann // 2D quadrature weights
3019e201c85SYohann //------------------------------------------------------------------------------
3029e201c85SYohann template <int Q_1D>
303*ca595be6SJeremy L Thompson inline __device__ void WeightTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
304*ca595be6SJeremy L Thompson                                            CeedScalar *w) {
305*ca595be6SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
306*ca595be6SJeremy L Thompson }
307*ca595be6SJeremy L Thompson 
308*ca595be6SJeremy L Thompson template <int P_1D, int Q_1D>
3099e201c85SYohann inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
310*ca595be6SJeremy L Thompson   WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
311*ca595be6SJeremy L Thompson }
312*ca595be6SJeremy L Thompson 
313*ca595be6SJeremy L Thompson template <int P_1D, int Q_1D>
314*ca595be6SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
315*ca595be6SJeremy L Thompson   const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
316*ca595be6SJeremy L Thompson 
317*ca595be6SJeremy L Thompson   WeightTensor2d_Core<Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, q_weight_1d, w);
3189e201c85SYohann }
3199e201c85SYohann 
3209e201c85SYohann //------------------------------------------------------------------------------
3219e201c85SYohann // 3D
3229e201c85SYohann //------------------------------------------------------------------------------
3239e201c85SYohann 
3249e201c85SYohann //------------------------------------------------------------------------------
3259e201c85SYohann // 3D tensor contract x
3269e201c85SYohann //------------------------------------------------------------------------------
3276b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3289e201c85SYohann inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3299e201c85SYohann   CeedScalar r_B[P_1D];
3309e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
3319e201c85SYohann     r_B[i] = B[i + data.t_id_x * P_1D];
3329e201c85SYohann   }
3339e201c85SYohann 
3349e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3359e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3369e201c85SYohann     __syncthreads();
3379e201c85SYohann     V[k] = 0.0;
3389e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
3399e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3409e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
3419e201c85SYohann       }
3429e201c85SYohann     }
3439e201c85SYohann     __syncthreads();
3449e201c85SYohann   }
3459e201c85SYohann }
3469e201c85SYohann 
3479e201c85SYohann //------------------------------------------------------------------------------
3489e201c85SYohann // 3D tensor contract y
3499e201c85SYohann //------------------------------------------------------------------------------
3506b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3519e201c85SYohann inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3529e201c85SYohann   CeedScalar r_B[P_1D];
3539e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
3549e201c85SYohann     r_B[i] = B[i + data.t_id_y * P_1D];
3559e201c85SYohann   }
3569e201c85SYohann 
3579e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3589e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3599e201c85SYohann     __syncthreads();
3609e201c85SYohann     V[k] = 0.0;
3619e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3629e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3639e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
3649e201c85SYohann       }
3659e201c85SYohann     }
3669e201c85SYohann     __syncthreads();
3679e201c85SYohann   }
3689e201c85SYohann }
3699e201c85SYohann 
3709e201c85SYohann //------------------------------------------------------------------------------
3719e201c85SYohann // 3D tensor contract z
3729e201c85SYohann //------------------------------------------------------------------------------
3736b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3749e201c85SYohann inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3759e201c85SYohann   for (CeedInt k = 0; k < Q_1D; k++) {
3769e201c85SYohann     V[k] = 0.0;
3779e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3789e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3799e201c85SYohann         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
3809e201c85SYohann       }
3819e201c85SYohann     }
3829e201c85SYohann   }
3839e201c85SYohann }
3849e201c85SYohann 
3859e201c85SYohann //------------------------------------------------------------------------------
3869e201c85SYohann // 3D transpose tensor contract z
3879e201c85SYohann //------------------------------------------------------------------------------
3886b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3899e201c85SYohann inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3909e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3919e201c85SYohann     V[k] = 0.0;
3929e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3939e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3949e201c85SYohann         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
3959e201c85SYohann       }
3969e201c85SYohann     }
3979e201c85SYohann   }
3989e201c85SYohann }
3999e201c85SYohann 
4009e201c85SYohann //------------------------------------------------------------------------------
4019e201c85SYohann // 3D transpose tensor contract y
4029e201c85SYohann //------------------------------------------------------------------------------
4036b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4049e201c85SYohann inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4059e201c85SYohann   CeedScalar r_B[Q_1D];
4069e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4079e201c85SYohann     r_B[i] = B[data.t_id_y + i * P_1D];
4089e201c85SYohann   }
4099e201c85SYohann 
4109e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
4119e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4129e201c85SYohann     __syncthreads();
4139e201c85SYohann     V[k] = 0.0;
4149e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
4159e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4169e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
4179e201c85SYohann       }
4189e201c85SYohann     }
4199e201c85SYohann     __syncthreads();
4209e201c85SYohann   }
4219e201c85SYohann }
4229e201c85SYohann 
4239e201c85SYohann //------------------------------------------------------------------------------
4249e201c85SYohann // 3D transpose tensor contract y
4259e201c85SYohann //------------------------------------------------------------------------------
4266b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4279e201c85SYohann inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4289e201c85SYohann   CeedScalar r_B[Q_1D];
4299e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4309e201c85SYohann     r_B[i] = B[data.t_id_y + i * P_1D];
4319e201c85SYohann   }
4329e201c85SYohann 
4339e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
4349e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4359e201c85SYohann     __syncthreads();
4369e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
4379e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4389e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
4399e201c85SYohann       }
4409e201c85SYohann     }
4419e201c85SYohann     __syncthreads();
4429e201c85SYohann   }
4439e201c85SYohann }
4449e201c85SYohann 
4459e201c85SYohann //------------------------------------------------------------------------------
4469e201c85SYohann // 3D transpose tensor contract x
4479e201c85SYohann //------------------------------------------------------------------------------
4486b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4499e201c85SYohann inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4509e201c85SYohann   CeedScalar r_B[Q_1D];
4519e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4529e201c85SYohann     r_B[i] = B[data.t_id_x + i * P_1D];
4539e201c85SYohann   }
4549e201c85SYohann 
4559e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
4569e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4579e201c85SYohann     __syncthreads();
4589e201c85SYohann     V[k] = 0.0;
4599e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4609e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4619e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
4629e201c85SYohann       }
4639e201c85SYohann     }
4649e201c85SYohann     __syncthreads();
4659e201c85SYohann   }
4669e201c85SYohann }
4679e201c85SYohann 
4689e201c85SYohann //------------------------------------------------------------------------------
4699e201c85SYohann // 3D transpose tensor contract add x
4709e201c85SYohann //------------------------------------------------------------------------------
4716b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4729e201c85SYohann inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4739e201c85SYohann   CeedScalar r_B[Q_1D];
4749e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4759e201c85SYohann     r_B[i] = B[data.t_id_x + i * P_1D];
4769e201c85SYohann   }
4779e201c85SYohann 
4789e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
4799e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4809e201c85SYohann     __syncthreads();
4819e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4829e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4839e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
4849e201c85SYohann       }
4859e201c85SYohann     }
4869e201c85SYohann     __syncthreads();
4879e201c85SYohann   }
4889e201c85SYohann }
4899e201c85SYohann 
4909e201c85SYohann //------------------------------------------------------------------------------
4919e201c85SYohann // 3D interpolate to quadrature points
4929e201c85SYohann //------------------------------------------------------------------------------
4936b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4949e201c85SYohann inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
4959e201c85SYohann   CeedScalar r_t1[T_1D];
4969e201c85SYohann   CeedScalar r_t2[T_1D];
4979e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4986b92dc4bSJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
4996b92dc4bSJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5006b92dc4bSJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
5019e201c85SYohann   }
5029e201c85SYohann }
5039e201c85SYohann 
5049e201c85SYohann //------------------------------------------------------------------------------
5059e201c85SYohann // 3D interpolate transpose
5069e201c85SYohann //------------------------------------------------------------------------------
5076b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5082b730f8bSJeremy L Thompson inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5092b730f8bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
5109e201c85SYohann   CeedScalar r_t1[T_1D];
5119e201c85SYohann   CeedScalar r_t2[T_1D];
5129e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5136b92dc4bSJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
5146b92dc4bSJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5156b92dc4bSJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5169e201c85SYohann   }
5179e201c85SYohann }
5189e201c85SYohann 
5199e201c85SYohann //------------------------------------------------------------------------------
5209e201c85SYohann // 3D derivatives at quadrature points
5219e201c85SYohann //------------------------------------------------------------------------------
5226b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5232b730f8bSJeremy L Thompson inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5242b730f8bSJeremy L Thompson                                     CeedScalar *__restrict__ r_V) {
5259e201c85SYohann   CeedScalar r_t1[T_1D];
5269e201c85SYohann   CeedScalar r_t2[T_1D];
5279e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5286b92dc4bSJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
5296b92dc4bSJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5306b92dc4bSJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
5316b92dc4bSJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
5326b92dc4bSJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
5336b92dc4bSJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
5346b92dc4bSJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
5356b92dc4bSJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5366b92dc4bSJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
5379e201c85SYohann   }
5389e201c85SYohann }
5399e201c85SYohann 
5409e201c85SYohann //------------------------------------------------------------------------------
5419e201c85SYohann // 3D derivatives transpose
5429e201c85SYohann //------------------------------------------------------------------------------
5436b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5442b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5452b730f8bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
5469e201c85SYohann   CeedScalar r_t1[T_1D];
5479e201c85SYohann   CeedScalar r_t2[T_1D];
5489e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5496b92dc4bSJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
5506b92dc4bSJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5516b92dc4bSJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
5526b92dc4bSJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
5536b92dc4bSJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
5546b92dc4bSJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5556b92dc4bSJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
5566b92dc4bSJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5576b92dc4bSJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5589e201c85SYohann   }
5599e201c85SYohann }
5609e201c85SYohann 
5619e201c85SYohann //------------------------------------------------------------------------------
5629e201c85SYohann // 3D derivatives at quadrature points
5639e201c85SYohann //------------------------------------------------------------------------------
5646b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5652b730f8bSJeremy L Thompson inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5662b730f8bSJeremy L Thompson                                               CeedScalar *__restrict__ r_V) {
5679e201c85SYohann   CeedScalar r_t1[T_1D];
5689e201c85SYohann   CeedScalar r_t2[T_1D];
5699e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5706b92dc4bSJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
5716b92dc4bSJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5726b92dc4bSJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
5736b92dc4bSJeremy L Thompson     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
5746b92dc4bSJeremy L Thompson     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
5756b92dc4bSJeremy L Thompson     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
5769e201c85SYohann   }
5779e201c85SYohann }
5789e201c85SYohann 
5799e201c85SYohann //------------------------------------------------------------------------------
5809e201c85SYohann // 3D derivatives transpose
5819e201c85SYohann //------------------------------------------------------------------------------
5826b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5832b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5842b730f8bSJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
5859e201c85SYohann   CeedScalar r_t1[T_1D];
5869e201c85SYohann   CeedScalar r_t2[T_1D];
5879e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5886b92dc4bSJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
5896b92dc4bSJeremy L Thompson     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
5906b92dc4bSJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
5916b92dc4bSJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
5926b92dc4bSJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5936b92dc4bSJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5949e201c85SYohann   }
5959e201c85SYohann }
5969e201c85SYohann 
5979e201c85SYohann //------------------------------------------------------------------------------
5989e201c85SYohann // 3D quadrature weights
5999e201c85SYohann //------------------------------------------------------------------------------
6009e201c85SYohann template <int Q_1D>
6019e201c85SYohann inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
6029e201c85SYohann   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
6039e201c85SYohann   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
6049e201c85SYohann   for (CeedInt q = 0; q < Q_1D; q++) {
6059e201c85SYohann     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
6069e201c85SYohann   }
6079e201c85SYohann }
608