xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision 9ba83ac0e4b1fca39d6fa6737a318a9f0cbc172d)
1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for CUDA shared memory tensor product basis templates
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
119e201c85SYohann 
129e201c85SYohann //------------------------------------------------------------------------------
139e201c85SYohann // 1D
149e201c85SYohann //------------------------------------------------------------------------------
159e201c85SYohann 
169e201c85SYohann //------------------------------------------------------------------------------
179e201c85SYohann // 1D tensor contraction x
189e201c85SYohann //------------------------------------------------------------------------------
199e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
209e201c85SYohann inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21d6c19ee8SJeremy L Thompson   __syncthreads();
229e201c85SYohann   data.slice[data.t_id_x] = *U;
239e201c85SYohann   __syncthreads();
249e201c85SYohann   *V = 0.0;
259e201c85SYohann   if (data.t_id_x < Q_1D) {
269e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
279e201c85SYohann       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
289e201c85SYohann     }
299e201c85SYohann   }
309e201c85SYohann }
319e201c85SYohann 
329e201c85SYohann //------------------------------------------------------------------------------
339e201c85SYohann // 1D transpose tensor contraction x
349e201c85SYohann //------------------------------------------------------------------------------
359e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
369e201c85SYohann inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37d6c19ee8SJeremy L Thompson   __syncthreads();
389e201c85SYohann   data.slice[data.t_id_x] = *U;
399e201c85SYohann   __syncthreads();
409e201c85SYohann   *V = 0.0;
419e201c85SYohann   if (data.t_id_x < P_1D) {
429e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
439e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
449e201c85SYohann     }
459e201c85SYohann   }
469e201c85SYohann }
479e201c85SYohann 
489e201c85SYohann //------------------------------------------------------------------------------
499e201c85SYohann // 1D interpolate to quadrature points
509e201c85SYohann //------------------------------------------------------------------------------
5199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
529e201c85SYohann inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
539e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54db2becc9SJeremy L Thompson     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
559e201c85SYohann   }
569e201c85SYohann }
579e201c85SYohann 
589e201c85SYohann //------------------------------------------------------------------------------
599e201c85SYohann // 1D interpolate transpose
609e201c85SYohann //------------------------------------------------------------------------------
6199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
622b730f8bSJeremy L Thompson inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
632b730f8bSJeremy L Thompson                                          CeedScalar *__restrict__ r_V) {
649e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65db2becc9SJeremy L Thompson     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
669e201c85SYohann   }
679e201c85SYohann }
689e201c85SYohann 
699e201c85SYohann //------------------------------------------------------------------------------
700ccda8ebSJeremy L Thompson // 1D interpolate to quadrature points, nodes and quadrature points collocated
710ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
720ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
730ccda8ebSJeremy L Thompson inline __device__ void InterpCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
740ccda8ebSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
750ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
760ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
770ccda8ebSJeremy L Thompson   }
780ccda8ebSJeremy L Thompson }
790ccda8ebSJeremy L Thompson 
800ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
810ccda8ebSJeremy L Thompson // 1D interpolate transpose, nodes and quadrature points collocated
820ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
830ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
840ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
850ccda8ebSJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
860ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
870ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
880ccda8ebSJeremy L Thompson   }
890ccda8ebSJeremy L Thompson }
900ccda8ebSJeremy L Thompson 
910ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
929e201c85SYohann // 1D derivatives at quadrature points
939e201c85SYohann //------------------------------------------------------------------------------
9499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
952b730f8bSJeremy L Thompson inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
962b730f8bSJeremy L Thompson                               CeedScalar *__restrict__ r_V) {
979e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
98db2becc9SJeremy L Thompson     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
999e201c85SYohann   }
1009e201c85SYohann }
1019e201c85SYohann 
1029e201c85SYohann //------------------------------------------------------------------------------
1039e201c85SYohann // 1D derivatives transpose
1049e201c85SYohann //------------------------------------------------------------------------------
10599421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1062b730f8bSJeremy L Thompson inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
1072b730f8bSJeremy L Thompson                                        CeedScalar *__restrict__ r_V) {
1089e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
109db2becc9SJeremy L Thompson     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
1109e201c85SYohann   }
1119e201c85SYohann }
1129e201c85SYohann 
1139e201c85SYohann //------------------------------------------------------------------------------
1149e201c85SYohann // 1D quadrature weights
1159e201c85SYohann //------------------------------------------------------------------------------
116343e3094SJeremy L Thompson template <int P_1D, int Q_1D>
1179e201c85SYohann inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
1189e201c85SYohann   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
1199e201c85SYohann }
1209e201c85SYohann 
1219e201c85SYohann //------------------------------------------------------------------------------
1229e201c85SYohann // 2D
1239e201c85SYohann //------------------------------------------------------------------------------
1249e201c85SYohann 
1259e201c85SYohann //------------------------------------------------------------------------------
1269e201c85SYohann // 2D tensor contraction x
1279e201c85SYohann //------------------------------------------------------------------------------
12899421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1299e201c85SYohann inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
130d6c19ee8SJeremy L Thompson   __syncthreads();
1319e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1329e201c85SYohann   __syncthreads();
1339e201c85SYohann   *V = 0.0;
1349e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
1359e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
1369e201c85SYohann       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
1379e201c85SYohann     }
1389e201c85SYohann   }
1399e201c85SYohann }
1409e201c85SYohann 
1419e201c85SYohann //------------------------------------------------------------------------------
1429e201c85SYohann // 2D tensor contract y
1439e201c85SYohann //------------------------------------------------------------------------------
14499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1459e201c85SYohann inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
146d6c19ee8SJeremy L Thompson   __syncthreads();
1479e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1489e201c85SYohann   __syncthreads();
1499e201c85SYohann   *V = 0.0;
1509e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
1519e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
1529e201c85SYohann       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
1539e201c85SYohann     }
1549e201c85SYohann   }
1559e201c85SYohann }
1569e201c85SYohann 
1579e201c85SYohann //------------------------------------------------------------------------------
1589e201c85SYohann // 2D transpose tensor contract y
1599e201c85SYohann //------------------------------------------------------------------------------
16099421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1619e201c85SYohann inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
162d6c19ee8SJeremy L Thompson   __syncthreads();
1639e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1649e201c85SYohann   __syncthreads();
1659e201c85SYohann   *V = 0.0;
1669e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
1679e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
1689e201c85SYohann       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
1699e201c85SYohann     }
1709e201c85SYohann   }
1719e201c85SYohann }
1729e201c85SYohann 
1739e201c85SYohann //------------------------------------------------------------------------------
1749e201c85SYohann // 2D transpose tensor contract x
1759e201c85SYohann //------------------------------------------------------------------------------
17699421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1779e201c85SYohann inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
178d6c19ee8SJeremy L Thompson   __syncthreads();
1799e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1809e201c85SYohann   __syncthreads();
1819e201c85SYohann   *V = 0.0;
1829e201c85SYohann   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
1839e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
1849e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
1859e201c85SYohann     }
1869e201c85SYohann   }
1879e201c85SYohann }
1889e201c85SYohann 
1899e201c85SYohann //------------------------------------------------------------------------------
1909e201c85SYohann // 2D transpose tensor contract and add x
1919e201c85SYohann //------------------------------------------------------------------------------
19299421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1939e201c85SYohann inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
194d6c19ee8SJeremy L Thompson   __syncthreads();
1959e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1969e201c85SYohann   __syncthreads();
1979e201c85SYohann   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
1989e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
1999e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
2009e201c85SYohann     }
2019e201c85SYohann   }
2029e201c85SYohann }
2039e201c85SYohann 
2049e201c85SYohann //------------------------------------------------------------------------------
2059e201c85SYohann // 2D interpolate to quadrature points
2069e201c85SYohann //------------------------------------------------------------------------------
20799421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2082b730f8bSJeremy L Thompson inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2092b730f8bSJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
2109e201c85SYohann   CeedScalar r_t[1];
2119e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
21299421279SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
21399421279SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
2149e201c85SYohann   }
2159e201c85SYohann }
2169e201c85SYohann 
2179e201c85SYohann //------------------------------------------------------------------------------
2189e201c85SYohann // 2D interpolate transpose
2199e201c85SYohann //------------------------------------------------------------------------------
22099421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2212b730f8bSJeremy L Thompson inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2222b730f8bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
2239e201c85SYohann   CeedScalar r_t[1];
2249e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
22599421279SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
22699421279SJeremy L Thompson     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
2279e201c85SYohann   }
2289e201c85SYohann }
2299e201c85SYohann 
2309e201c85SYohann //------------------------------------------------------------------------------
2310ccda8ebSJeremy L Thompson // 2D interpolate to quadrature points, nodes and quadrature points collocated
2320ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2330ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2340ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2350ccda8ebSJeremy L Thompson                                                      CeedScalar *__restrict__ r_V) {
2360ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2370ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
2380ccda8ebSJeremy L Thompson   }
2390ccda8ebSJeremy L Thompson }
2400ccda8ebSJeremy L Thompson 
2410ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2420ccda8ebSJeremy L Thompson // 2D interpolate transpose, nodes and quadrature points collocated
2430ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2440ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2450ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2460ccda8ebSJeremy L Thompson                                                               CeedScalar *__restrict__ r_V) {
2470ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2480ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
2490ccda8ebSJeremy L Thompson   }
2500ccda8ebSJeremy L Thompson }
2510ccda8ebSJeremy L Thompson 
2520ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2539e201c85SYohann // 2D derivatives at quadrature points
2549e201c85SYohann //------------------------------------------------------------------------------
25599421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2562b730f8bSJeremy L Thompson inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
2572b730f8bSJeremy L Thompson                                     CeedScalar *__restrict__ r_V) {
2589e201c85SYohann   CeedScalar r_t[1];
2599e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
26099421279SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
26199421279SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
26299421279SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
26399421279SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
2649e201c85SYohann   }
2659e201c85SYohann }
2669e201c85SYohann 
2679e201c85SYohann //------------------------------------------------------------------------------
2689e201c85SYohann // 2D derivatives transpose
2699e201c85SYohann //------------------------------------------------------------------------------
27099421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2712b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
2722b730f8bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
2739e201c85SYohann   CeedScalar r_t[1];
2749e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
27599421279SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
27699421279SJeremy L Thompson     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
27799421279SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
27899421279SJeremy L Thompson     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
2799e201c85SYohann   }
2809e201c85SYohann }
2819e201c85SYohann 
2829e201c85SYohann //------------------------------------------------------------------------------
28321292910SJeremy L Thompson // 2D derivatives at quadrature points, nodes and quadrature points collocated
28421292910SJeremy L Thompson //------------------------------------------------------------------------------
28521292910SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2860ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2870ccda8ebSJeremy L Thompson                                                    const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
28821292910SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
28921292910SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
29021292910SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
29121292910SJeremy L Thompson   }
29221292910SJeremy L Thompson }
29321292910SJeremy L Thompson 
29421292910SJeremy L Thompson //------------------------------------------------------------------------------
29521292910SJeremy L Thompson // 2D derivatives transpose, nodes and quadrature points collocated
29621292910SJeremy L Thompson //------------------------------------------------------------------------------
29721292910SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2980ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2990ccda8ebSJeremy L Thompson                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
30021292910SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
30121292910SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
30221292910SJeremy L Thompson     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
30321292910SJeremy L Thompson   }
30421292910SJeremy L Thompson }
30521292910SJeremy L Thompson 
30621292910SJeremy L Thompson //------------------------------------------------------------------------------
3079e201c85SYohann // 2D quadrature weights
3089e201c85SYohann //------------------------------------------------------------------------------
309343e3094SJeremy L Thompson template <int P_1D, int Q_1D>
3109e201c85SYohann inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
3112b730f8bSJeremy L Thompson   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
3129e201c85SYohann }
3139e201c85SYohann 
3149e201c85SYohann //------------------------------------------------------------------------------
3159e201c85SYohann // 3D
3169e201c85SYohann //------------------------------------------------------------------------------
3179e201c85SYohann 
3189e201c85SYohann //------------------------------------------------------------------------------
3199e201c85SYohann // 3D tensor contract x
3209e201c85SYohann //------------------------------------------------------------------------------
32199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3229e201c85SYohann inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3239e201c85SYohann   CeedScalar r_B[P_1D];
3249e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
3259e201c85SYohann     r_B[i] = B[i + data.t_id_x * P_1D];
3269e201c85SYohann   }
3279e201c85SYohann 
3289e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
329d6c19ee8SJeremy L Thompson     __syncthreads();
3309e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3319e201c85SYohann     __syncthreads();
3329e201c85SYohann     V[k] = 0.0;
3339e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
3349e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3359e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
3369e201c85SYohann       }
3379e201c85SYohann     }
3389e201c85SYohann   }
3399e201c85SYohann }
3409e201c85SYohann 
3419e201c85SYohann //------------------------------------------------------------------------------
3429e201c85SYohann // 3D tensor contract y
3439e201c85SYohann //------------------------------------------------------------------------------
34499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3459e201c85SYohann inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3469e201c85SYohann   CeedScalar r_B[P_1D];
3479e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
3489e201c85SYohann     r_B[i] = B[i + data.t_id_y * P_1D];
3499e201c85SYohann   }
3509e201c85SYohann 
3519e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
352d6c19ee8SJeremy L Thompson     __syncthreads();
3539e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3549e201c85SYohann     __syncthreads();
3559e201c85SYohann     V[k] = 0.0;
3569e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3579e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3589e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
3599e201c85SYohann       }
3609e201c85SYohann     }
3619e201c85SYohann   }
3629e201c85SYohann }
3639e201c85SYohann 
3649e201c85SYohann //------------------------------------------------------------------------------
3659e201c85SYohann // 3D tensor contract z
3669e201c85SYohann //------------------------------------------------------------------------------
36799421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3689e201c85SYohann inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3699e201c85SYohann   for (CeedInt k = 0; k < Q_1D; k++) {
3709e201c85SYohann     V[k] = 0.0;
3719e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3729e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3739e201c85SYohann         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
3749e201c85SYohann       }
3759e201c85SYohann     }
3769e201c85SYohann   }
3779e201c85SYohann }
3789e201c85SYohann 
3799e201c85SYohann //------------------------------------------------------------------------------
3809e201c85SYohann // 3D transpose tensor contract z
3819e201c85SYohann //------------------------------------------------------------------------------
38299421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3839e201c85SYohann inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3849e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3859e201c85SYohann     V[k] = 0.0;
3869e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3879e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3889e201c85SYohann         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
3899e201c85SYohann       }
3909e201c85SYohann     }
3919e201c85SYohann   }
3929e201c85SYohann }
3939e201c85SYohann 
3949e201c85SYohann //------------------------------------------------------------------------------
3959e201c85SYohann // 3D transpose tensor contract y
3969e201c85SYohann //------------------------------------------------------------------------------
39799421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3989e201c85SYohann inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3999e201c85SYohann   CeedScalar r_B[Q_1D];
4009e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4019e201c85SYohann     r_B[i] = B[data.t_id_y + i * P_1D];
4029e201c85SYohann   }
4039e201c85SYohann 
4049e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
405d6c19ee8SJeremy L Thompson     __syncthreads();
4069e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4079e201c85SYohann     __syncthreads();
4089e201c85SYohann     V[k] = 0.0;
4099e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
4109e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4119e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
4129e201c85SYohann       }
4139e201c85SYohann     }
4149e201c85SYohann   }
4159e201c85SYohann }
4169e201c85SYohann 
4179e201c85SYohann //------------------------------------------------------------------------------
4189e201c85SYohann // 3D transpose tensor contract y
4199e201c85SYohann //------------------------------------------------------------------------------
42099421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4219e201c85SYohann inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4229e201c85SYohann   CeedScalar r_B[Q_1D];
4239e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4249e201c85SYohann     r_B[i] = B[data.t_id_y + i * P_1D];
4259e201c85SYohann   }
4269e201c85SYohann 
4279e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
428d6c19ee8SJeremy L Thompson     __syncthreads();
4299e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4309e201c85SYohann     __syncthreads();
4319e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
4329e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4339e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
4349e201c85SYohann       }
4359e201c85SYohann     }
4369e201c85SYohann   }
4379e201c85SYohann }
4389e201c85SYohann 
4399e201c85SYohann //------------------------------------------------------------------------------
4409e201c85SYohann // 3D transpose tensor contract x
4419e201c85SYohann //------------------------------------------------------------------------------
44299421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4439e201c85SYohann inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4449e201c85SYohann   CeedScalar r_B[Q_1D];
4459e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4469e201c85SYohann     r_B[i] = B[data.t_id_x + i * P_1D];
4479e201c85SYohann   }
4489e201c85SYohann 
4499e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
450d6c19ee8SJeremy L Thompson     __syncthreads();
4519e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4529e201c85SYohann     __syncthreads();
4539e201c85SYohann     V[k] = 0.0;
4549e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4559e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4569e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
4579e201c85SYohann       }
4589e201c85SYohann     }
4599e201c85SYohann   }
4609e201c85SYohann }
4619e201c85SYohann 
4629e201c85SYohann //------------------------------------------------------------------------------
4639e201c85SYohann // 3D transpose tensor contract add x
4649e201c85SYohann //------------------------------------------------------------------------------
46599421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4669e201c85SYohann inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4679e201c85SYohann   CeedScalar r_B[Q_1D];
4689e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4699e201c85SYohann     r_B[i] = B[data.t_id_x + i * P_1D];
4709e201c85SYohann   }
4719e201c85SYohann 
4729e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
473d6c19ee8SJeremy L Thompson     __syncthreads();
4749e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4759e201c85SYohann     __syncthreads();
4769e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4779e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4789e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
4799e201c85SYohann       }
4809e201c85SYohann     }
4819e201c85SYohann   }
4829e201c85SYohann }
4839e201c85SYohann 
4849e201c85SYohann //------------------------------------------------------------------------------
4859e201c85SYohann // 3D interpolate to quadrature points
4869e201c85SYohann //------------------------------------------------------------------------------
48799421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4882b730f8bSJeremy L Thompson inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4892b730f8bSJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
4909e201c85SYohann   CeedScalar r_t1[T_1D];
4919e201c85SYohann   CeedScalar r_t2[T_1D];
4929e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
49399421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
49499421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
49599421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
4969e201c85SYohann   }
4979e201c85SYohann }
4989e201c85SYohann 
4999e201c85SYohann //------------------------------------------------------------------------------
5009e201c85SYohann // 3D interpolate transpose
5019e201c85SYohann //------------------------------------------------------------------------------
50299421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5032b730f8bSJeremy L Thompson inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5042b730f8bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
5059e201c85SYohann   CeedScalar r_t1[T_1D];
5069e201c85SYohann   CeedScalar r_t2[T_1D];
5079e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
50899421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
50999421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
51099421279SJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5119e201c85SYohann   }
5129e201c85SYohann }
5139e201c85SYohann 
5149e201c85SYohann //------------------------------------------------------------------------------
5150ccda8ebSJeremy L Thompson // 3D interpolate to quadrature points, nodes and quadrature points collocated
5160ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5170ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5180ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5190ccda8ebSJeremy L Thompson                                                      CeedScalar *__restrict__ r_V) {
5200ccda8ebSJeremy L Thompson   for (CeedInt i = 0; i < Q_1D; i++) {
5210ccda8ebSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5220ccda8ebSJeremy L Thompson       r_V[i + comp * Q_1D] = r_U[i + comp * P_1D];
5230ccda8ebSJeremy L Thompson     }
5240ccda8ebSJeremy L Thompson   }
5250ccda8ebSJeremy L Thompson }
5260ccda8ebSJeremy L Thompson 
5270ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5280ccda8ebSJeremy L Thompson // 3D interpolate transpose, nodes and quadrature points collocated
5290ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5300ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5310ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5320ccda8ebSJeremy L Thompson                                                               CeedScalar *__restrict__ r_V) {
5330ccda8ebSJeremy L Thompson   for (CeedInt i = 0; i < Q_1D; i++) {
5340ccda8ebSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5350ccda8ebSJeremy L Thompson       r_V[i + comp * P_1D] = r_U[i + comp * Q_1D];
5360ccda8ebSJeremy L Thompson     }
5370ccda8ebSJeremy L Thompson   }
5380ccda8ebSJeremy L Thompson }
5390ccda8ebSJeremy L Thompson 
5400ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5419e201c85SYohann // 3D derivatives at quadrature points
5429e201c85SYohann //------------------------------------------------------------------------------
54399421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5442b730f8bSJeremy L Thompson inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5452b730f8bSJeremy L Thompson                                     CeedScalar *__restrict__ r_V) {
5469e201c85SYohann   CeedScalar r_t1[T_1D];
5479e201c85SYohann   CeedScalar r_t2[T_1D];
5489e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54999421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
55099421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
55199421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
55299421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
55399421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
55499421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
55599421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
55699421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
55799421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
5589e201c85SYohann   }
5599e201c85SYohann }
5609e201c85SYohann 
5619e201c85SYohann //------------------------------------------------------------------------------
5629e201c85SYohann // 3D derivatives transpose
5639e201c85SYohann //------------------------------------------------------------------------------
56499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5652b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5662b730f8bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
5679e201c85SYohann   CeedScalar r_t1[T_1D];
5689e201c85SYohann   CeedScalar r_t2[T_1D];
5699e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
57099421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
57199421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
57299421279SJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
57399421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
57499421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
57599421279SJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
57699421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
57799421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
57899421279SJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5799e201c85SYohann   }
5809e201c85SYohann }
5819e201c85SYohann 
5829e201c85SYohann //------------------------------------------------------------------------------
5839e201c85SYohann // 3D derivatives at quadrature points
5849e201c85SYohann //------------------------------------------------------------------------------
58599421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5862b730f8bSJeremy L Thompson inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5872b730f8bSJeremy L Thompson                                               CeedScalar *__restrict__ r_V) {
5889e201c85SYohann   CeedScalar r_t1[T_1D];
5899e201c85SYohann   CeedScalar r_t2[T_1D];
5909e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
59199421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
59299421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
59399421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
59499421279SJeremy L Thompson     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
59599421279SJeremy L Thompson     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
59699421279SJeremy L Thompson     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
5979e201c85SYohann   }
5989e201c85SYohann }
5999e201c85SYohann 
6009e201c85SYohann //------------------------------------------------------------------------------
6019e201c85SYohann // 3D derivatives transpose
6029e201c85SYohann //------------------------------------------------------------------------------
60399421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
6042b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6052b730f8bSJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6069e201c85SYohann   CeedScalar r_t1[T_1D];
6079e201c85SYohann   CeedScalar r_t2[T_1D];
6089e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
60999421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
61099421279SJeremy L Thompson     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
61199421279SJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
61299421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
61399421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
61499421279SJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
6159e201c85SYohann   }
6169e201c85SYohann }
6179e201c85SYohann 
6189e201c85SYohann //------------------------------------------------------------------------------
61921292910SJeremy L Thompson // 3D derivatives at quadrature points, nodes and quadrature points collocated
62021292910SJeremy L Thompson //------------------------------------------------------------------------------
62121292910SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
6220ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6230ccda8ebSJeremy L Thompson                                                    const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
62421292910SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
62521292910SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
62621292910SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
62721292910SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
62821292910SJeremy L Thompson   }
62921292910SJeremy L Thompson }
63021292910SJeremy L Thompson 
63121292910SJeremy L Thompson //------------------------------------------------------------------------------
63221292910SJeremy L Thompson // 3D derivatives transpose, nodes and quadrature points collocated
63321292910SJeremy L Thompson //------------------------------------------------------------------------------
63421292910SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
6350ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6360ccda8ebSJeremy L Thompson                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
63721292910SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
63821292910SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
63921292910SJeremy L Thompson     ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
64021292910SJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
64121292910SJeremy L Thompson   }
64221292910SJeremy L Thompson }
64321292910SJeremy L Thompson 
64421292910SJeremy L Thompson //------------------------------------------------------------------------------
6459e201c85SYohann // 3D quadrature weights
6469e201c85SYohann //------------------------------------------------------------------------------
647343e3094SJeremy L Thompson template <int P_1D, int Q_1D>
6489e201c85SYohann inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
6499e201c85SYohann   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
6509e201c85SYohann   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
6519e201c85SYohann   for (CeedInt q = 0; q < Q_1D; q++) {
6529e201c85SYohann     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
6539e201c85SYohann   }
6549e201c85SYohann }
655