1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED: http://github.com/ceed
79e201c85SYohann
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for HIP shared memory tensor product basis templates
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
119e201c85SYohann
129e201c85SYohann //------------------------------------------------------------------------------
139e201c85SYohann // 1D
149e201c85SYohann //------------------------------------------------------------------------------
159e201c85SYohann
169e201c85SYohann //------------------------------------------------------------------------------
179e201c85SYohann // 1D tensor contraction x
189e201c85SYohann //------------------------------------------------------------------------------
199e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
ContractX1d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)209e201c85SYohann inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21d6c19ee8SJeremy L Thompson __syncthreads();
229e201c85SYohann data.slice[data.t_id_x] = *U;
239e201c85SYohann __syncthreads();
249e201c85SYohann *V = 0.0;
259e201c85SYohann if (data.t_id_x < Q_1D) {
269e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) {
279e201c85SYohann *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction
289e201c85SYohann }
299e201c85SYohann }
309e201c85SYohann }
319e201c85SYohann
329e201c85SYohann //------------------------------------------------------------------------------
339e201c85SYohann // 1D transpose tensor contraction x
349e201c85SYohann //------------------------------------------------------------------------------
359e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
ContractTransposeX1d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)369e201c85SYohann inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37d6c19ee8SJeremy L Thompson __syncthreads();
389e201c85SYohann data.slice[data.t_id_x] = *U;
399e201c85SYohann __syncthreads();
409e201c85SYohann *V = 0.0;
419e201c85SYohann if (data.t_id_x < P_1D) {
429e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
439e201c85SYohann *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction
449e201c85SYohann }
459e201c85SYohann }
469e201c85SYohann }
479e201c85SYohann
489e201c85SYohann //------------------------------------------------------------------------------
499e201c85SYohann // 1D interpolate to quadrature points
509e201c85SYohann //------------------------------------------------------------------------------
516b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
Interp1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)529e201c85SYohann inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
539e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54db2becc9SJeremy L Thompson ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
559e201c85SYohann }
569e201c85SYohann }
579e201c85SYohann
589e201c85SYohann //------------------------------------------------------------------------------
599e201c85SYohann // 1D interpolate transpose
609e201c85SYohann //------------------------------------------------------------------------------
616b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTranspose1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)622b730f8bSJeremy L Thompson inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
632b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
649e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65db2becc9SJeremy L Thompson ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
669e201c85SYohann }
679e201c85SYohann }
689e201c85SYohann
699e201c85SYohann //------------------------------------------------------------------------------
700ccda8ebSJeremy L Thompson // 1D interpolate to quadrature points, nodes and quadrature points collocated
710ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
720ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpCollocatedNodes1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)730ccda8ebSJeremy L Thompson inline __device__ void InterpCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
740ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) {
750ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
760ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp];
770ccda8ebSJeremy L Thompson }
780ccda8ebSJeremy L Thompson }
790ccda8ebSJeremy L Thompson
800ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
810ccda8ebSJeremy L Thompson // 1D interpolate transpose, nodes and quadrature points collocated
820ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
830ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeCollocatedNodes1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)840ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
850ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) {
860ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
870ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp];
880ccda8ebSJeremy L Thompson }
890ccda8ebSJeremy L Thompson }
900ccda8ebSJeremy L Thompson
910ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
929e201c85SYohann // 1D derivatives at quadrature points
939e201c85SYohann //------------------------------------------------------------------------------
946b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
Grad1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)952b730f8bSJeremy L Thompson inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
962b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
979e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
98db2becc9SJeremy L Thompson ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
999e201c85SYohann }
1009e201c85SYohann }
1019e201c85SYohann
1029e201c85SYohann //------------------------------------------------------------------------------
1039e201c85SYohann // 1D derivatives transpose
1049e201c85SYohann //------------------------------------------------------------------------------
1056b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTranspose1d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)1062b730f8bSJeremy L Thompson inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
1072b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
1089e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
109db2becc9SJeremy L Thompson ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
1109e201c85SYohann }
1119e201c85SYohann }
1129e201c85SYohann
1139e201c85SYohann //------------------------------------------------------------------------------
1149e201c85SYohann // 1D quadrature weights
1159e201c85SYohann //------------------------------------------------------------------------------
1169b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
Weight1d(SharedData_Hip & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)1179e201c85SYohann inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
1189e201c85SYohann *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
1199e201c85SYohann }
1209e201c85SYohann
1219e201c85SYohann //------------------------------------------------------------------------------
1229e201c85SYohann // 2D
1239e201c85SYohann //------------------------------------------------------------------------------
1249e201c85SYohann
1259e201c85SYohann //------------------------------------------------------------------------------
1269e201c85SYohann // 2D tensor contraction x
1279e201c85SYohann //------------------------------------------------------------------------------
1286b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractX2d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)1299e201c85SYohann inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
130d6c19ee8SJeremy L Thompson __syncthreads();
1319e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1329e201c85SYohann __syncthreads();
1339e201c85SYohann *V = 0.0;
1349e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
1359e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) {
1369e201c85SYohann *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
1379e201c85SYohann }
1389e201c85SYohann }
1399e201c85SYohann }
1409e201c85SYohann
1419e201c85SYohann //------------------------------------------------------------------------------
1429e201c85SYohann // 2D tensor contract y
1439e201c85SYohann //------------------------------------------------------------------------------
1446b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractY2d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)1459e201c85SYohann inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
146d6c19ee8SJeremy L Thompson __syncthreads();
1479e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1489e201c85SYohann __syncthreads();
1499e201c85SYohann *V = 0.0;
1509e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
1519e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) {
1529e201c85SYohann *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
1539e201c85SYohann }
1549e201c85SYohann }
1559e201c85SYohann }
1569e201c85SYohann
1579e201c85SYohann //------------------------------------------------------------------------------
1589e201c85SYohann // 2D transpose tensor contract y
1599e201c85SYohann //------------------------------------------------------------------------------
1606b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeY2d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)1619e201c85SYohann inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
162d6c19ee8SJeremy L Thompson __syncthreads();
1639e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1649e201c85SYohann __syncthreads();
1659e201c85SYohann *V = 0.0;
1669e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
1679e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
1689e201c85SYohann *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
1699e201c85SYohann }
1709e201c85SYohann }
1719e201c85SYohann }
1729e201c85SYohann
1739e201c85SYohann //------------------------------------------------------------------------------
1749e201c85SYohann // 2D transpose tensor contract x
1759e201c85SYohann //------------------------------------------------------------------------------
1766b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeX2d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)1779e201c85SYohann inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
178d6c19ee8SJeremy L Thompson __syncthreads();
1799e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1809e201c85SYohann __syncthreads();
1819e201c85SYohann *V = 0.0;
1829e201c85SYohann if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
1839e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
1849e201c85SYohann *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
1859e201c85SYohann }
1869e201c85SYohann }
1879e201c85SYohann }
1889e201c85SYohann
1899e201c85SYohann //------------------------------------------------------------------------------
1909e201c85SYohann // 2D transpose tensor contract and add x
1919e201c85SYohann //------------------------------------------------------------------------------
1926b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddX2d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)1939e201c85SYohann inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
194d6c19ee8SJeremy L Thompson __syncthreads();
1959e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1969e201c85SYohann __syncthreads();
1979e201c85SYohann if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
1989e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
1999e201c85SYohann *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
2009e201c85SYohann }
2019e201c85SYohann }
2029e201c85SYohann }
2039e201c85SYohann
2049e201c85SYohann //------------------------------------------------------------------------------
2059e201c85SYohann // 2D interpolate to quadrature points
2069e201c85SYohann //------------------------------------------------------------------------------
2076b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensor2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)2089e201c85SYohann inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
2099e201c85SYohann CeedScalar r_t[1];
2109e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2116b92dc4bSJeremy L Thompson ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
2126b92dc4bSJeremy L Thompson ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
2139e201c85SYohann }
2149e201c85SYohann }
2159e201c85SYohann
2169e201c85SYohann //------------------------------------------------------------------------------
2179e201c85SYohann // 2D interpolate transpose
2189e201c85SYohann //------------------------------------------------------------------------------
2196b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensor2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)2202b730f8bSJeremy L Thompson inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2212b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
2229e201c85SYohann CeedScalar r_t[1];
2239e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2246b92dc4bSJeremy L Thompson ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
2256b92dc4bSJeremy L Thompson ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
2269e201c85SYohann }
2279e201c85SYohann }
2289e201c85SYohann
2299e201c85SYohann //------------------------------------------------------------------------------
2300ccda8ebSJeremy L Thompson // 2D interpolate to quadrature points, nodes and quadrature points collocated
2310ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2320ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensorCollocatedNodes2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)2330ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2340ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) {
2350ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2360ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp];
2370ccda8ebSJeremy L Thompson }
2380ccda8ebSJeremy L Thompson }
2390ccda8ebSJeremy L Thompson
2400ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2410ccda8ebSJeremy L Thompson // 2D interpolate transpose, nodes and quadrature points collocated
2420ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2430ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensorCollocatedNodes2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)2440ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2450ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) {
2460ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2470ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp];
2480ccda8ebSJeremy L Thompson }
2490ccda8ebSJeremy L Thompson }
2500ccda8ebSJeremy L Thompson
2510ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2529e201c85SYohann // 2D derivatives at quadrature points
2539e201c85SYohann //------------------------------------------------------------------------------
2546b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensor2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)2552b730f8bSJeremy L Thompson inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
2562b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
2579e201c85SYohann CeedScalar r_t[1];
2589e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2596b92dc4bSJeremy L Thompson ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
2606b92dc4bSJeremy L Thompson ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
2616b92dc4bSJeremy L Thompson ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
2626b92dc4bSJeremy L Thompson ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
2639e201c85SYohann }
2649e201c85SYohann }
2659e201c85SYohann
2669e201c85SYohann //------------------------------------------------------------------------------
2679e201c85SYohann // 2D derivatives transpose
2689e201c85SYohann //------------------------------------------------------------------------------
2696b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensor2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)2702b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
2712b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
2729e201c85SYohann CeedScalar r_t[1];
2739e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2746b92dc4bSJeremy L Thompson ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
2756b92dc4bSJeremy L Thompson ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
2766b92dc4bSJeremy L Thompson ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
2776b92dc4bSJeremy L Thompson ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
2789e201c85SYohann }
2799e201c85SYohann }
2809e201c85SYohann
2819e201c85SYohann //------------------------------------------------------------------------------
28202219a08SJeremy L Thompson // 2D derivatives at quadrature points, nodes and quadrature points collocated
28302219a08SJeremy L Thompson //------------------------------------------------------------------------------
28402219a08SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocatedNodes2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)2850ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2860ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
28702219a08SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
28802219a08SJeremy L Thompson ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
28902219a08SJeremy L Thompson ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
29002219a08SJeremy L Thompson }
29102219a08SJeremy L Thompson }
29202219a08SJeremy L Thompson
29302219a08SJeremy L Thompson //------------------------------------------------------------------------------
29402219a08SJeremy L Thompson // 2D derivatives transpose, nodes and quadrature points collocated
29502219a08SJeremy L Thompson //------------------------------------------------------------------------------
29602219a08SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocatedNodes2d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)2970ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2980ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
29902219a08SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
30002219a08SJeremy L Thompson ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
30102219a08SJeremy L Thompson ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
30202219a08SJeremy L Thompson }
30302219a08SJeremy L Thompson }
30402219a08SJeremy L Thompson
30502219a08SJeremy L Thompson //------------------------------------------------------------------------------
3069e201c85SYohann // 2D quadrature weights
3079e201c85SYohann //------------------------------------------------------------------------------
308ca595be6SJeremy L Thompson template <int P_1D, int Q_1D>
WeightTensor2d(SharedData_Hip & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)3099e201c85SYohann inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
3102b730f8bSJeremy L Thompson *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
3119e201c85SYohann }
3129e201c85SYohann
3139e201c85SYohann //------------------------------------------------------------------------------
3149e201c85SYohann // 3D
3159e201c85SYohann //------------------------------------------------------------------------------
3169e201c85SYohann
3179e201c85SYohann //------------------------------------------------------------------------------
3189e201c85SYohann // 3D tensor contract x
3199e201c85SYohann //------------------------------------------------------------------------------
3206b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractX3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)3219e201c85SYohann inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3229e201c85SYohann CeedScalar r_B[P_1D];
3239e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) {
3249e201c85SYohann r_B[i] = B[i + data.t_id_x * P_1D];
3259e201c85SYohann }
3269e201c85SYohann
3279e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) {
328d6c19ee8SJeremy L Thompson __syncthreads();
3299e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3309e201c85SYohann __syncthreads();
3319e201c85SYohann V[k] = 0.0;
3329e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
3339e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) {
3349e201c85SYohann V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
3359e201c85SYohann }
3369e201c85SYohann }
3379e201c85SYohann }
3389e201c85SYohann }
3399e201c85SYohann
3409e201c85SYohann //------------------------------------------------------------------------------
3419e201c85SYohann // 3D tensor contract y
3429e201c85SYohann //------------------------------------------------------------------------------
3436b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractY3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)3449e201c85SYohann inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3459e201c85SYohann CeedScalar r_B[P_1D];
3469e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) {
3479e201c85SYohann r_B[i] = B[i + data.t_id_y * P_1D];
3489e201c85SYohann }
3499e201c85SYohann
3509e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) {
351d6c19ee8SJeremy L Thompson __syncthreads();
3529e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3539e201c85SYohann __syncthreads();
3549e201c85SYohann V[k] = 0.0;
3559e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3569e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) {
3579e201c85SYohann V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
3589e201c85SYohann }
3599e201c85SYohann }
3609e201c85SYohann }
3619e201c85SYohann }
3629e201c85SYohann
3639e201c85SYohann //------------------------------------------------------------------------------
3649e201c85SYohann // 3D tensor contract z
3659e201c85SYohann //------------------------------------------------------------------------------
3666b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractZ3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)3679e201c85SYohann inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3689e201c85SYohann for (CeedInt k = 0; k < Q_1D; k++) {
3699e201c85SYohann V[k] = 0.0;
3709e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3719e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) {
3729e201c85SYohann V[k] += B[i + k * P_1D] * U[i]; // Contract z direction
3739e201c85SYohann }
3749e201c85SYohann }
3759e201c85SYohann }
3769e201c85SYohann }
3779e201c85SYohann
3789e201c85SYohann //------------------------------------------------------------------------------
3799e201c85SYohann // 3D transpose tensor contract z
3809e201c85SYohann //------------------------------------------------------------------------------
3816b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeZ3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)3829e201c85SYohann inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3839e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) {
3849e201c85SYohann V[k] = 0.0;
3859e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3869e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
3879e201c85SYohann V[k] += B[k + i * P_1D] * U[i]; // Contract z direction
3889e201c85SYohann }
3899e201c85SYohann }
3909e201c85SYohann }
3919e201c85SYohann }
3929e201c85SYohann
3939e201c85SYohann //------------------------------------------------------------------------------
3949e201c85SYohann // 3D transpose tensor contract y
3959e201c85SYohann //------------------------------------------------------------------------------
3966b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeY3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)3979e201c85SYohann inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3989e201c85SYohann CeedScalar r_B[Q_1D];
3999e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
4009e201c85SYohann r_B[i] = B[data.t_id_y + i * P_1D];
4019e201c85SYohann }
4029e201c85SYohann
4039e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) {
404d6c19ee8SJeremy L Thompson __syncthreads();
4059e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4069e201c85SYohann __syncthreads();
4079e201c85SYohann V[k] = 0.0;
4089e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
4099e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
4109e201c85SYohann V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
4119e201c85SYohann }
4129e201c85SYohann }
4139e201c85SYohann }
4149e201c85SYohann }
4159e201c85SYohann
4169e201c85SYohann //------------------------------------------------------------------------------
4179e201c85SYohann // 3D transpose tensor contract y
4189e201c85SYohann //------------------------------------------------------------------------------
4196b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddY3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)4209e201c85SYohann inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4219e201c85SYohann CeedScalar r_B[Q_1D];
4229e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
4239e201c85SYohann r_B[i] = B[data.t_id_y + i * P_1D];
4249e201c85SYohann }
4259e201c85SYohann
4269e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) {
427d6c19ee8SJeremy L Thompson __syncthreads();
4289e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4299e201c85SYohann __syncthreads();
4309e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
4319e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
4329e201c85SYohann V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
4339e201c85SYohann }
4349e201c85SYohann }
4359e201c85SYohann }
4369e201c85SYohann }
4379e201c85SYohann
4389e201c85SYohann //------------------------------------------------------------------------------
4399e201c85SYohann // 3D transpose tensor contract x
4409e201c85SYohann //------------------------------------------------------------------------------
4416b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeX3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)4429e201c85SYohann inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4439e201c85SYohann CeedScalar r_B[Q_1D];
4449e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
4459e201c85SYohann r_B[i] = B[data.t_id_x + i * P_1D];
4469e201c85SYohann }
4479e201c85SYohann
4489e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) {
449d6c19ee8SJeremy L Thompson __syncthreads();
4509e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4519e201c85SYohann __syncthreads();
4529e201c85SYohann V[k] = 0.0;
4539e201c85SYohann if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4549e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
4559e201c85SYohann V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
4569e201c85SYohann }
4579e201c85SYohann }
4589e201c85SYohann }
4599e201c85SYohann }
4609e201c85SYohann
4619e201c85SYohann //------------------------------------------------------------------------------
4629e201c85SYohann // 3D transpose tensor contract add x
4639e201c85SYohann //------------------------------------------------------------------------------
4646b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddX3d(SharedData_Hip & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)4659e201c85SYohann inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4669e201c85SYohann CeedScalar r_B[Q_1D];
4679e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
4689e201c85SYohann r_B[i] = B[data.t_id_x + i * P_1D];
4699e201c85SYohann }
4709e201c85SYohann
4719e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) {
472d6c19ee8SJeremy L Thompson __syncthreads();
4739e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4749e201c85SYohann __syncthreads();
4759e201c85SYohann if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4769e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) {
4779e201c85SYohann V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
4789e201c85SYohann }
4799e201c85SYohann }
4809e201c85SYohann }
4819e201c85SYohann }
4829e201c85SYohann
4839e201c85SYohann //------------------------------------------------------------------------------
4849e201c85SYohann // 3D interpolate to quadrature points
4859e201c85SYohann //------------------------------------------------------------------------------
4866b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensor3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)4879e201c85SYohann inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
4889e201c85SYohann CeedScalar r_t1[T_1D];
4899e201c85SYohann CeedScalar r_t2[T_1D];
4909e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4916b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
4926b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
4936b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
4949e201c85SYohann }
4959e201c85SYohann }
4969e201c85SYohann
4979e201c85SYohann //------------------------------------------------------------------------------
4989e201c85SYohann // 3D interpolate transpose
4999e201c85SYohann //------------------------------------------------------------------------------
5006b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensor3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)5012b730f8bSJeremy L Thompson inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5022b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
5039e201c85SYohann CeedScalar r_t1[T_1D];
5049e201c85SYohann CeedScalar r_t2[T_1D];
5059e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5066b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
5076b92dc4bSJeremy L Thompson ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5086b92dc4bSJeremy L Thompson ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5099e201c85SYohann }
5109e201c85SYohann }
5119e201c85SYohann
5129e201c85SYohann //------------------------------------------------------------------------------
5130ccda8ebSJeremy L Thompson // 3D interpolate to quadrature points, nodes and quadrature points collocated
5140ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5150ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensorCollocatedNodes3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)5160ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5170ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) {
5180ccda8ebSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
5190ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5200ccda8ebSJeremy L Thompson r_V[i + comp * Q_1D] = r_U[i + comp * P_1D];
5210ccda8ebSJeremy L Thompson }
5220ccda8ebSJeremy L Thompson }
5230ccda8ebSJeremy L Thompson }
5240ccda8ebSJeremy L Thompson
5250ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5260ccda8ebSJeremy L Thompson // 3D interpolate transpose, nodes and quadrature points collocated
5270ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5280ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensorCollocatedNodes3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)5290ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5300ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) {
5310ccda8ebSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
5320ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5330ccda8ebSJeremy L Thompson r_V[i + comp * P_1D] = r_U[i + comp * Q_1D];
5340ccda8ebSJeremy L Thompson }
5350ccda8ebSJeremy L Thompson }
5360ccda8ebSJeremy L Thompson }
5370ccda8ebSJeremy L Thompson
5380ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5399e201c85SYohann // 3D derivatives at quadrature points
5409e201c85SYohann //------------------------------------------------------------------------------
5416b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensor3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)5422b730f8bSJeremy L Thompson inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5432b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
5449e201c85SYohann CeedScalar r_t1[T_1D];
5459e201c85SYohann CeedScalar r_t2[T_1D];
5469e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5476b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
5486b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5496b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
5506b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
5516b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
5526b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
5536b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
5546b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5556b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
5569e201c85SYohann }
5579e201c85SYohann }
5589e201c85SYohann
5599e201c85SYohann //------------------------------------------------------------------------------
5609e201c85SYohann // 3D derivatives transpose
5619e201c85SYohann //------------------------------------------------------------------------------
5626b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensor3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)5632b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5642b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
5659e201c85SYohann CeedScalar r_t1[T_1D];
5669e201c85SYohann CeedScalar r_t2[T_1D];
5679e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5686b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
5696b92dc4bSJeremy L Thompson ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5706b92dc4bSJeremy L Thompson ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
5716b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
5726b92dc4bSJeremy L Thompson ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
5736b92dc4bSJeremy L Thompson ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5746b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
5756b92dc4bSJeremy L Thompson ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5766b92dc4bSJeremy L Thompson ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5779e201c85SYohann }
5789e201c85SYohann }
5799e201c85SYohann
5809e201c85SYohann //------------------------------------------------------------------------------
5819e201c85SYohann // 3D derivatives at quadrature points
5829e201c85SYohann //------------------------------------------------------------------------------
5836b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocated3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)5842b730f8bSJeremy L Thompson inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5852b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
5869e201c85SYohann CeedScalar r_t1[T_1D];
5879e201c85SYohann CeedScalar r_t2[T_1D];
5889e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5896b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
5906b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5916b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
5926b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
5936b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
5946b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
5959e201c85SYohann }
5969e201c85SYohann }
5979e201c85SYohann
5989e201c85SYohann //------------------------------------------------------------------------------
5999e201c85SYohann // 3D derivatives transpose
6009e201c85SYohann //------------------------------------------------------------------------------
6016b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocated3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)6022b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6032b730f8bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6049e201c85SYohann CeedScalar r_t1[T_1D];
6059e201c85SYohann CeedScalar r_t2[T_1D];
6069e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
6076b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
6086b92dc4bSJeremy L Thompson ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
6096b92dc4bSJeremy L Thompson ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
6106b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
6116b92dc4bSJeremy L Thompson ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
6126b92dc4bSJeremy L Thompson ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
6139e201c85SYohann }
6149e201c85SYohann }
6159e201c85SYohann
6169e201c85SYohann //------------------------------------------------------------------------------
61702219a08SJeremy L Thompson // 3D derivatives at quadrature points, nodes and quadrature points collocated
61802219a08SJeremy L Thompson //------------------------------------------------------------------------------
61902219a08SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocatedNodes3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)6200ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6210ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
62202219a08SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
62302219a08SJeremy L Thompson ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
62402219a08SJeremy L Thompson ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
62502219a08SJeremy L Thompson ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
62602219a08SJeremy L Thompson }
62702219a08SJeremy L Thompson }
62802219a08SJeremy L Thompson
62902219a08SJeremy L Thompson //------------------------------------------------------------------------------
63002219a08SJeremy L Thompson // 3D derivatives transpose, nodes and quadrature points collocated
63102219a08SJeremy L Thompson //------------------------------------------------------------------------------
63202219a08SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocatedNodes3d(SharedData_Hip & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)6330ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6340ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
63502219a08SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
63602219a08SJeremy L Thompson ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
63702219a08SJeremy L Thompson ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
63802219a08SJeremy L Thompson ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
63902219a08SJeremy L Thompson }
64002219a08SJeremy L Thompson }
64102219a08SJeremy L Thompson
64202219a08SJeremy L Thompson //------------------------------------------------------------------------------
6439e201c85SYohann // 3D quadrature weights
6449e201c85SYohann //------------------------------------------------------------------------------
6459b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
WeightTensor3d(SharedData_Hip & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)6469e201c85SYohann inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
6479e201c85SYohann const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
6489e201c85SYohann const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
6499e201c85SYohann for (CeedInt q = 0; q < Q_1D; q++) {
6509e201c85SYohann w[q] = quad ? pw * q_weight_1d[q] : 0.0;
6519e201c85SYohann }
6529e201c85SYohann }
653