15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e201c85SYohann // 49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause 59e201c85SYohann // 69e201c85SYohann // This file is part of CEED: http://github.com/ceed 79e201c85SYohann 89e201c85SYohann /// @file 99e201c85SYohann /// Internal header for HIP shared memory tensor product basis templates 10c0b5abf0SJeremy L Thompson #include <ceed/types.h> 119e201c85SYohann 129e201c85SYohann //------------------------------------------------------------------------------ 139e201c85SYohann // 1D 149e201c85SYohann //------------------------------------------------------------------------------ 159e201c85SYohann 169e201c85SYohann //------------------------------------------------------------------------------ 179e201c85SYohann // 1D tensor contraction x 189e201c85SYohann //------------------------------------------------------------------------------ 199e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 209e201c85SYohann inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 219e201c85SYohann data.slice[data.t_id_x] = *U; 229e201c85SYohann __syncthreads(); 239e201c85SYohann *V = 0.0; 249e201c85SYohann if (data.t_id_x < Q_1D) { 259e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 269e201c85SYohann *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 279e201c85SYohann } 289e201c85SYohann } 299e201c85SYohann __syncthreads(); 309e201c85SYohann } 319e201c85SYohann 329e201c85SYohann //------------------------------------------------------------------------------ 339e201c85SYohann // 1D transpose tensor contraction x 349e201c85SYohann //------------------------------------------------------------------------------ 359e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D> 369e201c85SYohann inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 379e201c85SYohann data.slice[data.t_id_x] = *U; 389e201c85SYohann __syncthreads(); 399e201c85SYohann *V = 0.0; 409e201c85SYohann if (data.t_id_x < P_1D) { 419e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 429e201c85SYohann *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 439e201c85SYohann } 449e201c85SYohann } 459e201c85SYohann __syncthreads(); 469e201c85SYohann } 479e201c85SYohann 489e201c85SYohann //------------------------------------------------------------------------------ 499e201c85SYohann // 1D interpolate to quadrature points 509e201c85SYohann //------------------------------------------------------------------------------ 516b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 529e201c85SYohann inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 539e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 54db2becc9SJeremy L Thompson ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 559e201c85SYohann } 569e201c85SYohann } 579e201c85SYohann 589e201c85SYohann //------------------------------------------------------------------------------ 599e201c85SYohann // 1D interpolate transpose 609e201c85SYohann //------------------------------------------------------------------------------ 616b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 622b730f8bSJeremy L Thompson inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 632b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 649e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 65db2becc9SJeremy L Thompson ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 669e201c85SYohann } 679e201c85SYohann } 689e201c85SYohann 699e201c85SYohann //------------------------------------------------------------------------------ 709e201c85SYohann // 1D derivatives at quadrature points 719e201c85SYohann //------------------------------------------------------------------------------ 726b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 732b730f8bSJeremy L Thompson inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 742b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 759e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 76db2becc9SJeremy L Thompson ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 779e201c85SYohann } 789e201c85SYohann } 799e201c85SYohann 809e201c85SYohann //------------------------------------------------------------------------------ 819e201c85SYohann // 1D derivatives transpose 829e201c85SYohann //------------------------------------------------------------------------------ 836b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 842b730f8bSJeremy L Thompson inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 852b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 869e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 87db2becc9SJeremy L Thompson ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 889e201c85SYohann } 899e201c85SYohann } 909e201c85SYohann 919e201c85SYohann //------------------------------------------------------------------------------ 929e201c85SYohann // 1D quadrature weights 939e201c85SYohann //------------------------------------------------------------------------------ 949e201c85SYohann template <int Q_1D> 959e201c85SYohann inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 969e201c85SYohann *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 979e201c85SYohann } 989e201c85SYohann 999e201c85SYohann //------------------------------------------------------------------------------ 1009e201c85SYohann // 2D 1019e201c85SYohann //------------------------------------------------------------------------------ 1029e201c85SYohann 1039e201c85SYohann //------------------------------------------------------------------------------ 1049e201c85SYohann // 2D tensor contraction x 1059e201c85SYohann //------------------------------------------------------------------------------ 1066b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 107343e3094SJeremy L Thompson inline __device__ void ContractX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 108343e3094SJeremy L Thompson CeedScalar *V) { 109343e3094SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 1109e201c85SYohann __syncthreads(); 1119e201c85SYohann *V = 0.0; 112343e3094SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 1139e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 114343e3094SJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 1159e201c85SYohann } 1169e201c85SYohann } 1179e201c85SYohann __syncthreads(); 1189e201c85SYohann } 1199e201c85SYohann 1209e201c85SYohann //------------------------------------------------------------------------------ 1219e201c85SYohann // 2D tensor contract y 1229e201c85SYohann //------------------------------------------------------------------------------ 1236b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 124343e3094SJeremy L Thompson inline __device__ void ContractY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 125343e3094SJeremy L Thompson CeedScalar *V) { 126343e3094SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 127343e3094SJeremy L Thompson >>>>>>> b855402d (gpu - isolate core 2D tensor logic to allow flat version) 1289e201c85SYohann __syncthreads(); 1299e201c85SYohann *V = 0.0; 130343e3094SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) { 1319e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 132343e3094SJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 1339e201c85SYohann } 1349e201c85SYohann } 1359e201c85SYohann __syncthreads(); 1369e201c85SYohann } 1379e201c85SYohann 1389e201c85SYohann //------------------------------------------------------------------------------ 1399e201c85SYohann // 2D transpose tensor contract y 1409e201c85SYohann //------------------------------------------------------------------------------ 1416b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 142343e3094SJeremy L Thompson inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 143343e3094SJeremy L Thompson CeedScalar *V) { 144343e3094SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 1459e201c85SYohann __syncthreads(); 1469e201c85SYohann *V = 0.0; 147343e3094SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 1489e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 149343e3094SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 1509e201c85SYohann } 1519e201c85SYohann } 1529e201c85SYohann __syncthreads(); 1539e201c85SYohann } 1549e201c85SYohann 1559e201c85SYohann //------------------------------------------------------------------------------ 1569e201c85SYohann // 2D transpose tensor contract x 1579e201c85SYohann //------------------------------------------------------------------------------ 1586b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 159343e3094SJeremy L Thompson inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 160343e3094SJeremy L Thompson CeedScalar *V) { 161343e3094SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 1629e201c85SYohann __syncthreads(); 1639e201c85SYohann *V = 0.0; 164343e3094SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 1659e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 166343e3094SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 1679e201c85SYohann } 1689e201c85SYohann } 1699e201c85SYohann __syncthreads(); 1709e201c85SYohann } 1719e201c85SYohann 1729e201c85SYohann //------------------------------------------------------------------------------ 1739e201c85SYohann // 2D transpose tensor contract and add x 1749e201c85SYohann //------------------------------------------------------------------------------ 1756b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 176343e3094SJeremy L Thompson inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 177343e3094SJeremy L Thompson CeedScalar *V) { 178343e3094SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 1799e201c85SYohann __syncthreads(); 180343e3094SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 1819e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 182343e3094SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 1839e201c85SYohann } 1849e201c85SYohann } 1859e201c85SYohann __syncthreads(); 1869e201c85SYohann } 1879e201c85SYohann 1889e201c85SYohann //------------------------------------------------------------------------------ 1899e201c85SYohann // 2D interpolate to quadrature points 1909e201c85SYohann //------------------------------------------------------------------------------ 1916b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 192343e3094SJeremy L Thompson inline __device__ void InterpTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U, 193343e3094SJeremy L Thompson const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 1949e201c85SYohann CeedScalar r_t[1]; 1959e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 196343e3094SJeremy L Thompson ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 197343e3094SJeremy L Thompson ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 1989e201c85SYohann } 1999e201c85SYohann } 2009e201c85SYohann 201343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 202343e3094SJeremy L Thompson inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 203343e3094SJeremy L Thompson InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V); 204343e3094SJeremy L Thompson } 205343e3094SJeremy L Thompson 206*ca595be6SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D> 207*ca595be6SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 208*ca595be6SJeremy L Thompson CeedScalar *__restrict__ r_V) { 209*ca595be6SJeremy L Thompson const int max_1d = P_1D < Q_1D ? P_1D : Q_1D; 210*ca595be6SJeremy L Thompson 211*ca595be6SJeremy L Thompson InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V); 212*ca595be6SJeremy L Thompson } 213*ca595be6SJeremy L Thompson 2149e201c85SYohann //------------------------------------------------------------------------------ 2159e201c85SYohann // 2D interpolate transpose 2169e201c85SYohann //------------------------------------------------------------------------------ 2176b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 218343e3094SJeremy L Thompson inline __device__ void InterpTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U, 219343e3094SJeremy L Thompson const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 2209e201c85SYohann CeedScalar r_t[1]; 2219e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 222343e3094SJeremy L Thompson ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 223343e3094SJeremy L Thompson ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 2249e201c85SYohann } 2259e201c85SYohann } 2269e201c85SYohann 227343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 228343e3094SJeremy L Thompson inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 229343e3094SJeremy L Thompson CeedScalar *__restrict__ r_V) { 230343e3094SJeremy L Thompson InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V); 231343e3094SJeremy L Thompson } 232343e3094SJeremy L Thompson 233*ca595be6SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D> 234*ca595be6SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 235*ca595be6SJeremy L Thompson CeedScalar *__restrict__ r_V) { 236*ca595be6SJeremy L Thompson const int max_1d = P_1D < Q_1D ? P_1D : Q_1D; 237*ca595be6SJeremy L Thompson 238*ca595be6SJeremy L Thompson InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V); 239*ca595be6SJeremy L Thompson } 240*ca595be6SJeremy L Thompson 2419e201c85SYohann //------------------------------------------------------------------------------ 2429e201c85SYohann // 2D derivatives at quadrature points 2439e201c85SYohann //------------------------------------------------------------------------------ 2446b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 245343e3094SJeremy L Thompson inline __device__ void GradTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U, 246343e3094SJeremy L Thompson const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 2479e201c85SYohann CeedScalar r_t[1]; 2489e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 249343e3094SJeremy L Thompson ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t); 250343e3094SJeremy L Thompson ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 251343e3094SJeremy L Thompson ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 252343e3094SJeremy L Thompson ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 2539e201c85SYohann } 2549e201c85SYohann } 2559e201c85SYohann 256343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 257343e3094SJeremy L Thompson inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 258343e3094SJeremy L Thompson CeedScalar *__restrict__ r_V) { 259343e3094SJeremy L Thompson GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V); 260343e3094SJeremy L Thompson } 261343e3094SJeremy L Thompson 262*ca595be6SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D> 263*ca595be6SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 264*ca595be6SJeremy L Thompson CeedScalar *__restrict__ r_V) { 265*ca595be6SJeremy L Thompson const int max_1d = P_1D < Q_1D ? P_1D : Q_1D; 266*ca595be6SJeremy L Thompson 267*ca595be6SJeremy L Thompson GradTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V); 268*ca595be6SJeremy L Thompson } 269*ca595be6SJeremy L Thompson 2709e201c85SYohann //------------------------------------------------------------------------------ 2719e201c85SYohann // 2D derivatives transpose 2729e201c85SYohann //------------------------------------------------------------------------------ 2736b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 274343e3094SJeremy L Thompson inline __device__ void GradTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U, 275343e3094SJeremy L Thompson const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 2769e201c85SYohann CeedScalar r_t[1]; 2779e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 278343e3094SJeremy L Thompson ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 279343e3094SJeremy L Thompson ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]); 280343e3094SJeremy L Thompson ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 281343e3094SJeremy L Thompson ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 2829e201c85SYohann } 2839e201c85SYohann } 2849e201c85SYohann 285343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 286343e3094SJeremy L Thompson inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 287343e3094SJeremy L Thompson CeedScalar *__restrict__ r_V) { 288343e3094SJeremy L Thompson GradTansposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V); 289343e3094SJeremy L Thompson } 290343e3094SJeremy L Thompson 291*ca595be6SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D> 292*ca595be6SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 293*ca595be6SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 294*ca595be6SJeremy L Thompson const int max_1d = P_1D < Q_1D ? P_1D : Q_1D; 295*ca595be6SJeremy L Thompson 296*ca595be6SJeremy L Thompson GradTansposeTensor2d_Core<NUM_COMP, P_1D, Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V); 297*ca595be6SJeremy L Thompson } 298*ca595be6SJeremy L Thompson 2999e201c85SYohann //------------------------------------------------------------------------------ 3009e201c85SYohann // 2D quadrature weights 3019e201c85SYohann //------------------------------------------------------------------------------ 3029e201c85SYohann template <int Q_1D> 303*ca595be6SJeremy L Thompson inline __device__ void WeightTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d, 304*ca595be6SJeremy L Thompson CeedScalar *w) { 305*ca595be6SJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; 306*ca595be6SJeremy L Thompson } 307*ca595be6SJeremy L Thompson 308*ca595be6SJeremy L Thompson template <int P_1D, int Q_1D> 3099e201c85SYohann inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 310*ca595be6SJeremy L Thompson WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w); 311*ca595be6SJeremy L Thompson } 312*ca595be6SJeremy L Thompson 313*ca595be6SJeremy L Thompson template <int P_1D, int Q_1D> 314*ca595be6SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 315*ca595be6SJeremy L Thompson const int max_1d = P_1D < Q_1D ? P_1D : Q_1D; 316*ca595be6SJeremy L Thompson 317*ca595be6SJeremy L Thompson WeightTensor2d_Core<Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, q_weight_1d, w); 3189e201c85SYohann } 3199e201c85SYohann 3209e201c85SYohann //------------------------------------------------------------------------------ 3219e201c85SYohann // 3D 3229e201c85SYohann //------------------------------------------------------------------------------ 3239e201c85SYohann 3249e201c85SYohann //------------------------------------------------------------------------------ 3259e201c85SYohann // 3D tensor contract x 3269e201c85SYohann //------------------------------------------------------------------------------ 3276b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3289e201c85SYohann inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 3299e201c85SYohann CeedScalar r_B[P_1D]; 3309e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 3319e201c85SYohann r_B[i] = B[i + data.t_id_x * P_1D]; 3329e201c85SYohann } 3339e201c85SYohann 3349e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 3359e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 3369e201c85SYohann __syncthreads(); 3379e201c85SYohann V[k] = 0.0; 3389e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 3399e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 3409e201c85SYohann V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 3419e201c85SYohann } 3429e201c85SYohann } 3439e201c85SYohann __syncthreads(); 3449e201c85SYohann } 3459e201c85SYohann } 3469e201c85SYohann 3479e201c85SYohann //------------------------------------------------------------------------------ 3489e201c85SYohann // 3D tensor contract y 3499e201c85SYohann //------------------------------------------------------------------------------ 3506b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3519e201c85SYohann inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 3529e201c85SYohann CeedScalar r_B[P_1D]; 3539e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 3549e201c85SYohann r_B[i] = B[i + data.t_id_y * P_1D]; 3559e201c85SYohann } 3569e201c85SYohann 3579e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 3589e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 3599e201c85SYohann __syncthreads(); 3609e201c85SYohann V[k] = 0.0; 3619e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 3629e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 3639e201c85SYohann V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 3649e201c85SYohann } 3659e201c85SYohann } 3669e201c85SYohann __syncthreads(); 3679e201c85SYohann } 3689e201c85SYohann } 3699e201c85SYohann 3709e201c85SYohann //------------------------------------------------------------------------------ 3719e201c85SYohann // 3D tensor contract z 3729e201c85SYohann //------------------------------------------------------------------------------ 3736b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3749e201c85SYohann inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 3759e201c85SYohann for (CeedInt k = 0; k < Q_1D; k++) { 3769e201c85SYohann V[k] = 0.0; 3779e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 3789e201c85SYohann for (CeedInt i = 0; i < P_1D; i++) { 3799e201c85SYohann V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 3809e201c85SYohann } 3819e201c85SYohann } 3829e201c85SYohann } 3839e201c85SYohann } 3849e201c85SYohann 3859e201c85SYohann //------------------------------------------------------------------------------ 3869e201c85SYohann // 3D transpose tensor contract z 3879e201c85SYohann //------------------------------------------------------------------------------ 3886b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3899e201c85SYohann inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 3909e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 3919e201c85SYohann V[k] = 0.0; 3929e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 3939e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 3949e201c85SYohann V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 3959e201c85SYohann } 3969e201c85SYohann } 3979e201c85SYohann } 3989e201c85SYohann } 3999e201c85SYohann 4009e201c85SYohann //------------------------------------------------------------------------------ 4019e201c85SYohann // 3D transpose tensor contract y 4029e201c85SYohann //------------------------------------------------------------------------------ 4036b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4049e201c85SYohann inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 4059e201c85SYohann CeedScalar r_B[Q_1D]; 4069e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 4079e201c85SYohann r_B[i] = B[data.t_id_y + i * P_1D]; 4089e201c85SYohann } 4099e201c85SYohann 4109e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 4119e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 4129e201c85SYohann __syncthreads(); 4139e201c85SYohann V[k] = 0.0; 4149e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 4159e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 4169e201c85SYohann V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 4179e201c85SYohann } 4189e201c85SYohann } 4199e201c85SYohann __syncthreads(); 4209e201c85SYohann } 4219e201c85SYohann } 4229e201c85SYohann 4239e201c85SYohann //------------------------------------------------------------------------------ 4249e201c85SYohann // 3D transpose tensor contract y 4259e201c85SYohann //------------------------------------------------------------------------------ 4266b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4279e201c85SYohann inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 4289e201c85SYohann CeedScalar r_B[Q_1D]; 4299e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 4309e201c85SYohann r_B[i] = B[data.t_id_y + i * P_1D]; 4319e201c85SYohann } 4329e201c85SYohann 4339e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 4349e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 4359e201c85SYohann __syncthreads(); 4369e201c85SYohann if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 4379e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 4389e201c85SYohann V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 4399e201c85SYohann } 4409e201c85SYohann } 4419e201c85SYohann __syncthreads(); 4429e201c85SYohann } 4439e201c85SYohann } 4449e201c85SYohann 4459e201c85SYohann //------------------------------------------------------------------------------ 4469e201c85SYohann // 3D transpose tensor contract x 4479e201c85SYohann //------------------------------------------------------------------------------ 4486b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4499e201c85SYohann inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 4509e201c85SYohann CeedScalar r_B[Q_1D]; 4519e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 4529e201c85SYohann r_B[i] = B[data.t_id_x + i * P_1D]; 4539e201c85SYohann } 4549e201c85SYohann 4559e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 4569e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 4579e201c85SYohann __syncthreads(); 4589e201c85SYohann V[k] = 0.0; 4599e201c85SYohann if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 4609e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 4619e201c85SYohann V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 4629e201c85SYohann } 4639e201c85SYohann } 4649e201c85SYohann __syncthreads(); 4659e201c85SYohann } 4669e201c85SYohann } 4679e201c85SYohann 4689e201c85SYohann //------------------------------------------------------------------------------ 4699e201c85SYohann // 3D transpose tensor contract add x 4709e201c85SYohann //------------------------------------------------------------------------------ 4716b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4729e201c85SYohann inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 4739e201c85SYohann CeedScalar r_B[Q_1D]; 4749e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 4759e201c85SYohann r_B[i] = B[data.t_id_x + i * P_1D]; 4769e201c85SYohann } 4779e201c85SYohann 4789e201c85SYohann for (CeedInt k = 0; k < P_1D; k++) { 4799e201c85SYohann data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 4809e201c85SYohann __syncthreads(); 4819e201c85SYohann if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 4829e201c85SYohann for (CeedInt i = 0; i < Q_1D; i++) { 4839e201c85SYohann V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 4849e201c85SYohann } 4859e201c85SYohann } 4869e201c85SYohann __syncthreads(); 4879e201c85SYohann } 4889e201c85SYohann } 4899e201c85SYohann 4909e201c85SYohann //------------------------------------------------------------------------------ 4919e201c85SYohann // 3D interpolate to quadrature points 4929e201c85SYohann //------------------------------------------------------------------------------ 4936b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4949e201c85SYohann inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 4959e201c85SYohann CeedScalar r_t1[T_1D]; 4969e201c85SYohann CeedScalar r_t2[T_1D]; 4979e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 4986b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 4996b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 5006b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]); 5019e201c85SYohann } 5029e201c85SYohann } 5039e201c85SYohann 5049e201c85SYohann //------------------------------------------------------------------------------ 5059e201c85SYohann // 3D interpolate transpose 5069e201c85SYohann //------------------------------------------------------------------------------ 5076b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 5082b730f8bSJeremy L Thompson inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 5092b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 5109e201c85SYohann CeedScalar r_t1[T_1D]; 5119e201c85SYohann CeedScalar r_t2[T_1D]; 5129e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 5136b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1); 5146b92dc4bSJeremy L Thompson ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 5156b92dc4bSJeremy L Thompson ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 5169e201c85SYohann } 5179e201c85SYohann } 5189e201c85SYohann 5199e201c85SYohann //------------------------------------------------------------------------------ 5209e201c85SYohann // 3D derivatives at quadrature points 5219e201c85SYohann //------------------------------------------------------------------------------ 5226b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 5232b730f8bSJeremy L Thompson inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 5242b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 5259e201c85SYohann CeedScalar r_t1[T_1D]; 5269e201c85SYohann CeedScalar r_t2[T_1D]; 5279e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 5286b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1); 5296b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 5306b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 5316b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 5326b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 5336b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 5346b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 5356b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 5366b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 5379e201c85SYohann } 5389e201c85SYohann } 5399e201c85SYohann 5409e201c85SYohann //------------------------------------------------------------------------------ 5419e201c85SYohann // 3D derivatives transpose 5429e201c85SYohann //------------------------------------------------------------------------------ 5436b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 5442b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 5452b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 5469e201c85SYohann CeedScalar r_t1[T_1D]; 5479e201c85SYohann CeedScalar r_t2[T_1D]; 5489e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 5496b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); 5506b92dc4bSJeremy L Thompson ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 5516b92dc4bSJeremy L Thompson ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]); 5526b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); 5536b92dc4bSJeremy L Thompson ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 5546b92dc4bSJeremy L Thompson ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 5556b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); 5566b92dc4bSJeremy L Thompson ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 5576b92dc4bSJeremy L Thompson ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 5589e201c85SYohann } 5599e201c85SYohann } 5609e201c85SYohann 5619e201c85SYohann //------------------------------------------------------------------------------ 5629e201c85SYohann // 3D derivatives at quadrature points 5639e201c85SYohann //------------------------------------------------------------------------------ 5646b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 5652b730f8bSJeremy L Thompson inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 5662b730f8bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 5679e201c85SYohann CeedScalar r_t1[T_1D]; 5689e201c85SYohann CeedScalar r_t2[T_1D]; 5699e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 5706b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 5716b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 5726b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 5736b92dc4bSJeremy L Thompson ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 5746b92dc4bSJeremy L Thompson ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 5756b92dc4bSJeremy L Thompson ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 5769e201c85SYohann } 5779e201c85SYohann } 5789e201c85SYohann 5799e201c85SYohann //------------------------------------------------------------------------------ 5809e201c85SYohann // 3D derivatives transpose 5819e201c85SYohann //------------------------------------------------------------------------------ 5826b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 5832b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 5842b730f8bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 5859e201c85SYohann CeedScalar r_t1[T_1D]; 5869e201c85SYohann CeedScalar r_t2[T_1D]; 5879e201c85SYohann for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 5886b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); 5896b92dc4bSJeremy L Thompson ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); 5906b92dc4bSJeremy L Thompson ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); 5916b92dc4bSJeremy L Thompson ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 5926b92dc4bSJeremy L Thompson ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 5936b92dc4bSJeremy L Thompson ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 5949e201c85SYohann } 5959e201c85SYohann } 5969e201c85SYohann 5979e201c85SYohann //------------------------------------------------------------------------------ 5989e201c85SYohann // 3D quadrature weights 5999e201c85SYohann //------------------------------------------------------------------------------ 6009e201c85SYohann template <int Q_1D> 6019e201c85SYohann inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 6029e201c85SYohann const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 6039e201c85SYohann const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 6049e201c85SYohann for (CeedInt q = 0; q < Q_1D; q++) { 6059e201c85SYohann w[q] = quad ? pw * q_weight_1d[q] : 0.0; 6069e201c85SYohann } 6079e201c85SYohann } 608