19b91271bSJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 29b91271bSJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39b91271bSJeremy L Thompson // 49b91271bSJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 59b91271bSJeremy L Thompson // 69b91271bSJeremy L Thompson // This file is part of CEED: http://github.com/ceed 79b91271bSJeremy L Thompson 89b91271bSJeremy L Thompson /// @file 99b91271bSJeremy L Thompson /// Internal header for HIP shared memory tensor product basis templates 109b91271bSJeremy L Thompson #include <ceed/types.h> 119b91271bSJeremy L Thompson 129b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 139b91271bSJeremy L Thompson // 2D 149b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 159b91271bSJeremy L Thompson 169b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 179b91271bSJeremy L Thompson // 2D tensor contraction x 189b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 199b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 209b91271bSJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 219b91271bSJeremy L Thompson CeedScalar *V) { 22d6c19ee8SJeremy L Thompson __syncthreads(); 239b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 249b91271bSJeremy L Thompson __syncthreads(); 259b91271bSJeremy L Thompson *V = 0.0; 269b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 279b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 289b91271bSJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 299b91271bSJeremy L Thompson } 309b91271bSJeremy L Thompson } 319b91271bSJeremy L Thompson } 329b91271bSJeremy L Thompson 339b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 349b91271bSJeremy L Thompson // 2D tensor contract y 359b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 369b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 379b91271bSJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 389b91271bSJeremy L Thompson CeedScalar *V) { 39d6c19ee8SJeremy L Thompson __syncthreads(); 409b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 419b91271bSJeremy L Thompson __syncthreads(); 429b91271bSJeremy L Thompson *V = 0.0; 439b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) { 449b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 459b91271bSJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 469b91271bSJeremy L Thompson } 479b91271bSJeremy L Thompson } 489b91271bSJeremy L Thompson } 499b91271bSJeremy L Thompson 509b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 519b91271bSJeremy L Thompson // 2D transpose tensor contract y 529b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 539b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 549b91271bSJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 559b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 56d6c19ee8SJeremy L Thompson __syncthreads(); 579b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 589b91271bSJeremy L Thompson __syncthreads(); 599b91271bSJeremy L Thompson *V = 0.0; 609b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 619b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 629b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 639b91271bSJeremy L Thompson } 649b91271bSJeremy L Thompson } 659b91271bSJeremy L Thompson } 669b91271bSJeremy L Thompson 679b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 689b91271bSJeremy L Thompson // 2D transpose tensor contract x 699b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 709b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 719b91271bSJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 729b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 73d6c19ee8SJeremy L Thompson __syncthreads(); 749b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 759b91271bSJeremy L Thompson __syncthreads(); 769b91271bSJeremy L Thompson *V = 0.0; 779b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 789b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 799b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 809b91271bSJeremy L Thompson } 819b91271bSJeremy L Thompson } 829b91271bSJeremy L Thompson } 839b91271bSJeremy L Thompson 849b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 859b91271bSJeremy L Thompson // 2D transpose tensor contract and add x 869b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 879b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 889b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 899b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 90d6c19ee8SJeremy L Thompson __syncthreads(); 919b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 929b91271bSJeremy L Thompson __syncthreads(); 939b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 949b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 959b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 969b91271bSJeremy L Thompson } 979b91271bSJeremy L Thompson } 989b91271bSJeremy L Thompson } 999b91271bSJeremy L Thompson 1009b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1019b91271bSJeremy L Thompson // 2D pack/unpack quadrature values 1029b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1039b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 1049b91271bSJeremy L Thompson inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 1059b91271bSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; 1069b91271bSJeremy L Thompson 1079b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 108d6c19ee8SJeremy L Thompson __syncthreads(); 1099b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp]; 1109b91271bSJeremy L Thompson __syncthreads(); 1119b91271bSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0; 1129b91271bSJeremy L Thompson } 1139b91271bSJeremy L Thompson } 1149b91271bSJeremy L Thompson 1159b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 1169b91271bSJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 1179b91271bSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; 1189b91271bSJeremy L Thompson 1199b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 120d6c19ee8SJeremy L Thompson __syncthreads(); 1219b91271bSJeremy L Thompson if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp]; 1229b91271bSJeremy L Thompson __syncthreads(); 1239b91271bSJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0; 1249b91271bSJeremy L Thompson } 1259b91271bSJeremy L Thompson } 1269b91271bSJeremy L Thompson 1279b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1289b91271bSJeremy L Thompson // 2D interpolate to quadrature points 1299b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1309b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 1319b91271bSJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 1329b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 1339b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 1349b91271bSJeremy L Thompson CeedScalar r_t[1]; 1359b91271bSJeremy L Thompson 136ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 1379b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1389b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 1399b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 1409b91271bSJeremy L Thompson } 141*3e2e790dSJeremy L Thompson __syncthreads(); 142ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 143ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 1449b91271bSJeremy L Thompson } 1459b91271bSJeremy L Thompson 1469b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1479b91271bSJeremy L Thompson // 2D interpolate transpose 1489b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1499b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 1509b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 1519b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 1529b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 1539b91271bSJeremy L Thompson CeedScalar r_t[1]; 1549b91271bSJeremy L Thompson 155ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 1569b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1579b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 1589b91271bSJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 1599b91271bSJeremy L Thompson } 160*3e2e790dSJeremy L Thompson __syncthreads(); 161ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 1629b91271bSJeremy L Thompson } 1639b91271bSJeremy L Thompson 1649b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1659b91271bSJeremy L Thompson // 2D derivatives at quadrature points 1669b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1679b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 1689b91271bSJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 1699b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 1709b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 1719b91271bSJeremy L Thompson CeedScalar r_t[1]; 1729b91271bSJeremy L Thompson 173ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 1749b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1759b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t); 1769b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 1779b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 1789b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 1799b91271bSJeremy L Thompson } 180*3e2e790dSJeremy L Thompson __syncthreads(); 181ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 182ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 1839b91271bSJeremy L Thompson } 1849b91271bSJeremy L Thompson 1859b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1869b91271bSJeremy L Thompson // 2D derivatives transpose 1879b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1889b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 1899b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 1909b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 1919b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 1929b91271bSJeremy L Thompson CeedScalar r_t[1]; 1939b91271bSJeremy L Thompson 194ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 1959b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1969b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 1979b91271bSJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]); 1989b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 1999b91271bSJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 2009b91271bSJeremy L Thompson } 201*3e2e790dSJeremy L Thompson __syncthreads(); 202ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 2039b91271bSJeremy L Thompson } 2049b91271bSJeremy L Thompson 2059b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2069b91271bSJeremy L Thompson // 2D quadrature weights 2079b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2089b91271bSJeremy L Thompson template <int P_1D, int Q_1D> 2099b91271bSJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 2109b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; 2119b91271bSJeremy L Thompson 2129b91271bSJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; 2139b91271bSJeremy L Thompson } 2149b91271bSJeremy L Thompson 2159b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2169b91271bSJeremy L Thompson // 3D 2179b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2189b91271bSJeremy L Thompson 2199b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2209b91271bSJeremy L Thompson // 3D tensor contract x 2219b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2229b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 2239b91271bSJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 2249b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 225d6c19ee8SJeremy L Thompson __syncthreads(); 2269b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 2279b91271bSJeremy L Thompson __syncthreads(); 2289b91271bSJeremy L Thompson *V = 0.0; 2299b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 2309b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 2319b91271bSJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 2329b91271bSJeremy L Thompson } 2339b91271bSJeremy L Thompson } 2349b91271bSJeremy L Thompson } 2359b91271bSJeremy L Thompson 2369b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2379b91271bSJeremy L Thompson // 3D tensor contract y 2389b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2399b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 2409b91271bSJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 2419b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 242d6c19ee8SJeremy L Thompson __syncthreads(); 2439b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 2449b91271bSJeremy L Thompson __syncthreads(); 2459b91271bSJeremy L Thompson *V = 0.0; 2469b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 2479b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 2489b91271bSJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 2499b91271bSJeremy L Thompson } 2509b91271bSJeremy L Thompson } 2519b91271bSJeremy L Thompson } 2529b91271bSJeremy L Thompson 2539b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2549b91271bSJeremy L Thompson // 3D tensor contract z 2559b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2569b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 2579b91271bSJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 2589b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 259d6c19ee8SJeremy L Thompson __syncthreads(); 2609b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 2619b91271bSJeremy L Thompson __syncthreads(); 2629b91271bSJeremy L Thompson *V = 0.0; 2639b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) { 2649b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 2659b91271bSJeremy L Thompson *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 2669b91271bSJeremy L Thompson } 2679b91271bSJeremy L Thompson } 2689b91271bSJeremy L Thompson } 2699b91271bSJeremy L Thompson 2709b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2719b91271bSJeremy L Thompson // 3D tensor contract z 2729b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2739b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 2749b91271bSJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 2759b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 276d6c19ee8SJeremy L Thompson __syncthreads(); 2779b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 2789b91271bSJeremy L Thompson __syncthreads(); 2799b91271bSJeremy L Thompson *V = 0.0; 2809b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 2819b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2829b91271bSJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 2839b91271bSJeremy L Thompson } 2849b91271bSJeremy L Thompson } 2859b91271bSJeremy L Thompson } 2869b91271bSJeremy L Thompson 2879b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2889b91271bSJeremy L Thompson // 3D transpose tensor contract z 2899b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2909b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 2919b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, 2929b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 293d6c19ee8SJeremy L Thompson __syncthreads(); 2949b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 2959b91271bSJeremy L Thompson __syncthreads(); 2969b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 2979b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 2989b91271bSJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 2999b91271bSJeremy L Thompson } 3009b91271bSJeremy L Thompson } 3019b91271bSJeremy L Thompson } 3029b91271bSJeremy L Thompson 3039b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3049b91271bSJeremy L Thompson // 3D transpose tensor contract y 3059b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3069b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3079b91271bSJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 3089b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 309d6c19ee8SJeremy L Thompson __syncthreads(); 3109b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 3119b91271bSJeremy L Thompson __syncthreads(); 3129b91271bSJeremy L Thompson *V = 0.0; 3139b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 3149b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3159b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 3169b91271bSJeremy L Thompson } 3179b91271bSJeremy L Thompson } 3189b91271bSJeremy L Thompson } 3199b91271bSJeremy L Thompson 3209b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3219b91271bSJeremy L Thompson // 3D transpose tensor contract y 3229b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3239b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3249b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, 3259b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 326d6c19ee8SJeremy L Thompson __syncthreads(); 3279b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 3289b91271bSJeremy L Thompson __syncthreads(); 3299b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 3309b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3319b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 3329b91271bSJeremy L Thompson } 3339b91271bSJeremy L Thompson } 3349b91271bSJeremy L Thompson } 3359b91271bSJeremy L Thompson 3369b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3379b91271bSJeremy L Thompson // 3D transpose tensor contract x 3389b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3399b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3409b91271bSJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 3419b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 342d6c19ee8SJeremy L Thompson __syncthreads(); 3439b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 3449b91271bSJeremy L Thompson __syncthreads(); 3459b91271bSJeremy L Thompson *V = 0.0; 3469b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 3479b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3489b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 3499b91271bSJeremy L Thompson } 3509b91271bSJeremy L Thompson } 3519b91271bSJeremy L Thompson } 3529b91271bSJeremy L Thompson 3539b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3549b91271bSJeremy L Thompson // 3D transpose tensor contract add x 3559b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3569b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3579b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, 3589b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 359d6c19ee8SJeremy L Thompson __syncthreads(); 3609b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 3619b91271bSJeremy L Thompson __syncthreads(); 3629b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 3639b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3649b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 3659b91271bSJeremy L Thompson } 3669b91271bSJeremy L Thompson } 3679b91271bSJeremy L Thompson } 3689b91271bSJeremy L Thompson 3699b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3709b91271bSJeremy L Thompson // 3D pack/unpack quadrature values 3719b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3729b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 3739b91271bSJeremy L Thompson inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 3749b91271bSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); 3759b91271bSJeremy L Thompson 3769b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 377d6c19ee8SJeremy L Thompson __syncthreads(); 3789b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp]; 3799b91271bSJeremy L Thompson __syncthreads(); 3809b91271bSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0; 3819b91271bSJeremy L Thompson } 3829b91271bSJeremy L Thompson } 3839b91271bSJeremy L Thompson 3849b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 3859b91271bSJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 3869b91271bSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D); 3879b91271bSJeremy L Thompson 3889b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 389d6c19ee8SJeremy L Thompson __syncthreads(); 3909b91271bSJeremy L Thompson if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp]; 3919b91271bSJeremy L Thompson __syncthreads(); 3929b91271bSJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0; 3939b91271bSJeremy L Thompson } 3949b91271bSJeremy L Thompson } 3959b91271bSJeremy L Thompson 3969b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3979b91271bSJeremy L Thompson // 3D interpolate to quadrature points 3989b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3999b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4009b91271bSJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 4019b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 4029b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 4039b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 4049b91271bSJeremy L Thompson 405ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 4069b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 4079b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 4089b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 4099b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 4109b91271bSJeremy L Thompson } 411*3e2e790dSJeremy L Thompson __syncthreads(); 412ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 413ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 4149b91271bSJeremy L Thompson } 4159b91271bSJeremy L Thompson 4169b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4179b91271bSJeremy L Thompson // 3D interpolate transpose 4189b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4199b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4209b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 4219b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 4229b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 4239b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 4249b91271bSJeremy L Thompson 425ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 4269b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 4279b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 4289b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 4299b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 4309b91271bSJeremy L Thompson } 431*3e2e790dSJeremy L Thompson __syncthreads(); 432ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 4339b91271bSJeremy L Thompson } 4349b91271bSJeremy L Thompson 4359b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4369b91271bSJeremy L Thompson // 3D derivatives at quadrature points 4379b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4389b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4399b91271bSJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 4409b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 4419b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 4429b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 4439b91271bSJeremy L Thompson 444ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 4459b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 4469b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1); 4479b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 4489b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]); 4499b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 4509b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2); 4519b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]); 4529b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 4539b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 4549b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]); 4559b91271bSJeremy L Thompson } 456*3e2e790dSJeremy L Thompson __syncthreads(); 457ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 458ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 4599b91271bSJeremy L Thompson } 4609b91271bSJeremy L Thompson 4619b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4629b91271bSJeremy L Thompson // 3D derivatives transpose 4639b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4649b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4659b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 4669b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 4679b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 4689b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 4699b91271bSJeremy L Thompson 470ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 4719b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 4729b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1); 4739b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 4749b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]); 4759b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1); 4769b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 4779b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 4789b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1); 4799b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 4809b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 4819b91271bSJeremy L Thompson } 482*3e2e790dSJeremy L Thompson __syncthreads(); 483ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 4849b91271bSJeremy L Thompson } 4859b91271bSJeremy L Thompson 4869b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4879b91271bSJeremy L Thompson // 3D derivatives at quadrature points 4889b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4899b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4909b91271bSJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 4919b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 4929b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 4939b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 4949b91271bSJeremy L Thompson 495ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 4969b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 4979b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 4989b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 4999b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 5009b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]); 5019b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]); 5029b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]); 5039b91271bSJeremy L Thompson } 504*3e2e790dSJeremy L Thompson __syncthreads(); 505ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 506ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 5079b91271bSJeremy L Thompson } 5089b91271bSJeremy L Thompson 5099b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 5109b91271bSJeremy L Thompson // 3D derivatives transpose 5119b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 5129b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 5139b91271bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 5149b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 5159b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 5169b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 5179b91271bSJeremy L Thompson 518ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 5199b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 5209b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2); 5219b91271bSJeremy L Thompson ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2); 5229b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2); 5239b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 5249b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 5259b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 5269b91271bSJeremy L Thompson } 527*3e2e790dSJeremy L Thompson __syncthreads(); 528ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 5299b91271bSJeremy L Thompson } 5309b91271bSJeremy L Thompson 5319b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 5329b91271bSJeremy L Thompson // 3D quadrature weights 5339b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 5349b91271bSJeremy L Thompson template <int P_1D, int Q_1D> 5359b91271bSJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 5369b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); 5379b91271bSJeremy L Thompson 5389b91271bSJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; 5399b91271bSJeremy L Thompson } 540