19b91271bSJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 29b91271bSJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39b91271bSJeremy L Thompson // 49b91271bSJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 59b91271bSJeremy L Thompson // 69b91271bSJeremy L Thompson // This file is part of CEED: http://github.com/ceed 79b91271bSJeremy L Thompson 89b91271bSJeremy L Thompson /// @file 99b91271bSJeremy L Thompson /// Internal header for HIP shared memory tensor product basis templates 109b91271bSJeremy L Thompson #include <ceed/types.h> 119b91271bSJeremy L Thompson 129b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 139b91271bSJeremy L Thompson // 2D 149b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 159b91271bSJeremy L Thompson 169b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 179b91271bSJeremy L Thompson // 2D tensor contraction x 189b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 199b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 209b91271bSJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 219b91271bSJeremy L Thompson CeedScalar *V) { 22d6c19ee8SJeremy L Thompson __syncthreads(); 239b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 249b91271bSJeremy L Thompson __syncthreads(); 259b91271bSJeremy L Thompson *V = 0.0; 269b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 279b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 289b91271bSJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 299b91271bSJeremy L Thompson } 309b91271bSJeremy L Thompson } 319b91271bSJeremy L Thompson } 329b91271bSJeremy L Thompson 339b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 349b91271bSJeremy L Thompson // 2D tensor contract y 359b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 369b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 379b91271bSJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 389b91271bSJeremy L Thompson CeedScalar *V) { 39d6c19ee8SJeremy L Thompson __syncthreads(); 409b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 419b91271bSJeremy L Thompson __syncthreads(); 429b91271bSJeremy L Thompson *V = 0.0; 439b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) { 449b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 459b91271bSJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 469b91271bSJeremy L Thompson } 479b91271bSJeremy L Thompson } 489b91271bSJeremy L Thompson } 499b91271bSJeremy L Thompson 509b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 519b91271bSJeremy L Thompson // 2D transpose tensor contract y 529b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 539b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 549b91271bSJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 559b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 56d6c19ee8SJeremy L Thompson __syncthreads(); 579b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 589b91271bSJeremy L Thompson __syncthreads(); 599b91271bSJeremy L Thompson *V = 0.0; 609b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 619b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 629b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 639b91271bSJeremy L Thompson } 649b91271bSJeremy L Thompson } 659b91271bSJeremy L Thompson } 669b91271bSJeremy L Thompson 679b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 689b91271bSJeremy L Thompson // 2D transpose tensor contract x 699b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 709b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 719b91271bSJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 729b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 73d6c19ee8SJeremy L Thompson __syncthreads(); 749b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 759b91271bSJeremy L Thompson __syncthreads(); 769b91271bSJeremy L Thompson *V = 0.0; 779b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 789b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 799b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 809b91271bSJeremy L Thompson } 819b91271bSJeremy L Thompson } 829b91271bSJeremy L Thompson } 839b91271bSJeremy L Thompson 849b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 859b91271bSJeremy L Thompson // 2D transpose tensor contract and add x 869b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 879b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 889b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 899b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 90d6c19ee8SJeremy L Thompson __syncthreads(); 919b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 929b91271bSJeremy L Thompson __syncthreads(); 939b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 949b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 959b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 969b91271bSJeremy L Thompson } 979b91271bSJeremy L Thompson } 989b91271bSJeremy L Thompson } 999b91271bSJeremy L Thompson 1009b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1019b91271bSJeremy L Thompson // 2D pack/unpack quadrature values 1029b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1039b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 1049b91271bSJeremy L Thompson inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 1059b91271bSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; 1069b91271bSJeremy L Thompson 1079b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 108d6c19ee8SJeremy L Thompson __syncthreads(); 1099b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp]; 1109b91271bSJeremy L Thompson __syncthreads(); 1119b91271bSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0; 1129b91271bSJeremy L Thompson } 1139b91271bSJeremy L Thompson } 1149b91271bSJeremy L Thompson 1159b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 1169b91271bSJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 1179b91271bSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; 1189b91271bSJeremy L Thompson 1199b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 120d6c19ee8SJeremy L Thompson __syncthreads(); 1219b91271bSJeremy L Thompson if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp]; 1229b91271bSJeremy L Thompson __syncthreads(); 1239b91271bSJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0; 1249b91271bSJeremy L Thompson } 1259b91271bSJeremy L Thompson } 1269b91271bSJeremy L Thompson 1279b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1289b91271bSJeremy L Thompson // 2D interpolate to quadrature points 1299b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1309b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 1319b91271bSJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 1329b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 1339b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 1349b91271bSJeremy L Thompson CeedScalar r_t[1]; 1359b91271bSJeremy L Thompson 136ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 1379b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1389b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 1399b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 1409b91271bSJeremy L Thompson } 1413e2e790dSJeremy L Thompson __syncthreads(); 142ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 143ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 1449b91271bSJeremy L Thompson } 1459b91271bSJeremy L Thompson 1469b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1479b91271bSJeremy L Thompson // 2D interpolate transpose 1489b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 1499b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 1509b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 1519b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 1529b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 1539b91271bSJeremy L Thompson CeedScalar r_t[1]; 1549b91271bSJeremy L Thompson 155ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 1569b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 1579b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 1589b91271bSJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 1599b91271bSJeremy L Thompson } 1603e2e790dSJeremy L Thompson __syncthreads(); 161ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 1629b91271bSJeremy L Thompson } 1639b91271bSJeremy L Thompson 1649b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 165*0ccda8ebSJeremy L Thompson // 2D interpolate to quadrature points, nodes and quadrature points collocated 166*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 167*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 168*0ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 169*0ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) { 170*0ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 171*0ccda8ebSJeremy L Thompson 172*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 173*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 174*0ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp]; 175*0ccda8ebSJeremy L Thompson } 176*0ccda8ebSJeremy L Thompson __syncthreads(); 177*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 178*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 179*0ccda8ebSJeremy L Thompson } 180*0ccda8ebSJeremy L Thompson 181*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 182*0ccda8ebSJeremy L Thompson // 2D interpolate transpose, nodes and quadrature points collocated 183*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 184*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 185*0ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 186*0ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) { 187*0ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 188*0ccda8ebSJeremy L Thompson 189*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 190*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 191*0ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp]; 192*0ccda8ebSJeremy L Thompson } 193*0ccda8ebSJeremy L Thompson __syncthreads(); 194*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 195*0ccda8ebSJeremy L Thompson } 196*0ccda8ebSJeremy L Thompson 197*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 1989b91271bSJeremy L Thompson // 2D derivatives at quadrature points 1999b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2009b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 2019b91271bSJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 2029b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 2039b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 2049b91271bSJeremy L Thompson CeedScalar r_t[1]; 2059b91271bSJeremy L Thompson 206ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 2079b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2089b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t); 2099b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 2109b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 2119b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 2129b91271bSJeremy L Thompson } 2133e2e790dSJeremy L Thompson __syncthreads(); 214ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 215ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 2169b91271bSJeremy L Thompson } 2179b91271bSJeremy L Thompson 2189b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2199b91271bSJeremy L Thompson // 2D derivatives transpose 2209b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2219b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 2229b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 2239b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 2249b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 2259b91271bSJeremy L Thompson CeedScalar r_t[1]; 2269b91271bSJeremy L Thompson 227ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 2289b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 2299b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 2309b91271bSJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]); 2319b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 2329b91271bSJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 2339b91271bSJeremy L Thompson } 2343e2e790dSJeremy L Thompson __syncthreads(); 235ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 2369b91271bSJeremy L Thompson } 2379b91271bSJeremy L Thompson 2389b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 239*0ccda8ebSJeremy L Thompson // 2D derivatives at quadrature points, nodes and quadrature points collocated 240*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 241*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 242*0ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 243*0ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 244*0ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 245*0ccda8ebSJeremy L Thompson 246*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 247*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 248*0ccda8ebSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); 249*0ccda8ebSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); 250*0ccda8ebSJeremy L Thompson } 251*0ccda8ebSJeremy L Thompson __syncthreads(); 252*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 253*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 254*0ccda8ebSJeremy L Thompson } 255*0ccda8ebSJeremy L Thompson 256*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 257*0ccda8ebSJeremy L Thompson // 2D derivatives transpose, nodes and quadrature points collocated 258*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 259*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 260*0ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 261*0ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 262*0ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 263*0ccda8ebSJeremy L Thompson 264*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 265*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 266*0ccda8ebSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); 267*0ccda8ebSJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); 268*0ccda8ebSJeremy L Thompson } 269*0ccda8ebSJeremy L Thompson __syncthreads(); 270*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 271*0ccda8ebSJeremy L Thompson } 272*0ccda8ebSJeremy L Thompson 273*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 2749b91271bSJeremy L Thompson // 2D quadrature weights 2759b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2769b91271bSJeremy L Thompson template <int P_1D, int Q_1D> 2779b91271bSJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 2789b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; 2799b91271bSJeremy L Thompson 2809b91271bSJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; 2819b91271bSJeremy L Thompson } 2829b91271bSJeremy L Thompson 2839b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2849b91271bSJeremy L Thompson // 3D 2859b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2869b91271bSJeremy L Thompson 2879b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2889b91271bSJeremy L Thompson // 3D tensor contract x 2899b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 2909b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 2919b91271bSJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 2929b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 293d6c19ee8SJeremy L Thompson __syncthreads(); 2949b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 2959b91271bSJeremy L Thompson __syncthreads(); 2969b91271bSJeremy L Thompson *V = 0.0; 2979b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 2989b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 2999b91271bSJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 3009b91271bSJeremy L Thompson } 3019b91271bSJeremy L Thompson } 3029b91271bSJeremy L Thompson } 3039b91271bSJeremy L Thompson 3049b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3059b91271bSJeremy L Thompson // 3D tensor contract y 3069b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3079b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3089b91271bSJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 3099b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 310d6c19ee8SJeremy L Thompson __syncthreads(); 3119b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 3129b91271bSJeremy L Thompson __syncthreads(); 3139b91271bSJeremy L Thompson *V = 0.0; 3149b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 3159b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 3169b91271bSJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 3179b91271bSJeremy L Thompson } 3189b91271bSJeremy L Thompson } 3199b91271bSJeremy L Thompson } 3209b91271bSJeremy L Thompson 3219b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3229b91271bSJeremy L Thompson // 3D tensor contract z 3239b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3249b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3259b91271bSJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 3269b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 327d6c19ee8SJeremy L Thompson __syncthreads(); 3289b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 3299b91271bSJeremy L Thompson __syncthreads(); 3309b91271bSJeremy L Thompson *V = 0.0; 3319b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) { 3329b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 3339b91271bSJeremy L Thompson *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 3349b91271bSJeremy L Thompson } 3359b91271bSJeremy L Thompson } 3369b91271bSJeremy L Thompson } 3379b91271bSJeremy L Thompson 3389b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3399b91271bSJeremy L Thompson // 3D tensor contract z 3409b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3419b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3429b91271bSJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 3439b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 344d6c19ee8SJeremy L Thompson __syncthreads(); 3459b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 3469b91271bSJeremy L Thompson __syncthreads(); 3479b91271bSJeremy L Thompson *V = 0.0; 3489b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 3499b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3509b91271bSJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 3519b91271bSJeremy L Thompson } 3529b91271bSJeremy L Thompson } 3539b91271bSJeremy L Thompson } 3549b91271bSJeremy L Thompson 3559b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3569b91271bSJeremy L Thompson // 3D transpose tensor contract z 3579b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3589b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3599b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, 3609b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 361d6c19ee8SJeremy L Thompson __syncthreads(); 3629b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 3639b91271bSJeremy L Thompson __syncthreads(); 3649b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 3659b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3669b91271bSJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 3679b91271bSJeremy L Thompson } 3689b91271bSJeremy L Thompson } 3699b91271bSJeremy L Thompson } 3709b91271bSJeremy L Thompson 3719b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3729b91271bSJeremy L Thompson // 3D transpose tensor contract y 3739b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3749b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3759b91271bSJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 3769b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 377d6c19ee8SJeremy L Thompson __syncthreads(); 3789b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 3799b91271bSJeremy L Thompson __syncthreads(); 3809b91271bSJeremy L Thompson *V = 0.0; 3819b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 3829b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3839b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 3849b91271bSJeremy L Thompson } 3859b91271bSJeremy L Thompson } 3869b91271bSJeremy L Thompson } 3879b91271bSJeremy L Thompson 3889b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3899b91271bSJeremy L Thompson // 3D transpose tensor contract y 3909b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 3919b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 3929b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, 3939b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 394d6c19ee8SJeremy L Thompson __syncthreads(); 3959b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 3969b91271bSJeremy L Thompson __syncthreads(); 3979b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 3989b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 3999b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 4009b91271bSJeremy L Thompson } 4019b91271bSJeremy L Thompson } 4029b91271bSJeremy L Thompson } 4039b91271bSJeremy L Thompson 4049b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4059b91271bSJeremy L Thompson // 3D transpose tensor contract x 4069b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4079b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4089b91271bSJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 4099b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 410d6c19ee8SJeremy L Thompson __syncthreads(); 4119b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 4129b91271bSJeremy L Thompson __syncthreads(); 4139b91271bSJeremy L Thompson *V = 0.0; 4149b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 4159b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4169b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 4179b91271bSJeremy L Thompson } 4189b91271bSJeremy L Thompson } 4199b91271bSJeremy L Thompson } 4209b91271bSJeremy L Thompson 4219b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4229b91271bSJeremy L Thompson // 3D transpose tensor contract add x 4239b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4249b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4259b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, 4269b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 427d6c19ee8SJeremy L Thompson __syncthreads(); 4289b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 4299b91271bSJeremy L Thompson __syncthreads(); 4309b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 4319b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 4329b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 4339b91271bSJeremy L Thompson } 4349b91271bSJeremy L Thompson } 4359b91271bSJeremy L Thompson } 4369b91271bSJeremy L Thompson 4379b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4389b91271bSJeremy L Thompson // 3D pack/unpack quadrature values 4399b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4409b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 4419b91271bSJeremy L Thompson inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 4429b91271bSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); 4439b91271bSJeremy L Thompson 4449b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 445d6c19ee8SJeremy L Thompson __syncthreads(); 4469b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp]; 4479b91271bSJeremy L Thompson __syncthreads(); 4489b91271bSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0; 4499b91271bSJeremy L Thompson } 4509b91271bSJeremy L Thompson } 4519b91271bSJeremy L Thompson 4529b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 4539b91271bSJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 4549b91271bSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D); 4559b91271bSJeremy L Thompson 4569b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 457d6c19ee8SJeremy L Thompson __syncthreads(); 4589b91271bSJeremy L Thompson if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp]; 4599b91271bSJeremy L Thompson __syncthreads(); 4609b91271bSJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0; 4619b91271bSJeremy L Thompson } 4629b91271bSJeremy L Thompson } 4639b91271bSJeremy L Thompson 4649b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4659b91271bSJeremy L Thompson // 3D interpolate to quadrature points 4669b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4679b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4689b91271bSJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 4699b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 4709b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 4719b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 4729b91271bSJeremy L Thompson 473ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 4749b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 4759b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 4769b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 4779b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 4789b91271bSJeremy L Thompson } 4793e2e790dSJeremy L Thompson __syncthreads(); 480ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 481ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 4829b91271bSJeremy L Thompson } 4839b91271bSJeremy L Thompson 4849b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4859b91271bSJeremy L Thompson // 3D interpolate transpose 4869b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 4879b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 4889b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 4899b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 4909b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 4919b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 4929b91271bSJeremy L Thompson 493ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 4949b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 4959b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 4969b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 4979b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 4989b91271bSJeremy L Thompson } 4993e2e790dSJeremy L Thompson __syncthreads(); 500ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 5019b91271bSJeremy L Thompson } 5029b91271bSJeremy L Thompson 5039b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 504*0ccda8ebSJeremy L Thompson // 3D interpolate to quadrature points, nodes and quadrature points collocated 505*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 506*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 507*0ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 508*0ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) { 509*0ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 510*0ccda8ebSJeremy L Thompson 511*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 512*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 513*0ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp]; 514*0ccda8ebSJeremy L Thompson } 515*0ccda8ebSJeremy L Thompson __syncthreads(); 516*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 517*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 518*0ccda8ebSJeremy L Thompson } 519*0ccda8ebSJeremy L Thompson 520*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 521*0ccda8ebSJeremy L Thompson // 3D interpolate transpose, nodes and quadrature points collocated 522*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 523*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 524*0ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 525*0ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) { 526*0ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 527*0ccda8ebSJeremy L Thompson 528*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 529*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 530*0ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp]; 531*0ccda8ebSJeremy L Thompson } 532*0ccda8ebSJeremy L Thompson __syncthreads(); 533*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 534*0ccda8ebSJeremy L Thompson } 535*0ccda8ebSJeremy L Thompson 536*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 5379b91271bSJeremy L Thompson // 3D derivatives at quadrature points 5389b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 5399b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 5409b91271bSJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 5419b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) { 5429b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 5439b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 5449b91271bSJeremy L Thompson 545ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 5469b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 5479b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1); 5489b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 5499b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]); 5509b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 5519b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2); 5529b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]); 5539b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 5549b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 5559b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]); 5569b91271bSJeremy L Thompson } 5573e2e790dSJeremy L Thompson __syncthreads(); 558ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 559ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 5609b91271bSJeremy L Thompson } 5619b91271bSJeremy L Thompson 5629b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 5639b91271bSJeremy L Thompson // 3D derivatives transpose 5649b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 5659b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 5669b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 5679b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 5689b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 5699b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 5709b91271bSJeremy L Thompson 571ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 5729b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 5739b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1); 5749b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 5759b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]); 5769b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1); 5779b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 5789b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 5799b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1); 5809b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 5819b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 5829b91271bSJeremy L Thompson } 5833e2e790dSJeremy L Thompson __syncthreads(); 584ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 5859b91271bSJeremy L Thompson } 5869b91271bSJeremy L Thompson 5879b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 5889b91271bSJeremy L Thompson // 3D derivatives at quadrature points 5899b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 5909b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 5919b91271bSJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 5929b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 5939b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 5949b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 5959b91271bSJeremy L Thompson 596ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 5979b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 5989b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 5999b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 6009b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 6019b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]); 6029b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]); 6039b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]); 6049b91271bSJeremy L Thompson } 6053e2e790dSJeremy L Thompson __syncthreads(); 606ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 607ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 6089b91271bSJeremy L Thompson } 6099b91271bSJeremy L Thompson 6109b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 6119b91271bSJeremy L Thompson // 3D derivatives transpose 6129b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 6139b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 6149b91271bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 6159b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 6169b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 6179b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 6189b91271bSJeremy L Thompson 619ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 6209b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 6219b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2); 6229b91271bSJeremy L Thompson ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2); 6239b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2); 6249b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 6259b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 6269b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 6279b91271bSJeremy L Thompson } 6283e2e790dSJeremy L Thompson __syncthreads(); 629ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 6309b91271bSJeremy L Thompson } 6319b91271bSJeremy L Thompson 6329b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 633*0ccda8ebSJeremy L Thompson // 3D derivatives at quadrature points, nodes and quadrature points collocated 634*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 635*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 636*0ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 637*0ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 638*0ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 639*0ccda8ebSJeremy L Thompson 640*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 641*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 642*0ccda8ebSJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); 643*0ccda8ebSJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); 644*0ccda8ebSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]); 645*0ccda8ebSJeremy L Thompson } 646*0ccda8ebSJeremy L Thompson __syncthreads(); 647*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 648*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 649*0ccda8ebSJeremy L Thompson } 650*0ccda8ebSJeremy L Thompson 651*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 652*0ccda8ebSJeremy L Thompson // 3D derivatives transpose, nodes and quadrature points collocated 653*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 654*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 655*0ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 656*0ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 657*0ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 658*0ccda8ebSJeremy L Thompson 659*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 660*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 661*0ccda8ebSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]); 662*0ccda8ebSJeremy L Thompson ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); 663*0ccda8ebSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); 664*0ccda8ebSJeremy L Thompson } 665*0ccda8ebSJeremy L Thompson __syncthreads(); 666*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 667*0ccda8ebSJeremy L Thompson } 668*0ccda8ebSJeremy L Thompson 669*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 6709b91271bSJeremy L Thompson // 3D quadrature weights 6719b91271bSJeremy L Thompson //------------------------------------------------------------------------------ 6729b91271bSJeremy L Thompson template <int P_1D, int Q_1D> 6739b91271bSJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 6749b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); 6759b91271bSJeremy L Thompson 6769b91271bSJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; 6779b91271bSJeremy L Thompson } 678