1c8e372f0SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2c8e372f0SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3c8e372f0SJeremy L Thompson // 4c8e372f0SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5c8e372f0SJeremy L Thompson // 6c8e372f0SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7c8e372f0SJeremy L Thompson 8c8e372f0SJeremy L Thompson /// @file 9c8e372f0SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis templates 10c8e372f0SJeremy L Thompson #include <ceed/types.h> 11c8e372f0SJeremy L Thompson 12c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 13c8e372f0SJeremy L Thompson // 2D 14c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 15c8e372f0SJeremy L Thompson 16c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 17c8e372f0SJeremy L Thompson // 2D tensor contraction x 18c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 19c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 20c8e372f0SJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 21c8e372f0SJeremy L Thompson CeedScalar *V) { 22d6c19ee8SJeremy L Thompson __syncthreads(); 23c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 24c8e372f0SJeremy L Thompson __syncthreads(); 25c8e372f0SJeremy L Thompson *V = 0.0; 26c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 27c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 28c8e372f0SJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 29c8e372f0SJeremy L Thompson } 30c8e372f0SJeremy L Thompson } 31c8e372f0SJeremy L Thompson } 32c8e372f0SJeremy L Thompson 33c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 34c8e372f0SJeremy L Thompson // 2D tensor contract y 35c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 36c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 37c8e372f0SJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 38c8e372f0SJeremy L Thompson CeedScalar *V) { 39d6c19ee8SJeremy L Thompson __syncthreads(); 40c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 41c8e372f0SJeremy L Thompson __syncthreads(); 42c8e372f0SJeremy L Thompson *V = 0.0; 43c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) { 44c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 45c8e372f0SJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 46c8e372f0SJeremy L Thompson } 47c8e372f0SJeremy L Thompson } 48c8e372f0SJeremy L Thompson } 49c8e372f0SJeremy L Thompson 50c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 51c8e372f0SJeremy L Thompson // 2D transpose tensor contract y 52c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 53c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 54c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 55c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 56d6c19ee8SJeremy L Thompson __syncthreads(); 57c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 58c8e372f0SJeremy L Thompson __syncthreads(); 59c8e372f0SJeremy L Thompson *V = 0.0; 60c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 61c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 62c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 63c8e372f0SJeremy L Thompson } 64c8e372f0SJeremy L Thompson } 65c8e372f0SJeremy L Thompson } 66c8e372f0SJeremy L Thompson 67c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 68c8e372f0SJeremy L Thompson // 2D transpose tensor contract x 69c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 70c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 71c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 72c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 73d6c19ee8SJeremy L Thompson __syncthreads(); 74c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 75c8e372f0SJeremy L Thompson __syncthreads(); 76c8e372f0SJeremy L Thompson *V = 0.0; 77c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 78c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 79c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 80c8e372f0SJeremy L Thompson } 81c8e372f0SJeremy L Thompson } 82c8e372f0SJeremy L Thompson } 83c8e372f0SJeremy L Thompson 84c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 85c8e372f0SJeremy L Thompson // 2D transpose tensor contract and add x 86c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 87c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 88c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 89c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 90d6c19ee8SJeremy L Thompson __syncthreads(); 91c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 92c8e372f0SJeremy L Thompson __syncthreads(); 93c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 94c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 95c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 96c8e372f0SJeremy L Thompson } 97c8e372f0SJeremy L Thompson } 98c8e372f0SJeremy L Thompson } 99c8e372f0SJeremy L Thompson 100c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 101c8e372f0SJeremy L Thompson // 2D pack/unpack quadrature values 102c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 103c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 104c8e372f0SJeremy L Thompson inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 105c8e372f0SJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; 106c8e372f0SJeremy L Thompson 107c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 108d6c19ee8SJeremy L Thompson __syncthreads(); 109c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp]; 110c8e372f0SJeremy L Thompson __syncthreads(); 111c8e372f0SJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0; 112c8e372f0SJeremy L Thompson } 113c8e372f0SJeremy L Thompson } 114c8e372f0SJeremy L Thompson 115c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 116c8e372f0SJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 117c8e372f0SJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; 118c8e372f0SJeremy L Thompson 119c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 120d6c19ee8SJeremy L Thompson __syncthreads(); 121c8e372f0SJeremy L Thompson if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp]; 122c8e372f0SJeremy L Thompson __syncthreads(); 123c8e372f0SJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0; 124c8e372f0SJeremy L Thompson } 125c8e372f0SJeremy L Thompson } 126c8e372f0SJeremy L Thompson 127c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 128c8e372f0SJeremy L Thompson // 2D interpolate to quadrature points 129c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 130c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 131c8e372f0SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 132c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 133c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 134c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 135c8e372f0SJeremy L Thompson 136ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 137c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 138c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 139c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 140c8e372f0SJeremy L Thompson } 1413e2e790dSJeremy L Thompson __syncthreads(); 142ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 143ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 144c8e372f0SJeremy L Thompson } 145c8e372f0SJeremy L Thompson 146c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 147c8e372f0SJeremy L Thompson // 2D interpolate transpose 148c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 149c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 150c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 151c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 152c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 153c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 154c8e372f0SJeremy L Thompson 155ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 156c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 157c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 158c8e372f0SJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 159c8e372f0SJeremy L Thompson } 1603e2e790dSJeremy L Thompson __syncthreads(); 161ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 162c8e372f0SJeremy L Thompson } 163c8e372f0SJeremy L Thompson 164c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 165*0ccda8ebSJeremy L Thompson // 2D interpolate to quadrature points, nodes and quadrature points collocated 166*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 167*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 168*0ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 169*0ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) { 170*0ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 171*0ccda8ebSJeremy L Thompson 172*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 173*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 174*0ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp]; 175*0ccda8ebSJeremy L Thompson } 176*0ccda8ebSJeremy L Thompson __syncthreads(); 177*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 178*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 179*0ccda8ebSJeremy L Thompson } 180*0ccda8ebSJeremy L Thompson 181*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 182*0ccda8ebSJeremy L Thompson // 2D interpolate transpose, nodes and quadrature points collocated 183*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 184*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 185*0ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 186*0ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) { 187*0ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 188*0ccda8ebSJeremy L Thompson 189*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 190*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 191*0ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp]; 192*0ccda8ebSJeremy L Thompson } 193*0ccda8ebSJeremy L Thompson __syncthreads(); 194*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 195*0ccda8ebSJeremy L Thompson } 196*0ccda8ebSJeremy L Thompson 197*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 198c8e372f0SJeremy L Thompson // 2D derivatives at quadrature points 199c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 200c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 201c8e372f0SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 202c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 203c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 204c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 205c8e372f0SJeremy L Thompson 206ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 207c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 208c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t); 209c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 210c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 211c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 212c8e372f0SJeremy L Thompson } 2133e2e790dSJeremy L Thompson __syncthreads(); 214ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 215ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 216c8e372f0SJeremy L Thompson } 217c8e372f0SJeremy L Thompson 218c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 219c8e372f0SJeremy L Thompson // 2D derivatives transpose 220c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 221c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 222c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 223c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 224c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 225c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 226c8e372f0SJeremy L Thompson 227ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 228c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 229c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 230c8e372f0SJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]); 231c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 232c8e372f0SJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 233c8e372f0SJeremy L Thompson } 2343e2e790dSJeremy L Thompson __syncthreads(); 235ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 236c8e372f0SJeremy L Thompson } 237c8e372f0SJeremy L Thompson 238c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 239*0ccda8ebSJeremy L Thompson // 2D derivatives at quadrature points, nodes and quadrature points collocated 240*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 241*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 242*0ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 243*0ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 244*0ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 245*0ccda8ebSJeremy L Thompson 246*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 247*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 248*0ccda8ebSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); 249*0ccda8ebSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); 250*0ccda8ebSJeremy L Thompson } 251*0ccda8ebSJeremy L Thompson __syncthreads(); 252*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 253*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 254*0ccda8ebSJeremy L Thompson } 255*0ccda8ebSJeremy L Thompson 256*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 257*0ccda8ebSJeremy L Thompson // 2D derivatives transpose, nodes and quadrature points collocated 258*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 259*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 260*0ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 261*0ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 262*0ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 263*0ccda8ebSJeremy L Thompson 264*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 265*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 266*0ccda8ebSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); 267*0ccda8ebSJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); 268*0ccda8ebSJeremy L Thompson } 269*0ccda8ebSJeremy L Thompson __syncthreads(); 270*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 271*0ccda8ebSJeremy L Thompson } 272*0ccda8ebSJeremy L Thompson 273*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 274c8e372f0SJeremy L Thompson // 2D quadrature weights 275c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 276c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D> 277c8e372f0SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 278c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; 279c8e372f0SJeremy L Thompson 280c8e372f0SJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; 281c8e372f0SJeremy L Thompson } 282c8e372f0SJeremy L Thompson 283c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 284c8e372f0SJeremy L Thompson // 3D 285c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 286c8e372f0SJeremy L Thompson 287c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 288c8e372f0SJeremy L Thompson // 3D tensor contract x 289c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 290c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 291c8e372f0SJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 292c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 293d6c19ee8SJeremy L Thompson __syncthreads(); 294c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 295c8e372f0SJeremy L Thompson __syncthreads(); 296c8e372f0SJeremy L Thompson *V = 0.0; 297c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 298c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 299c8e372f0SJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 300c8e372f0SJeremy L Thompson } 301c8e372f0SJeremy L Thompson } 302c8e372f0SJeremy L Thompson } 303c8e372f0SJeremy L Thompson 304c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 305c8e372f0SJeremy L Thompson // 3D tensor contract y 306c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 307c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 308c8e372f0SJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 309c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 310d6c19ee8SJeremy L Thompson __syncthreads(); 311c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 312c8e372f0SJeremy L Thompson __syncthreads(); 313c8e372f0SJeremy L Thompson *V = 0.0; 314c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 315c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 316c8e372f0SJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 317c8e372f0SJeremy L Thompson } 318c8e372f0SJeremy L Thompson } 319c8e372f0SJeremy L Thompson } 320c8e372f0SJeremy L Thompson 321c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 322c8e372f0SJeremy L Thompson // 3D tensor contract z 323c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 324c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 325c8e372f0SJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 326c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 327d6c19ee8SJeremy L Thompson __syncthreads(); 328c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 329c8e372f0SJeremy L Thompson __syncthreads(); 330c8e372f0SJeremy L Thompson *V = 0.0; 331c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) { 332c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 333c8e372f0SJeremy L Thompson *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 334c8e372f0SJeremy L Thompson } 335c8e372f0SJeremy L Thompson } 336c8e372f0SJeremy L Thompson } 337c8e372f0SJeremy L Thompson 338c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 339c8e372f0SJeremy L Thompson // 3D tensor contract z 340c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 341c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 342c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 343c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 344d6c19ee8SJeremy L Thompson __syncthreads(); 345c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 346c8e372f0SJeremy L Thompson __syncthreads(); 347c8e372f0SJeremy L Thompson *V = 0.0; 348c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 349c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 350c8e372f0SJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 351c8e372f0SJeremy L Thompson } 352c8e372f0SJeremy L Thompson } 353c8e372f0SJeremy L Thompson } 354c8e372f0SJeremy L Thompson 355c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 356c8e372f0SJeremy L Thompson // 3D transpose tensor contract z 357c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 358c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 359c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 360c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 361d6c19ee8SJeremy L Thompson __syncthreads(); 362c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 363c8e372f0SJeremy L Thompson __syncthreads(); 364c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 365c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 366c8e372f0SJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 367c8e372f0SJeremy L Thompson } 368c8e372f0SJeremy L Thompson } 369c8e372f0SJeremy L Thompson } 370c8e372f0SJeremy L Thompson 371c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 372c8e372f0SJeremy L Thompson // 3D transpose tensor contract y 373c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 374c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 375c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 376c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 377d6c19ee8SJeremy L Thompson __syncthreads(); 378c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 379c8e372f0SJeremy L Thompson __syncthreads(); 380c8e372f0SJeremy L Thompson *V = 0.0; 381c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 382c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 383c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 384c8e372f0SJeremy L Thompson } 385c8e372f0SJeremy L Thompson } 386c8e372f0SJeremy L Thompson } 387c8e372f0SJeremy L Thompson 388c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 389c8e372f0SJeremy L Thompson // 3D transpose tensor contract y 390c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 391c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 392c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 393c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 394d6c19ee8SJeremy L Thompson __syncthreads(); 395c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 396c8e372f0SJeremy L Thompson __syncthreads(); 397c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 398c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 399c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 400c8e372f0SJeremy L Thompson } 401c8e372f0SJeremy L Thompson } 402c8e372f0SJeremy L Thompson } 403c8e372f0SJeremy L Thompson 404c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 405c8e372f0SJeremy L Thompson // 3D transpose tensor contract x 406c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 407c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 408c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 409c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 410d6c19ee8SJeremy L Thompson __syncthreads(); 411c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 412c8e372f0SJeremy L Thompson __syncthreads(); 413c8e372f0SJeremy L Thompson *V = 0.0; 414c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 415c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 416c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 417c8e372f0SJeremy L Thompson } 418c8e372f0SJeremy L Thompson } 419c8e372f0SJeremy L Thompson } 420c8e372f0SJeremy L Thompson 421c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 422c8e372f0SJeremy L Thompson // 3D transpose tensor contract add x 423c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 424c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 425c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 426c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 427d6c19ee8SJeremy L Thompson __syncthreads(); 428c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 429c8e372f0SJeremy L Thompson __syncthreads(); 430c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 431c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 432c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 433c8e372f0SJeremy L Thompson } 434c8e372f0SJeremy L Thompson } 435c8e372f0SJeremy L Thompson } 436c8e372f0SJeremy L Thompson 437c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 438c8e372f0SJeremy L Thompson // 3D pack/unpack quadrature values 439c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 440c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 441c8e372f0SJeremy L Thompson inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 442259057edSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); 443c8e372f0SJeremy L Thompson 444c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 445d6c19ee8SJeremy L Thompson __syncthreads(); 446c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp]; 447c8e372f0SJeremy L Thompson __syncthreads(); 448259057edSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0; 449c8e372f0SJeremy L Thompson } 450c8e372f0SJeremy L Thompson } 451c8e372f0SJeremy L Thompson 452c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 453c8e372f0SJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 454259057edSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D); 455c8e372f0SJeremy L Thompson 456c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 457d6c19ee8SJeremy L Thompson __syncthreads(); 458259057edSJeremy L Thompson if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp]; 459c8e372f0SJeremy L Thompson __syncthreads(); 460c8e372f0SJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0; 461c8e372f0SJeremy L Thompson } 462c8e372f0SJeremy L Thompson } 463c8e372f0SJeremy L Thompson 464c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 465c8e372f0SJeremy L Thompson // 3D interpolate to quadrature points 466c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 467c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 468c8e372f0SJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 469c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 470259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 471c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 472c8e372f0SJeremy L Thompson 473ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 474c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 475c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 476c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 477c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 478c8e372f0SJeremy L Thompson } 4793e2e790dSJeremy L Thompson __syncthreads(); 480ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 481ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 482c8e372f0SJeremy L Thompson } 483c8e372f0SJeremy L Thompson 484c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 485c8e372f0SJeremy L Thompson // 3D interpolate transpose 486c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 487c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 488c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 489c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 490259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 491c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 492c8e372f0SJeremy L Thompson 493ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 494c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 495c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 496c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 497c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 498c8e372f0SJeremy L Thompson } 4993e2e790dSJeremy L Thompson __syncthreads(); 500ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 501c8e372f0SJeremy L Thompson } 502c8e372f0SJeremy L Thompson 503c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 504*0ccda8ebSJeremy L Thompson // 3D interpolate to quadrature points, nodes and quadrature points collocated 505*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 506*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 507*0ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 508*0ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) { 509*0ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 510*0ccda8ebSJeremy L Thompson 511*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 512*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 513*0ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp]; 514*0ccda8ebSJeremy L Thompson } 515*0ccda8ebSJeremy L Thompson __syncthreads(); 516*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 517*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 518*0ccda8ebSJeremy L Thompson } 519*0ccda8ebSJeremy L Thompson 520*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 521*0ccda8ebSJeremy L Thompson // 3D interpolate transpose, nodes and quadrature points collocated 522*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 523*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 524*0ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 525*0ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) { 526*0ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 527*0ccda8ebSJeremy L Thompson 528*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 529*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 530*0ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp]; 531*0ccda8ebSJeremy L Thompson } 532*0ccda8ebSJeremy L Thompson __syncthreads(); 533*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 534*0ccda8ebSJeremy L Thompson } 535*0ccda8ebSJeremy L Thompson 536*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 537c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points 538c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 539c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 540c8e372f0SJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 541c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 542259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 543c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 544c8e372f0SJeremy L Thompson 545ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 546c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 547c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1); 548c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 549c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]); 550c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 551c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2); 552c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]); 553c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 554c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 555c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]); 556c8e372f0SJeremy L Thompson } 5573e2e790dSJeremy L Thompson __syncthreads(); 558ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 559ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 560c8e372f0SJeremy L Thompson } 561c8e372f0SJeremy L Thompson 562c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 563c8e372f0SJeremy L Thompson // 3D derivatives transpose 564c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 565c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 566c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 567c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 568259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 569c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 570c8e372f0SJeremy L Thompson 571ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 572c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 573c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1); 574c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 575c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]); 576c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1); 577c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 578c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 579c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1); 580c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 581c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 582c8e372f0SJeremy L Thompson } 5833e2e790dSJeremy L Thompson __syncthreads(); 584ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 585c8e372f0SJeremy L Thompson } 586c8e372f0SJeremy L Thompson 587c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 588c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points 589c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 590c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 591c8e372f0SJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 592c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 593259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 594c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 595c8e372f0SJeremy L Thompson 596ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 597c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 598c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 599c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 600c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 601c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]); 602c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]); 603c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]); 604c8e372f0SJeremy L Thompson } 6053e2e790dSJeremy L Thompson __syncthreads(); 606ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 607ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 608c8e372f0SJeremy L Thompson } 609c8e372f0SJeremy L Thompson 610c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 611c8e372f0SJeremy L Thompson // 3D derivatives transpose 612c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 613c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 614*0ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 615c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 616259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 617c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 618c8e372f0SJeremy L Thompson 619ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 620c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 621*0ccda8ebSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1); 622*0ccda8ebSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 623*0ccda8ebSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]); 624*0ccda8ebSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1); 625*0ccda8ebSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 626*0ccda8ebSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 627*0ccda8ebSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1); 628*0ccda8ebSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 629*0ccda8ebSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 630*0ccda8ebSJeremy L Thompson } 631*0ccda8ebSJeremy L Thompson __syncthreads(); 632*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 633*0ccda8ebSJeremy L Thompson } 634*0ccda8ebSJeremy L Thompson 635*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 636*0ccda8ebSJeremy L Thompson // 3D derivatives at quadrature points, nodes and quadrature points collocated 637*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 638*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 639*0ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 640*0ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 641*0ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 642*0ccda8ebSJeremy L Thompson 643*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 644*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 645*0ccda8ebSJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); 646*0ccda8ebSJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); 647*0ccda8ebSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]); 648*0ccda8ebSJeremy L Thompson } 649*0ccda8ebSJeremy L Thompson __syncthreads(); 650*0ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 651*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 652*0ccda8ebSJeremy L Thompson } 653*0ccda8ebSJeremy L Thompson 654*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 655*0ccda8ebSJeremy L Thompson // 3D derivatives transpose, nodes and quadrature points collocated 656*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------ 657*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 658*0ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 659*0ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 660*0ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 661*0ccda8ebSJeremy L Thompson 662*0ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 663*0ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 664*0ccda8ebSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]); 665*0ccda8ebSJeremy L Thompson ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); 666*0ccda8ebSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); 667c8e372f0SJeremy L Thompson } 6683e2e790dSJeremy L Thompson __syncthreads(); 669ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 670c8e372f0SJeremy L Thompson } 671c8e372f0SJeremy L Thompson 672c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 673c8e372f0SJeremy L Thompson // 3D quadrature weights 674c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 675c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D> 676c8e372f0SJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 677259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); 678c8e372f0SJeremy L Thompson 679c8e372f0SJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; 680c8e372f0SJeremy L Thompson } 681