1c8e372f0SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2c8e372f0SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3c8e372f0SJeremy L Thompson // 4c8e372f0SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5c8e372f0SJeremy L Thompson // 6c8e372f0SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7c8e372f0SJeremy L Thompson 8c8e372f0SJeremy L Thompson /// @file 9c8e372f0SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis templates 10c8e372f0SJeremy L Thompson #include <ceed/types.h> 11c8e372f0SJeremy L Thompson 12c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 13c8e372f0SJeremy L Thompson // 2D 14c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 15c8e372f0SJeremy L Thompson 16c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 17c8e372f0SJeremy L Thompson // 2D tensor contraction x 18c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 19c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 20c8e372f0SJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 21c8e372f0SJeremy L Thompson CeedScalar *V) { 22d6c19ee8SJeremy L Thompson __syncthreads(); 23c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 24c8e372f0SJeremy L Thompson __syncthreads(); 25c8e372f0SJeremy L Thompson *V = 0.0; 26c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 27c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 28c8e372f0SJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 29c8e372f0SJeremy L Thompson } 30c8e372f0SJeremy L Thompson } 31c8e372f0SJeremy L Thompson } 32c8e372f0SJeremy L Thompson 33c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 34c8e372f0SJeremy L Thompson // 2D tensor contract y 35c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 36c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 37c8e372f0SJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 38c8e372f0SJeremy L Thompson CeedScalar *V) { 39d6c19ee8SJeremy L Thompson __syncthreads(); 40c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 41c8e372f0SJeremy L Thompson __syncthreads(); 42c8e372f0SJeremy L Thompson *V = 0.0; 43c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) { 44c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 45c8e372f0SJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 46c8e372f0SJeremy L Thompson } 47c8e372f0SJeremy L Thompson } 48c8e372f0SJeremy L Thompson } 49c8e372f0SJeremy L Thompson 50c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 51c8e372f0SJeremy L Thompson // 2D transpose tensor contract y 52c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 53c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 54c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 55c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 56d6c19ee8SJeremy L Thompson __syncthreads(); 57c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 58c8e372f0SJeremy L Thompson __syncthreads(); 59c8e372f0SJeremy L Thompson *V = 0.0; 60c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 61c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 62c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 63c8e372f0SJeremy L Thompson } 64c8e372f0SJeremy L Thompson } 65c8e372f0SJeremy L Thompson } 66c8e372f0SJeremy L Thompson 67c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 68c8e372f0SJeremy L Thompson // 2D transpose tensor contract x 69c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 70c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 71c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 72c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 73d6c19ee8SJeremy L Thompson __syncthreads(); 74c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 75c8e372f0SJeremy L Thompson __syncthreads(); 76c8e372f0SJeremy L Thompson *V = 0.0; 77c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 78c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 79c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 80c8e372f0SJeremy L Thompson } 81c8e372f0SJeremy L Thompson } 82c8e372f0SJeremy L Thompson } 83c8e372f0SJeremy L Thompson 84c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 85c8e372f0SJeremy L Thompson // 2D transpose tensor contract and add x 86c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 87c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 88c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 89c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 90d6c19ee8SJeremy L Thompson __syncthreads(); 91c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 92c8e372f0SJeremy L Thompson __syncthreads(); 93c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 94c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 95c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 96c8e372f0SJeremy L Thompson } 97c8e372f0SJeremy L Thompson } 98c8e372f0SJeremy L Thompson } 99c8e372f0SJeremy L Thompson 100c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 101c8e372f0SJeremy L Thompson // 2D pack/unpack quadrature values 102c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 103c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 104c8e372f0SJeremy L Thompson inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 105c8e372f0SJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; 106c8e372f0SJeremy L Thompson 107c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 108d6c19ee8SJeremy L Thompson __syncthreads(); 109c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp]; 110c8e372f0SJeremy L Thompson __syncthreads(); 111c8e372f0SJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0; 112c8e372f0SJeremy L Thompson } 113c8e372f0SJeremy L Thompson } 114c8e372f0SJeremy L Thompson 115c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 116c8e372f0SJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 117c8e372f0SJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; 118c8e372f0SJeremy L Thompson 119c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 120d6c19ee8SJeremy L Thompson __syncthreads(); 121c8e372f0SJeremy L Thompson if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp]; 122c8e372f0SJeremy L Thompson __syncthreads(); 123c8e372f0SJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0; 124c8e372f0SJeremy L Thompson } 125c8e372f0SJeremy L Thompson } 126c8e372f0SJeremy L Thompson 127c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 128c8e372f0SJeremy L Thompson // 2D interpolate to quadrature points 129c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 130c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 131c8e372f0SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 132c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 133c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 134c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 135c8e372f0SJeremy L Thompson 136ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 137c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 138c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 139c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 140c8e372f0SJeremy L Thompson } 141*3e2e790dSJeremy L Thompson __syncthreads(); 142ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 143ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 144c8e372f0SJeremy L Thompson } 145c8e372f0SJeremy L Thompson 146c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 147c8e372f0SJeremy L Thompson // 2D interpolate transpose 148c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 149c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 150c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 151c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 152c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 153c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 154c8e372f0SJeremy L Thompson 155ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 156c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 157c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 158c8e372f0SJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 159c8e372f0SJeremy L Thompson } 160*3e2e790dSJeremy L Thompson __syncthreads(); 161ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 162c8e372f0SJeremy L Thompson } 163c8e372f0SJeremy L Thompson 164c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 165c8e372f0SJeremy L Thompson // 2D derivatives at quadrature points 166c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 167c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 168c8e372f0SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 169c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 170c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 171c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 172c8e372f0SJeremy L Thompson 173ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 174c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 175c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t); 176c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 177c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 178c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 179c8e372f0SJeremy L Thompson } 180*3e2e790dSJeremy L Thompson __syncthreads(); 181ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 182ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 183c8e372f0SJeremy L Thompson } 184c8e372f0SJeremy L Thompson 185c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 186c8e372f0SJeremy L Thompson // 2D derivatives transpose 187c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 188c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 189c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 190c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 191c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 192c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 193c8e372f0SJeremy L Thompson 194ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 195c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 196c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 197c8e372f0SJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]); 198c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 199c8e372f0SJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 200c8e372f0SJeremy L Thompson } 201*3e2e790dSJeremy L Thompson __syncthreads(); 202ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 203c8e372f0SJeremy L Thompson } 204c8e372f0SJeremy L Thompson 205c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 206c8e372f0SJeremy L Thompson // 2D quadrature weights 207c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 208c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D> 209c8e372f0SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 210c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; 211c8e372f0SJeremy L Thompson 212c8e372f0SJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; 213c8e372f0SJeremy L Thompson } 214c8e372f0SJeremy L Thompson 215c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 216c8e372f0SJeremy L Thompson // 3D 217c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 218c8e372f0SJeremy L Thompson 219c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 220c8e372f0SJeremy L Thompson // 3D tensor contract x 221c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 222c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 223c8e372f0SJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 224c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 225d6c19ee8SJeremy L Thompson __syncthreads(); 226c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 227c8e372f0SJeremy L Thompson __syncthreads(); 228c8e372f0SJeremy L Thompson *V = 0.0; 229c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 230c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 231c8e372f0SJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 232c8e372f0SJeremy L Thompson } 233c8e372f0SJeremy L Thompson } 234c8e372f0SJeremy L Thompson } 235c8e372f0SJeremy L Thompson 236c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 237c8e372f0SJeremy L Thompson // 3D tensor contract y 238c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 239c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 240c8e372f0SJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 241c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 242d6c19ee8SJeremy L Thompson __syncthreads(); 243c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 244c8e372f0SJeremy L Thompson __syncthreads(); 245c8e372f0SJeremy L Thompson *V = 0.0; 246c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 247c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 248c8e372f0SJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 249c8e372f0SJeremy L Thompson } 250c8e372f0SJeremy L Thompson } 251c8e372f0SJeremy L Thompson } 252c8e372f0SJeremy L Thompson 253c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 254c8e372f0SJeremy L Thompson // 3D tensor contract z 255c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 256c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 257c8e372f0SJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 258c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 259d6c19ee8SJeremy L Thompson __syncthreads(); 260c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 261c8e372f0SJeremy L Thompson __syncthreads(); 262c8e372f0SJeremy L Thompson *V = 0.0; 263c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) { 264c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 265c8e372f0SJeremy L Thompson *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 266c8e372f0SJeremy L Thompson } 267c8e372f0SJeremy L Thompson } 268c8e372f0SJeremy L Thompson } 269c8e372f0SJeremy L Thompson 270c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 271c8e372f0SJeremy L Thompson // 3D tensor contract z 272c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 273c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 274c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 275c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 276d6c19ee8SJeremy L Thompson __syncthreads(); 277c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 278c8e372f0SJeremy L Thompson __syncthreads(); 279c8e372f0SJeremy L Thompson *V = 0.0; 280c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 281c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 282c8e372f0SJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 283c8e372f0SJeremy L Thompson } 284c8e372f0SJeremy L Thompson } 285c8e372f0SJeremy L Thompson } 286c8e372f0SJeremy L Thompson 287c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 288c8e372f0SJeremy L Thompson // 3D transpose tensor contract z 289c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 290c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 291c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 292c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 293d6c19ee8SJeremy L Thompson __syncthreads(); 294c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 295c8e372f0SJeremy L Thompson __syncthreads(); 296c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 297c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 298c8e372f0SJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 299c8e372f0SJeremy L Thompson } 300c8e372f0SJeremy L Thompson } 301c8e372f0SJeremy L Thompson } 302c8e372f0SJeremy L Thompson 303c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 304c8e372f0SJeremy L Thompson // 3D transpose tensor contract y 305c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 306c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 307c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 308c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 309d6c19ee8SJeremy L Thompson __syncthreads(); 310c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 311c8e372f0SJeremy L Thompson __syncthreads(); 312c8e372f0SJeremy L Thompson *V = 0.0; 313c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 314c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 315c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 316c8e372f0SJeremy L Thompson } 317c8e372f0SJeremy L Thompson } 318c8e372f0SJeremy L Thompson } 319c8e372f0SJeremy L Thompson 320c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 321c8e372f0SJeremy L Thompson // 3D transpose tensor contract y 322c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 323c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 324c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 325c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 326d6c19ee8SJeremy L Thompson __syncthreads(); 327c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 328c8e372f0SJeremy L Thompson __syncthreads(); 329c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 330c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 331c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 332c8e372f0SJeremy L Thompson } 333c8e372f0SJeremy L Thompson } 334c8e372f0SJeremy L Thompson } 335c8e372f0SJeremy L Thompson 336c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 337c8e372f0SJeremy L Thompson // 3D transpose tensor contract x 338c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 339c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 340c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 341c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 342d6c19ee8SJeremy L Thompson __syncthreads(); 343c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 344c8e372f0SJeremy L Thompson __syncthreads(); 345c8e372f0SJeremy L Thompson *V = 0.0; 346c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 347c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 348c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 349c8e372f0SJeremy L Thompson } 350c8e372f0SJeremy L Thompson } 351c8e372f0SJeremy L Thompson } 352c8e372f0SJeremy L Thompson 353c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 354c8e372f0SJeremy L Thompson // 3D transpose tensor contract add x 355c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 356c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 357c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 358c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 359d6c19ee8SJeremy L Thompson __syncthreads(); 360c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 361c8e372f0SJeremy L Thompson __syncthreads(); 362c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 363c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 364c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 365c8e372f0SJeremy L Thompson } 366c8e372f0SJeremy L Thompson } 367c8e372f0SJeremy L Thompson } 368c8e372f0SJeremy L Thompson 369c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 370c8e372f0SJeremy L Thompson // 3D pack/unpack quadrature values 371c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 372c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 373c8e372f0SJeremy L Thompson inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 374259057edSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); 375c8e372f0SJeremy L Thompson 376c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 377d6c19ee8SJeremy L Thompson __syncthreads(); 378c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp]; 379c8e372f0SJeremy L Thompson __syncthreads(); 380259057edSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0; 381c8e372f0SJeremy L Thompson } 382c8e372f0SJeremy L Thompson } 383c8e372f0SJeremy L Thompson 384c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 385c8e372f0SJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 386259057edSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D); 387c8e372f0SJeremy L Thompson 388c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 389d6c19ee8SJeremy L Thompson __syncthreads(); 390259057edSJeremy L Thompson if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp]; 391c8e372f0SJeremy L Thompson __syncthreads(); 392c8e372f0SJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0; 393c8e372f0SJeremy L Thompson } 394c8e372f0SJeremy L Thompson } 395c8e372f0SJeremy L Thompson 396c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 397c8e372f0SJeremy L Thompson // 3D interpolate to quadrature points 398c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 399c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 400c8e372f0SJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 401c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 402259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 403c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 404c8e372f0SJeremy L Thompson 405ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 406c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 407c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 408c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 409c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 410c8e372f0SJeremy L Thompson } 411*3e2e790dSJeremy L Thompson __syncthreads(); 412ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 413ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 414c8e372f0SJeremy L Thompson } 415c8e372f0SJeremy L Thompson 416c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 417c8e372f0SJeremy L Thompson // 3D interpolate transpose 418c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 419c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 420c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 421c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 422259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 423c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 424c8e372f0SJeremy L Thompson 425ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 426c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 427c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 428c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 429c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 430c8e372f0SJeremy L Thompson } 431*3e2e790dSJeremy L Thompson __syncthreads(); 432ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 433c8e372f0SJeremy L Thompson } 434c8e372f0SJeremy L Thompson 435c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 436c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points 437c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 438c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 439c8e372f0SJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 440c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 441259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 442c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 443c8e372f0SJeremy L Thompson 444ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 445c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 446c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1); 447c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 448c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]); 449c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 450c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2); 451c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]); 452c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 453c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 454c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]); 455c8e372f0SJeremy L Thompson } 456*3e2e790dSJeremy L Thompson __syncthreads(); 457ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 458ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 459c8e372f0SJeremy L Thompson } 460c8e372f0SJeremy L Thompson 461c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 462c8e372f0SJeremy L Thompson // 3D derivatives transpose 463c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 464c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 465c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 466c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 467259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 468c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 469c8e372f0SJeremy L Thompson 470ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 471c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 472c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1); 473c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 474c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]); 475c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1); 476c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 477c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 478c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1); 479c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 480c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 481c8e372f0SJeremy L Thompson } 482*3e2e790dSJeremy L Thompson __syncthreads(); 483ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 484c8e372f0SJeremy L Thompson } 485c8e372f0SJeremy L Thompson 486c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 487c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points 488c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 489c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 490c8e372f0SJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 491c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 492259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 493c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 494c8e372f0SJeremy L Thompson 495ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 496c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 497c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 498c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 499c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 500c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]); 501c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]); 502c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]); 503c8e372f0SJeremy L Thompson } 504*3e2e790dSJeremy L Thompson __syncthreads(); 505ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 506ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 507c8e372f0SJeremy L Thompson } 508c8e372f0SJeremy L Thompson 509c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 510c8e372f0SJeremy L Thompson // 3D derivatives transpose 511c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 512c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 513c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 514c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 515259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 516c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 517c8e372f0SJeremy L Thompson 518ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 519c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 520c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2); 521c8e372f0SJeremy L Thompson ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2); 522c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2); 523c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 524c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 525c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 526c8e372f0SJeremy L Thompson } 527*3e2e790dSJeremy L Thompson __syncthreads(); 528ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 529c8e372f0SJeremy L Thompson } 530c8e372f0SJeremy L Thompson 531c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 532c8e372f0SJeremy L Thompson // 3D quadrature weights 533c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 534c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D> 535c8e372f0SJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 536259057edSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); 537c8e372f0SJeremy L Thompson 538c8e372f0SJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; 539c8e372f0SJeremy L Thompson } 540