1*c8e372f0SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2*c8e372f0SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3*c8e372f0SJeremy L Thompson // 4*c8e372f0SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5*c8e372f0SJeremy L Thompson // 6*c8e372f0SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7*c8e372f0SJeremy L Thompson 8*c8e372f0SJeremy L Thompson /// @file 9*c8e372f0SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis templates 10*c8e372f0SJeremy L Thompson #include <ceed/types.h> 11*c8e372f0SJeremy L Thompson 12*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 13*c8e372f0SJeremy L Thompson // 2D 14*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 15*c8e372f0SJeremy L Thompson 16*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 17*c8e372f0SJeremy L Thompson // 2D tensor contraction x 18*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 19*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 20*c8e372f0SJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 21*c8e372f0SJeremy L Thompson CeedScalar *V) { 22*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 23*c8e372f0SJeremy L Thompson __syncthreads(); 24*c8e372f0SJeremy L Thompson *V = 0.0; 25*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 26*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 27*c8e372f0SJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 28*c8e372f0SJeremy L Thompson } 29*c8e372f0SJeremy L Thompson } 30*c8e372f0SJeremy L Thompson __syncthreads(); 31*c8e372f0SJeremy L Thompson } 32*c8e372f0SJeremy L Thompson 33*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 34*c8e372f0SJeremy L Thompson // 2D tensor contract y 35*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 36*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 37*c8e372f0SJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 38*c8e372f0SJeremy L Thompson CeedScalar *V) { 39*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 40*c8e372f0SJeremy L Thompson __syncthreads(); 41*c8e372f0SJeremy L Thompson *V = 0.0; 42*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) { 43*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 44*c8e372f0SJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 45*c8e372f0SJeremy L Thompson } 46*c8e372f0SJeremy L Thompson } 47*c8e372f0SJeremy L Thompson __syncthreads(); 48*c8e372f0SJeremy L Thompson } 49*c8e372f0SJeremy L Thompson 50*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 51*c8e372f0SJeremy L Thompson // 2D transpose tensor contract y 52*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 53*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 54*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 55*c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 56*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 57*c8e372f0SJeremy L Thompson __syncthreads(); 58*c8e372f0SJeremy L Thompson *V = 0.0; 59*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) { 60*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 61*c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 62*c8e372f0SJeremy L Thompson } 63*c8e372f0SJeremy L Thompson } 64*c8e372f0SJeremy L Thompson __syncthreads(); 65*c8e372f0SJeremy L Thompson } 66*c8e372f0SJeremy L Thompson 67*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 68*c8e372f0SJeremy L Thompson // 2D transpose tensor contract x 69*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 70*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 71*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 72*c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 73*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 74*c8e372f0SJeremy L Thompson __syncthreads(); 75*c8e372f0SJeremy L Thompson *V = 0.0; 76*c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 77*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 78*c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 79*c8e372f0SJeremy L Thompson } 80*c8e372f0SJeremy L Thompson } 81*c8e372f0SJeremy L Thompson __syncthreads(); 82*c8e372f0SJeremy L Thompson } 83*c8e372f0SJeremy L Thompson 84*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 85*c8e372f0SJeremy L Thompson // 2D transpose tensor contract and add x 86*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 87*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 88*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, 89*c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 90*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U; 91*c8e372f0SJeremy L Thompson __syncthreads(); 92*c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) { 93*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 94*c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 95*c8e372f0SJeremy L Thompson } 96*c8e372f0SJeremy L Thompson } 97*c8e372f0SJeremy L Thompson __syncthreads(); 98*c8e372f0SJeremy L Thompson } 99*c8e372f0SJeremy L Thompson 100*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 101*c8e372f0SJeremy L Thompson // 2D pack/unpack quadrature values 102*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 103*c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 104*c8e372f0SJeremy L Thompson inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 105*c8e372f0SJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; 106*c8e372f0SJeremy L Thompson 107*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 108*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp]; 109*c8e372f0SJeremy L Thompson __syncthreads(); 110*c8e372f0SJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0; 111*c8e372f0SJeremy L Thompson __syncthreads(); 112*c8e372f0SJeremy L Thompson } 113*c8e372f0SJeremy L Thompson } 114*c8e372f0SJeremy L Thompson 115*c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 116*c8e372f0SJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { 117*c8e372f0SJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; 118*c8e372f0SJeremy L Thompson 119*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 120*c8e372f0SJeremy L Thompson if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp]; 121*c8e372f0SJeremy L Thompson __syncthreads(); 122*c8e372f0SJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0; 123*c8e372f0SJeremy L Thompson __syncthreads(); 124*c8e372f0SJeremy L Thompson } 125*c8e372f0SJeremy L Thompson } 126*c8e372f0SJeremy L Thompson 127*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 128*c8e372f0SJeremy L Thompson // 2D interpolate to quadrature points 129*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 130*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 131*c8e372f0SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 132*c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 133*c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 134*c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 135*c8e372f0SJeremy L Thompson 136*c8e372f0SJeremy L Thompson QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 137*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 138*c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 139*c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 140*c8e372f0SJeremy L Thompson } 141*c8e372f0SJeremy L Thompson QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 142*c8e372f0SJeremy L Thompson } 143*c8e372f0SJeremy L Thompson 144*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 145*c8e372f0SJeremy L Thompson // 2D interpolate transpose 146*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 147*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 148*c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 149*c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 150*c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 151*c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 152*c8e372f0SJeremy L Thompson 153*c8e372f0SJeremy L Thompson QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 154*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 155*c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 156*c8e372f0SJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 157*c8e372f0SJeremy L Thompson } 158*c8e372f0SJeremy L Thompson QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 159*c8e372f0SJeremy L Thompson } 160*c8e372f0SJeremy L Thompson 161*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 162*c8e372f0SJeremy L Thompson // 2D derivatives at quadrature points 163*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 164*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 165*c8e372f0SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 166*c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 167*c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 168*c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 169*c8e372f0SJeremy L Thompson 170*c8e372f0SJeremy L Thompson QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U); 171*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 172*c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t); 173*c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 174*c8e372f0SJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 175*c8e372f0SJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 176*c8e372f0SJeremy L Thompson } 177*c8e372f0SJeremy L Thompson QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V); 178*c8e372f0SJeremy L Thompson } 179*c8e372f0SJeremy L Thompson 180*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 181*c8e372f0SJeremy L Thompson // 2D derivatives transpose 182*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 183*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 184*c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 185*c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 186*c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; 187*c8e372f0SJeremy L Thompson CeedScalar r_t[1]; 188*c8e372f0SJeremy L Thompson 189*c8e372f0SJeremy L Thompson QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U); 190*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 191*c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 192*c8e372f0SJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]); 193*c8e372f0SJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 194*c8e372f0SJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 195*c8e372f0SJeremy L Thompson } 196*c8e372f0SJeremy L Thompson QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V); 197*c8e372f0SJeremy L Thompson } 198*c8e372f0SJeremy L Thompson 199*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 200*c8e372f0SJeremy L Thompson // 2D quadrature weights 201*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 202*c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D> 203*c8e372f0SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 204*c8e372f0SJeremy L Thompson const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; 205*c8e372f0SJeremy L Thompson 206*c8e372f0SJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; 207*c8e372f0SJeremy L Thompson } 208*c8e372f0SJeremy L Thompson 209*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 210*c8e372f0SJeremy L Thompson // 3D 211*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 212*c8e372f0SJeremy L Thompson 213*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 214*c8e372f0SJeremy L Thompson // 3D tensor contract x 215*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 216*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 217*c8e372f0SJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 218*c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 219*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 220*c8e372f0SJeremy L Thompson __syncthreads(); 221*c8e372f0SJeremy L Thompson *V = 0.0; 222*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 223*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 224*c8e372f0SJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 225*c8e372f0SJeremy L Thompson } 226*c8e372f0SJeremy L Thompson } 227*c8e372f0SJeremy L Thompson __syncthreads(); 228*c8e372f0SJeremy L Thompson } 229*c8e372f0SJeremy L Thompson 230*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 231*c8e372f0SJeremy L Thompson // 3D tensor contract y 232*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 233*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 234*c8e372f0SJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 235*c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 236*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 237*c8e372f0SJeremy L Thompson __syncthreads(); 238*c8e372f0SJeremy L Thompson *V = 0.0; 239*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 240*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 241*c8e372f0SJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 242*c8e372f0SJeremy L Thompson } 243*c8e372f0SJeremy L Thompson } 244*c8e372f0SJeremy L Thompson __syncthreads(); 245*c8e372f0SJeremy L Thompson } 246*c8e372f0SJeremy L Thompson 247*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 248*c8e372f0SJeremy L Thompson // 3D tensor contract z 249*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 250*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 251*c8e372f0SJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 252*c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 253*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 254*c8e372f0SJeremy L Thompson __syncthreads(); 255*c8e372f0SJeremy L Thompson *V = 0.0; 256*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) { 257*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) { 258*c8e372f0SJeremy L Thompson *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 259*c8e372f0SJeremy L Thompson } 260*c8e372f0SJeremy L Thompson } 261*c8e372f0SJeremy L Thompson __syncthreads(); 262*c8e372f0SJeremy L Thompson } 263*c8e372f0SJeremy L Thompson 264*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 265*c8e372f0SJeremy L Thompson // 3D tensor contract z 266*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 267*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 268*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 269*c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 270*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 271*c8e372f0SJeremy L Thompson __syncthreads(); 272*c8e372f0SJeremy L Thompson *V = 0.0; 273*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 274*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 275*c8e372f0SJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 276*c8e372f0SJeremy L Thompson } 277*c8e372f0SJeremy L Thompson } 278*c8e372f0SJeremy L Thompson __syncthreads(); 279*c8e372f0SJeremy L Thompson } 280*c8e372f0SJeremy L Thompson 281*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 282*c8e372f0SJeremy L Thompson // 3D transpose tensor contract z 283*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 284*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 285*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 286*c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 287*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 288*c8e372f0SJeremy L Thompson __syncthreads(); 289*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) { 290*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 291*c8e372f0SJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction 292*c8e372f0SJeremy L Thompson } 293*c8e372f0SJeremy L Thompson } 294*c8e372f0SJeremy L Thompson __syncthreads(); 295*c8e372f0SJeremy L Thompson } 296*c8e372f0SJeremy L Thompson 297*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 298*c8e372f0SJeremy L Thompson // 3D transpose tensor contract y 299*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 300*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 301*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 302*c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 303*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 304*c8e372f0SJeremy L Thompson __syncthreads(); 305*c8e372f0SJeremy L Thompson *V = 0.0; 306*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 307*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 308*c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 309*c8e372f0SJeremy L Thompson } 310*c8e372f0SJeremy L Thompson } 311*c8e372f0SJeremy L Thompson __syncthreads(); 312*c8e372f0SJeremy L Thompson } 313*c8e372f0SJeremy L Thompson 314*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 315*c8e372f0SJeremy L Thompson // 3D transpose tensor contract y 316*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 317*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 318*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 319*c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 320*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 321*c8e372f0SJeremy L Thompson __syncthreads(); 322*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) { 323*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 324*c8e372f0SJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction 325*c8e372f0SJeremy L Thompson } 326*c8e372f0SJeremy L Thompson } 327*c8e372f0SJeremy L Thompson __syncthreads(); 328*c8e372f0SJeremy L Thompson } 329*c8e372f0SJeremy L Thompson 330*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 331*c8e372f0SJeremy L Thompson // 3D transpose tensor contract x 332*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 333*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 334*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, 335*c8e372f0SJeremy L Thompson const CeedScalar *B, CeedScalar *V) { 336*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 337*c8e372f0SJeremy L Thompson __syncthreads(); 338*c8e372f0SJeremy L Thompson *V = 0.0; 339*c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 340*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 341*c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 342*c8e372f0SJeremy L Thompson } 343*c8e372f0SJeremy L Thompson } 344*c8e372f0SJeremy L Thompson __syncthreads(); 345*c8e372f0SJeremy L Thompson } 346*c8e372f0SJeremy L Thompson 347*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 348*c8e372f0SJeremy L Thompson // 3D transpose tensor contract add x 349*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 350*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 351*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, 352*c8e372f0SJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 353*c8e372f0SJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; 354*c8e372f0SJeremy L Thompson __syncthreads(); 355*c8e372f0SJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) { 356*c8e372f0SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 357*c8e372f0SJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction 358*c8e372f0SJeremy L Thompson } 359*c8e372f0SJeremy L Thompson } 360*c8e372f0SJeremy L Thompson __syncthreads(); 361*c8e372f0SJeremy L Thompson } 362*c8e372f0SJeremy L Thompson 363*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 364*c8e372f0SJeremy L Thompson // 3D pack/unpack quadrature values 365*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 366*c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 367*c8e372f0SJeremy L Thompson inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 368*c8e372f0SJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); 369*c8e372f0SJeremy L Thompson 370*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 371*c8e372f0SJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp]; 372*c8e372f0SJeremy L Thompson __syncthreads(); 373*c8e372f0SJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0; 374*c8e372f0SJeremy L Thompson __syncthreads(); 375*c8e372f0SJeremy L Thompson } 376*c8e372f0SJeremy L Thompson } 377*c8e372f0SJeremy L Thompson 378*c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D> 379*c8e372f0SJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { 380*c8e372f0SJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D); 381*c8e372f0SJeremy L Thompson 382*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 383*c8e372f0SJeremy L Thompson if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp]; 384*c8e372f0SJeremy L Thompson __syncthreads(); 385*c8e372f0SJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0; 386*c8e372f0SJeremy L Thompson __syncthreads(); 387*c8e372f0SJeremy L Thompson } 388*c8e372f0SJeremy L Thompson } 389*c8e372f0SJeremy L Thompson 390*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 391*c8e372f0SJeremy L Thompson // 3D interpolate to quadrature points 392*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 393*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 394*c8e372f0SJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 395*c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 396*c8e372f0SJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 397*c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 398*c8e372f0SJeremy L Thompson 399*c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 400*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 401*c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 402*c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 403*c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 404*c8e372f0SJeremy L Thompson } 405*c8e372f0SJeremy L Thompson QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 406*c8e372f0SJeremy L Thompson } 407*c8e372f0SJeremy L Thompson 408*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 409*c8e372f0SJeremy L Thompson // 3D interpolate transpose 410*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 411*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 412*c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 413*c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 414*c8e372f0SJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 415*c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 416*c8e372f0SJeremy L Thompson 417*c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 418*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 419*c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 420*c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 421*c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 422*c8e372f0SJeremy L Thompson } 423*c8e372f0SJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 424*c8e372f0SJeremy L Thompson } 425*c8e372f0SJeremy L Thompson 426*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 427*c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points 428*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 429*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 430*c8e372f0SJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 431*c8e372f0SJeremy L Thompson CeedScalar *__restrict__ r_V) { 432*c8e372f0SJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 433*c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 434*c8e372f0SJeremy L Thompson 435*c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 436*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 437*c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1); 438*c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 439*c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]); 440*c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 441*c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2); 442*c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]); 443*c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 444*c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 445*c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]); 446*c8e372f0SJeremy L Thompson } 447*c8e372f0SJeremy L Thompson QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 448*c8e372f0SJeremy L Thompson } 449*c8e372f0SJeremy L Thompson 450*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 451*c8e372f0SJeremy L Thompson // 3D derivatives transpose 452*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 453*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 454*c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 455*c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 456*c8e372f0SJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 457*c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 458*c8e372f0SJeremy L Thompson 459*c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 460*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 461*c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1); 462*c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 463*c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]); 464*c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1); 465*c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 466*c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 467*c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1); 468*c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 469*c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]); 470*c8e372f0SJeremy L Thompson } 471*c8e372f0SJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 472*c8e372f0SJeremy L Thompson } 473*c8e372f0SJeremy L Thompson 474*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 475*c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points 476*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 477*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 478*c8e372f0SJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 479*c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 480*c8e372f0SJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 481*c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 482*c8e372f0SJeremy L Thompson 483*c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 484*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 485*c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1); 486*c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 487*c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 488*c8e372f0SJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]); 489*c8e372f0SJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]); 490*c8e372f0SJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]); 491*c8e372f0SJeremy L Thompson } 492*c8e372f0SJeremy L Thompson QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 493*c8e372f0SJeremy L Thompson } 494*c8e372f0SJeremy L Thompson 495*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 496*c8e372f0SJeremy L Thompson // 3D derivatives transpose 497*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 498*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 499*c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 500*c8e372f0SJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 501*c8e372f0SJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); 502*c8e372f0SJeremy L Thompson CeedScalar r_t1[1], r_t2[1]; 503*c8e372f0SJeremy L Thompson 504*c8e372f0SJeremy L Thompson QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U); 505*c8e372f0SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 506*c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2); 507*c8e372f0SJeremy L Thompson ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2); 508*c8e372f0SJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2); 509*c8e372f0SJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1); 510*c8e372f0SJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2); 511*c8e372f0SJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]); 512*c8e372f0SJeremy L Thompson } 513*c8e372f0SJeremy L Thompson QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 514*c8e372f0SJeremy L Thompson } 515*c8e372f0SJeremy L Thompson 516*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 517*c8e372f0SJeremy L Thompson // 3D quadrature weights 518*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------ 519*c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D> 520*c8e372f0SJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 521*c8e372f0SJeremy L Thompson const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); 522*c8e372f0SJeremy L Thompson 523*c8e372f0SJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; 524*c8e372f0SJeremy L Thompson } 525