xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision 343e3094792a64f9c2da70ef2256f98e7dc173cf)
15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for CUDA shared memory tensor product basis templates
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
119e201c85SYohann 
129e201c85SYohann //------------------------------------------------------------------------------
139e201c85SYohann // 1D
149e201c85SYohann //------------------------------------------------------------------------------
159e201c85SYohann 
169e201c85SYohann //------------------------------------------------------------------------------
179e201c85SYohann // 1D tensor contraction x
189e201c85SYohann //------------------------------------------------------------------------------
199e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
209e201c85SYohann inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
219e201c85SYohann   data.slice[data.t_id_x] = *U;
229e201c85SYohann   __syncthreads();
239e201c85SYohann   *V = 0.0;
249e201c85SYohann   if (data.t_id_x < Q_1D) {
259e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
269e201c85SYohann       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
279e201c85SYohann     }
289e201c85SYohann   }
299e201c85SYohann   __syncthreads();
309e201c85SYohann }
319e201c85SYohann 
329e201c85SYohann //------------------------------------------------------------------------------
339e201c85SYohann // 1D transpose tensor contraction x
349e201c85SYohann //------------------------------------------------------------------------------
359e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
369e201c85SYohann inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
379e201c85SYohann   data.slice[data.t_id_x] = *U;
389e201c85SYohann   __syncthreads();
399e201c85SYohann   *V = 0.0;
409e201c85SYohann   if (data.t_id_x < P_1D) {
419e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
429e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
439e201c85SYohann     }
449e201c85SYohann   }
459e201c85SYohann   __syncthreads();
469e201c85SYohann }
479e201c85SYohann 
489e201c85SYohann //------------------------------------------------------------------------------
499e201c85SYohann // 1D interpolate to quadrature points
509e201c85SYohann //------------------------------------------------------------------------------
5199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
529e201c85SYohann inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
539e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54db2becc9SJeremy L Thompson     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
559e201c85SYohann   }
569e201c85SYohann }
579e201c85SYohann 
589e201c85SYohann //------------------------------------------------------------------------------
599e201c85SYohann // 1D interpolate transpose
609e201c85SYohann //------------------------------------------------------------------------------
6199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
622b730f8bSJeremy L Thompson inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
632b730f8bSJeremy L Thompson                                          CeedScalar *__restrict__ r_V) {
649e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65db2becc9SJeremy L Thompson     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
669e201c85SYohann   }
679e201c85SYohann }
689e201c85SYohann 
699e201c85SYohann //------------------------------------------------------------------------------
709e201c85SYohann // 1D derivatives at quadrature points
719e201c85SYohann //------------------------------------------------------------------------------
7299421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
732b730f8bSJeremy L Thompson inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
742b730f8bSJeremy L Thompson                               CeedScalar *__restrict__ r_V) {
759e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76db2becc9SJeremy L Thompson     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
779e201c85SYohann   }
789e201c85SYohann }
799e201c85SYohann 
809e201c85SYohann //------------------------------------------------------------------------------
819e201c85SYohann // 1D derivatives transpose
829e201c85SYohann //------------------------------------------------------------------------------
8399421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
842b730f8bSJeremy L Thompson inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
852b730f8bSJeremy L Thompson                                        CeedScalar *__restrict__ r_V) {
869e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87db2becc9SJeremy L Thompson     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
889e201c85SYohann   }
899e201c85SYohann }
909e201c85SYohann 
919e201c85SYohann //------------------------------------------------------------------------------
929e201c85SYohann // 1D quadrature weights
939e201c85SYohann //------------------------------------------------------------------------------
94*343e3094SJeremy L Thompson template <int P_1D, int Q_1D>
959e201c85SYohann inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
969e201c85SYohann   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
979e201c85SYohann }
989e201c85SYohann 
999e201c85SYohann //------------------------------------------------------------------------------
1009e201c85SYohann // 2D
1019e201c85SYohann //------------------------------------------------------------------------------
1029e201c85SYohann 
1039e201c85SYohann //------------------------------------------------------------------------------
1049e201c85SYohann // 2D tensor contraction x
1059e201c85SYohann //------------------------------------------------------------------------------
10699421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
107*343e3094SJeremy L Thompson inline __device__ void ContractX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
108*343e3094SJeremy L Thompson                                    CeedScalar *V) {
109*343e3094SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
1109e201c85SYohann   __syncthreads();
1119e201c85SYohann   *V = 0.0;
112*343e3094SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
1139e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
114*343e3094SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
1159e201c85SYohann     }
1169e201c85SYohann   }
1179e201c85SYohann   __syncthreads();
1189e201c85SYohann }
1199e201c85SYohann 
1209e201c85SYohann //------------------------------------------------------------------------------
1219e201c85SYohann // 2D tensor contract y
1229e201c85SYohann //------------------------------------------------------------------------------
12399421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
124*343e3094SJeremy L Thompson inline __device__ void ContractY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
125*343e3094SJeremy L Thompson                                    CeedScalar *V) {
126*343e3094SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
1279e201c85SYohann   __syncthreads();
1289e201c85SYohann   *V = 0.0;
129*343e3094SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
1309e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
131*343e3094SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
1329e201c85SYohann     }
1339e201c85SYohann   }
1349e201c85SYohann   __syncthreads();
1359e201c85SYohann }
1369e201c85SYohann 
1379e201c85SYohann //------------------------------------------------------------------------------
1389e201c85SYohann // 2D transpose tensor contract y
1399e201c85SYohann //------------------------------------------------------------------------------
14099421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
141*343e3094SJeremy L Thompson inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
142*343e3094SJeremy L Thompson                                             CeedScalar *V) {
143*343e3094SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
1449e201c85SYohann   __syncthreads();
1459e201c85SYohann   *V = 0.0;
146*343e3094SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
1479e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
148*343e3094SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
1499e201c85SYohann     }
1509e201c85SYohann   }
1519e201c85SYohann   __syncthreads();
1529e201c85SYohann }
1539e201c85SYohann 
1549e201c85SYohann //------------------------------------------------------------------------------
1559e201c85SYohann // 2D transpose tensor contract x
1569e201c85SYohann //------------------------------------------------------------------------------
15799421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
158*343e3094SJeremy L Thompson inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
159*343e3094SJeremy L Thompson                                             CeedScalar *V) {
160*343e3094SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
1619e201c85SYohann   __syncthreads();
1629e201c85SYohann   *V = 0.0;
163*343e3094SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
1649e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
165*343e3094SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
1669e201c85SYohann     }
1679e201c85SYohann   }
1689e201c85SYohann   __syncthreads();
1699e201c85SYohann }
1709e201c85SYohann 
1719e201c85SYohann //------------------------------------------------------------------------------
1729e201c85SYohann // 2D transpose tensor contract and add x
1739e201c85SYohann //------------------------------------------------------------------------------
17499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
175*343e3094SJeremy L Thompson inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
176*343e3094SJeremy L Thompson                                                CeedScalar *V) {
177*343e3094SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
1789e201c85SYohann   __syncthreads();
179*343e3094SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
1809e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
181*343e3094SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
1829e201c85SYohann     }
1839e201c85SYohann   }
1849e201c85SYohann   __syncthreads();
1859e201c85SYohann }
1869e201c85SYohann 
1879e201c85SYohann //------------------------------------------------------------------------------
1889e201c85SYohann // 2D interpolate to quadrature points
1899e201c85SYohann //------------------------------------------------------------------------------
19099421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
191*343e3094SJeremy L Thompson inline __device__ void InterpTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
192*343e3094SJeremy L Thompson                                            const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
1939e201c85SYohann   CeedScalar r_t[1];
1949e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
195*343e3094SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
196*343e3094SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
1979e201c85SYohann   }
1989e201c85SYohann }
1999e201c85SYohann 
200*343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
201*343e3094SJeremy L Thompson inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
202*343e3094SJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
203*343e3094SJeremy L Thompson   InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
204*343e3094SJeremy L Thompson }
205*343e3094SJeremy L Thompson 
2069e201c85SYohann //------------------------------------------------------------------------------
2079e201c85SYohann // 2D interpolate transpose
2089e201c85SYohann //------------------------------------------------------------------------------
20999421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
210*343e3094SJeremy L Thompson inline __device__ void InterpTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
211*343e3094SJeremy L Thompson                                                     const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
2129e201c85SYohann   CeedScalar r_t[1];
2139e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
214*343e3094SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
215*343e3094SJeremy L Thompson     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
2169e201c85SYohann   }
2179e201c85SYohann }
2189e201c85SYohann 
219*343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
220*343e3094SJeremy L Thompson inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
221*343e3094SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
222*343e3094SJeremy L Thompson   InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
223*343e3094SJeremy L Thompson }
224*343e3094SJeremy L Thompson 
2259e201c85SYohann //------------------------------------------------------------------------------
2269e201c85SYohann // 2D derivatives at quadrature points
2279e201c85SYohann //------------------------------------------------------------------------------
22899421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
229*343e3094SJeremy L Thompson inline __device__ void GradTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
230*343e3094SJeremy L Thompson                                          const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
2319e201c85SYohann   CeedScalar r_t[1];
2329e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
233*343e3094SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
234*343e3094SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
235*343e3094SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
236*343e3094SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
2379e201c85SYohann   }
2389e201c85SYohann }
2399e201c85SYohann 
240*343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
241*343e3094SJeremy L Thompson inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
242*343e3094SJeremy L Thompson                                     CeedScalar *__restrict__ r_V) {
243*343e3094SJeremy L Thompson   GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
244*343e3094SJeremy L Thompson }
245*343e3094SJeremy L Thompson 
2469e201c85SYohann //------------------------------------------------------------------------------
2479e201c85SYohann // 2D derivatives transpose
2489e201c85SYohann //------------------------------------------------------------------------------
24999421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
250*343e3094SJeremy L Thompson inline __device__ void GradTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
251*343e3094SJeremy L Thompson                                                   const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
2529e201c85SYohann   CeedScalar r_t[1];
2539e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
254*343e3094SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
255*343e3094SJeremy L Thompson     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
256*343e3094SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
257*343e3094SJeremy L Thompson     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
2589e201c85SYohann   }
2599e201c85SYohann }
2609e201c85SYohann 
261*343e3094SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
262*343e3094SJeremy L Thompson inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
263*343e3094SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
264*343e3094SJeremy L Thompson   GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
265*343e3094SJeremy L Thompson }
266*343e3094SJeremy L Thompson 
2679e201c85SYohann //------------------------------------------------------------------------------
2689e201c85SYohann // 2D quadrature weights
2699e201c85SYohann //------------------------------------------------------------------------------
2709e201c85SYohann template <int Q_1D>
271*343e3094SJeremy L Thompson inline __device__ void WeightTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
272*343e3094SJeremy L Thompson                                            CeedScalar *w) {
273*343e3094SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
274*343e3094SJeremy L Thompson }
275*343e3094SJeremy L Thompson 
276*343e3094SJeremy L Thompson template <int P_1D, int Q_1D>
2779e201c85SYohann inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
278*343e3094SJeremy L Thompson   WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
2799e201c85SYohann }
2809e201c85SYohann 
2819e201c85SYohann //------------------------------------------------------------------------------
2829e201c85SYohann // 3D
2839e201c85SYohann //------------------------------------------------------------------------------
2849e201c85SYohann 
2859e201c85SYohann //------------------------------------------------------------------------------
2869e201c85SYohann // 3D tensor contract x
2879e201c85SYohann //------------------------------------------------------------------------------
28899421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2899e201c85SYohann inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2909e201c85SYohann   CeedScalar r_B[P_1D];
2919e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
2929e201c85SYohann     r_B[i] = B[i + data.t_id_x * P_1D];
2939e201c85SYohann   }
2949e201c85SYohann 
2959e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
2969e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
2979e201c85SYohann     __syncthreads();
2989e201c85SYohann     V[k] = 0.0;
2999e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
3009e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3019e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
3029e201c85SYohann       }
3039e201c85SYohann     }
3049e201c85SYohann     __syncthreads();
3059e201c85SYohann   }
3069e201c85SYohann }
3079e201c85SYohann 
3089e201c85SYohann //------------------------------------------------------------------------------
3099e201c85SYohann // 3D tensor contract y
3109e201c85SYohann //------------------------------------------------------------------------------
31199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3129e201c85SYohann inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3139e201c85SYohann   CeedScalar r_B[P_1D];
3149e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
3159e201c85SYohann     r_B[i] = B[i + data.t_id_y * P_1D];
3169e201c85SYohann   }
3179e201c85SYohann 
3189e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3199e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3209e201c85SYohann     __syncthreads();
3219e201c85SYohann     V[k] = 0.0;
3229e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3239e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3249e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
3259e201c85SYohann       }
3269e201c85SYohann     }
3279e201c85SYohann     __syncthreads();
3289e201c85SYohann   }
3299e201c85SYohann }
3309e201c85SYohann 
3319e201c85SYohann //------------------------------------------------------------------------------
3329e201c85SYohann // 3D tensor contract z
3339e201c85SYohann //------------------------------------------------------------------------------
33499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3359e201c85SYohann inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3369e201c85SYohann   for (CeedInt k = 0; k < Q_1D; k++) {
3379e201c85SYohann     V[k] = 0.0;
3389e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3399e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3409e201c85SYohann         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
3419e201c85SYohann       }
3429e201c85SYohann     }
3439e201c85SYohann   }
3449e201c85SYohann }
3459e201c85SYohann 
3469e201c85SYohann //------------------------------------------------------------------------------
3479e201c85SYohann // 3D transpose tensor contract z
3489e201c85SYohann //------------------------------------------------------------------------------
34999421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3509e201c85SYohann inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3519e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3529e201c85SYohann     V[k] = 0.0;
3539e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3549e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3559e201c85SYohann         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
3569e201c85SYohann       }
3579e201c85SYohann     }
3589e201c85SYohann   }
3599e201c85SYohann }
3609e201c85SYohann 
3619e201c85SYohann //------------------------------------------------------------------------------
3629e201c85SYohann // 3D transpose tensor contract y
3639e201c85SYohann //------------------------------------------------------------------------------
36499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3659e201c85SYohann inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3669e201c85SYohann   CeedScalar r_B[Q_1D];
3679e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
3689e201c85SYohann     r_B[i] = B[data.t_id_y + i * P_1D];
3699e201c85SYohann   }
3709e201c85SYohann 
3719e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3729e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3739e201c85SYohann     __syncthreads();
3749e201c85SYohann     V[k] = 0.0;
3759e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
3769e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3779e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
3789e201c85SYohann       }
3799e201c85SYohann     }
3809e201c85SYohann     __syncthreads();
3819e201c85SYohann   }
3829e201c85SYohann }
3839e201c85SYohann 
3849e201c85SYohann //------------------------------------------------------------------------------
3859e201c85SYohann // 3D transpose tensor contract y
3869e201c85SYohann //------------------------------------------------------------------------------
38799421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3889e201c85SYohann inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3899e201c85SYohann   CeedScalar r_B[Q_1D];
3909e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
3919e201c85SYohann     r_B[i] = B[data.t_id_y + i * P_1D];
3929e201c85SYohann   }
3939e201c85SYohann 
3949e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3959e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3969e201c85SYohann     __syncthreads();
3979e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
3989e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3999e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
4009e201c85SYohann       }
4019e201c85SYohann     }
4029e201c85SYohann     __syncthreads();
4039e201c85SYohann   }
4049e201c85SYohann }
4059e201c85SYohann 
4069e201c85SYohann //------------------------------------------------------------------------------
4079e201c85SYohann // 3D transpose tensor contract x
4089e201c85SYohann //------------------------------------------------------------------------------
40999421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4109e201c85SYohann inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4119e201c85SYohann   CeedScalar r_B[Q_1D];
4129e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4139e201c85SYohann     r_B[i] = B[data.t_id_x + i * P_1D];
4149e201c85SYohann   }
4159e201c85SYohann 
4169e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
4179e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4189e201c85SYohann     __syncthreads();
4199e201c85SYohann     V[k] = 0.0;
4209e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4219e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4229e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
4239e201c85SYohann       }
4249e201c85SYohann     }
4259e201c85SYohann     __syncthreads();
4269e201c85SYohann   }
4279e201c85SYohann }
4289e201c85SYohann 
4299e201c85SYohann //------------------------------------------------------------------------------
4309e201c85SYohann // 3D transpose tensor contract add x
4319e201c85SYohann //------------------------------------------------------------------------------
43299421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4339e201c85SYohann inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4349e201c85SYohann   CeedScalar r_B[Q_1D];
4359e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4369e201c85SYohann     r_B[i] = B[data.t_id_x + i * P_1D];
4379e201c85SYohann   }
4389e201c85SYohann 
4399e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
4409e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4419e201c85SYohann     __syncthreads();
4429e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4439e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4449e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
4459e201c85SYohann       }
4469e201c85SYohann     }
4479e201c85SYohann     __syncthreads();
4489e201c85SYohann   }
4499e201c85SYohann }
4509e201c85SYohann 
4519e201c85SYohann //------------------------------------------------------------------------------
4529e201c85SYohann // 3D interpolate to quadrature points
4539e201c85SYohann //------------------------------------------------------------------------------
45499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4552b730f8bSJeremy L Thompson inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4562b730f8bSJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
4579e201c85SYohann   CeedScalar r_t1[T_1D];
4589e201c85SYohann   CeedScalar r_t2[T_1D];
4599e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
46099421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
46199421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
46299421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
4639e201c85SYohann   }
4649e201c85SYohann }
4659e201c85SYohann 
4669e201c85SYohann //------------------------------------------------------------------------------
4679e201c85SYohann // 3D interpolate transpose
4689e201c85SYohann //------------------------------------------------------------------------------
46999421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4702b730f8bSJeremy L Thompson inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4712b730f8bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
4729e201c85SYohann   CeedScalar r_t1[T_1D];
4739e201c85SYohann   CeedScalar r_t2[T_1D];
4749e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
47599421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
47699421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
47799421279SJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
4789e201c85SYohann   }
4799e201c85SYohann }
4809e201c85SYohann 
4819e201c85SYohann //------------------------------------------------------------------------------
4829e201c85SYohann // 3D derivatives at quadrature points
4839e201c85SYohann //------------------------------------------------------------------------------
48499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4852b730f8bSJeremy L Thompson inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
4862b730f8bSJeremy L Thompson                                     CeedScalar *__restrict__ r_V) {
4879e201c85SYohann   CeedScalar r_t1[T_1D];
4889e201c85SYohann   CeedScalar r_t2[T_1D];
4899e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
49099421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
49199421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
49299421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
49399421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
49499421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
49599421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
49699421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
49799421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
49899421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
4999e201c85SYohann   }
5009e201c85SYohann }
5019e201c85SYohann 
5029e201c85SYohann //------------------------------------------------------------------------------
5039e201c85SYohann // 3D derivatives transpose
5049e201c85SYohann //------------------------------------------------------------------------------
50599421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5062b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5072b730f8bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
5089e201c85SYohann   CeedScalar r_t1[T_1D];
5099e201c85SYohann   CeedScalar r_t2[T_1D];
5109e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
51199421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
51299421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
51399421279SJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
51499421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
51599421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
51699421279SJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
51799421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
51899421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
51999421279SJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5209e201c85SYohann   }
5219e201c85SYohann }
5229e201c85SYohann 
5239e201c85SYohann //------------------------------------------------------------------------------
5249e201c85SYohann // 3D derivatives at quadrature points
5259e201c85SYohann //------------------------------------------------------------------------------
52699421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5272b730f8bSJeremy L Thompson inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5282b730f8bSJeremy L Thompson                                               CeedScalar *__restrict__ r_V) {
5299e201c85SYohann   CeedScalar r_t1[T_1D];
5309e201c85SYohann   CeedScalar r_t2[T_1D];
5319e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
53299421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
53399421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
53499421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
53599421279SJeremy L Thompson     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
53699421279SJeremy L Thompson     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
53799421279SJeremy L Thompson     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
5389e201c85SYohann   }
5399e201c85SYohann }
5409e201c85SYohann 
5419e201c85SYohann //------------------------------------------------------------------------------
5429e201c85SYohann // 3D derivatives transpose
5439e201c85SYohann //------------------------------------------------------------------------------
54499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5452b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5462b730f8bSJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
5479e201c85SYohann   CeedScalar r_t1[T_1D];
5489e201c85SYohann   CeedScalar r_t2[T_1D];
5499e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
55099421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
55199421279SJeremy L Thompson     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
55299421279SJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
55399421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
55499421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
55599421279SJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5569e201c85SYohann   }
5579e201c85SYohann }
5589e201c85SYohann 
5599e201c85SYohann //------------------------------------------------------------------------------
5609e201c85SYohann // 3D quadrature weights
5619e201c85SYohann //------------------------------------------------------------------------------
562*343e3094SJeremy L Thompson template <int P_1D, int Q_1D>
5639e201c85SYohann inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
5649e201c85SYohann   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
5659e201c85SYohann   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
5669e201c85SYohann   for (CeedInt q = 0; q < Q_1D; q++) {
5679e201c85SYohann     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
5689e201c85SYohann   }
5699e201c85SYohann }
570