xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision 94b7b29b41ad8a17add4c577886859ef16f89dec)
19e201c85SYohann // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for CUDA shared memory tensor product basis templates
10*94b7b29bSJeremy L Thompson #ifndef CEED_CUDA_SHARED_BASIS_TENSOR_TEMPLATES_H
11*94b7b29bSJeremy L Thompson #define CEED_CUDA_SHARED_BASIS_TENSOR_TEMPLATES_H
129e201c85SYohann 
139e201c85SYohann #include <ceed.h>
149e201c85SYohann 
159e201c85SYohann //------------------------------------------------------------------------------
169e201c85SYohann // 1D
179e201c85SYohann //------------------------------------------------------------------------------
189e201c85SYohann 
199e201c85SYohann //------------------------------------------------------------------------------
209e201c85SYohann // 1D tensor contraction x
219e201c85SYohann //------------------------------------------------------------------------------
229e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
239e201c85SYohann inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
249e201c85SYohann   data.slice[data.t_id_x] = *U;
259e201c85SYohann   __syncthreads();
269e201c85SYohann   *V = 0.0;
279e201c85SYohann   if (data.t_id_x < Q_1D) {
289e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
299e201c85SYohann       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
309e201c85SYohann     }
319e201c85SYohann   }
329e201c85SYohann   __syncthreads();
339e201c85SYohann }
349e201c85SYohann 
359e201c85SYohann //------------------------------------------------------------------------------
369e201c85SYohann // 1D transpose tensor contraction x
379e201c85SYohann //------------------------------------------------------------------------------
389e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
399e201c85SYohann inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
409e201c85SYohann   data.slice[data.t_id_x] = *U;
419e201c85SYohann   __syncthreads();
429e201c85SYohann   *V = 0.0;
439e201c85SYohann   if (data.t_id_x < P_1D) {
449e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
459e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
469e201c85SYohann     }
479e201c85SYohann   }
489e201c85SYohann   __syncthreads();
499e201c85SYohann }
509e201c85SYohann 
519e201c85SYohann //------------------------------------------------------------------------------
529e201c85SYohann // 1D interpolate to quadrature points
539e201c85SYohann //------------------------------------------------------------------------------
549e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
559e201c85SYohann inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
569e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
579e201c85SYohann     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
589e201c85SYohann   }
599e201c85SYohann }
609e201c85SYohann 
619e201c85SYohann //------------------------------------------------------------------------------
629e201c85SYohann // 1D interpolate transpose
639e201c85SYohann //------------------------------------------------------------------------------
649e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
652b730f8bSJeremy L Thompson inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
662b730f8bSJeremy L Thompson                                          CeedScalar *__restrict__ r_V) {
679e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
689e201c85SYohann     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
699e201c85SYohann   }
709e201c85SYohann }
719e201c85SYohann 
729e201c85SYohann //------------------------------------------------------------------------------
739e201c85SYohann // 1D derivatives at quadrature points
749e201c85SYohann //------------------------------------------------------------------------------
759e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
762b730f8bSJeremy L Thompson inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
772b730f8bSJeremy L Thompson                               CeedScalar *__restrict__ r_V) {
789e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
799e201c85SYohann     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
809e201c85SYohann   }
819e201c85SYohann }
829e201c85SYohann 
839e201c85SYohann //------------------------------------------------------------------------------
849e201c85SYohann // 1D derivatives transpose
859e201c85SYohann //------------------------------------------------------------------------------
869e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
872b730f8bSJeremy L Thompson inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
882b730f8bSJeremy L Thompson                                        CeedScalar *__restrict__ r_V) {
899e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
909e201c85SYohann     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
919e201c85SYohann   }
929e201c85SYohann }
939e201c85SYohann 
949e201c85SYohann //------------------------------------------------------------------------------
959e201c85SYohann // 1D quadrature weights
969e201c85SYohann //------------------------------------------------------------------------------
979e201c85SYohann template <int Q_1D>
989e201c85SYohann inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
999e201c85SYohann   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
1009e201c85SYohann }
1019e201c85SYohann 
1029e201c85SYohann //------------------------------------------------------------------------------
1039e201c85SYohann // 2D
1049e201c85SYohann //------------------------------------------------------------------------------
1059e201c85SYohann 
1069e201c85SYohann //------------------------------------------------------------------------------
1079e201c85SYohann // 2D tensor contraction x
1089e201c85SYohann //------------------------------------------------------------------------------
1099e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
1109e201c85SYohann inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1119e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1129e201c85SYohann   __syncthreads();
1139e201c85SYohann   *V = 0.0;
1149e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
1159e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
1169e201c85SYohann       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
1179e201c85SYohann     }
1189e201c85SYohann   }
1199e201c85SYohann   __syncthreads();
1209e201c85SYohann }
1219e201c85SYohann 
1229e201c85SYohann //------------------------------------------------------------------------------
1239e201c85SYohann // 2D tensor contract y
1249e201c85SYohann //------------------------------------------------------------------------------
1259e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
1269e201c85SYohann inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1279e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1289e201c85SYohann   __syncthreads();
1299e201c85SYohann   *V = 0.0;
1309e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
1319e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
1329e201c85SYohann       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
1339e201c85SYohann     }
1349e201c85SYohann   }
1359e201c85SYohann   __syncthreads();
1369e201c85SYohann }
1379e201c85SYohann 
1389e201c85SYohann //------------------------------------------------------------------------------
1399e201c85SYohann // 2D transpose tensor contract y
1409e201c85SYohann //------------------------------------------------------------------------------
1419e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
1429e201c85SYohann inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1439e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1449e201c85SYohann   __syncthreads();
1459e201c85SYohann   *V = 0.0;
1469e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
1479e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
1489e201c85SYohann       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
1499e201c85SYohann     }
1509e201c85SYohann   }
1519e201c85SYohann   __syncthreads();
1529e201c85SYohann }
1539e201c85SYohann 
1549e201c85SYohann //------------------------------------------------------------------------------
1559e201c85SYohann // 2D transpose tensor contract x
1569e201c85SYohann //------------------------------------------------------------------------------
1579e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
1589e201c85SYohann inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1599e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1609e201c85SYohann   __syncthreads();
1619e201c85SYohann   *V = 0.0;
1629e201c85SYohann   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
1639e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
1649e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
1659e201c85SYohann     }
1669e201c85SYohann   }
1679e201c85SYohann   __syncthreads();
1689e201c85SYohann }
1699e201c85SYohann 
1709e201c85SYohann //------------------------------------------------------------------------------
1719e201c85SYohann // 2D transpose tensor contract and add x
1729e201c85SYohann //------------------------------------------------------------------------------
1739e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
1749e201c85SYohann inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1759e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1769e201c85SYohann   __syncthreads();
1779e201c85SYohann   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
1789e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
1799e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
1809e201c85SYohann     }
1819e201c85SYohann   }
1829e201c85SYohann   __syncthreads();
1839e201c85SYohann }
1849e201c85SYohann 
1859e201c85SYohann //------------------------------------------------------------------------------
1869e201c85SYohann // 2D interpolate to quadrature points
1879e201c85SYohann //------------------------------------------------------------------------------
1889e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
1892b730f8bSJeremy L Thompson inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1902b730f8bSJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
1919e201c85SYohann   CeedScalar r_t[1];
1929e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1939e201c85SYohann     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
1949e201c85SYohann     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
1959e201c85SYohann   }
1969e201c85SYohann }
1979e201c85SYohann 
1989e201c85SYohann //------------------------------------------------------------------------------
1999e201c85SYohann // 2D interpolate transpose
2009e201c85SYohann //------------------------------------------------------------------------------
2019e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
2022b730f8bSJeremy L Thompson inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2032b730f8bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
2049e201c85SYohann   CeedScalar r_t[1];
2059e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2069e201c85SYohann     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
2079e201c85SYohann     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
2089e201c85SYohann   }
2099e201c85SYohann }
2109e201c85SYohann 
2119e201c85SYohann //------------------------------------------------------------------------------
2129e201c85SYohann // 2D derivatives at quadrature points
2139e201c85SYohann //------------------------------------------------------------------------------
2149e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
2152b730f8bSJeremy L Thompson inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
2162b730f8bSJeremy L Thompson                                     CeedScalar *__restrict__ r_V) {
2179e201c85SYohann   CeedScalar r_t[1];
2189e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2199e201c85SYohann     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t);
2209e201c85SYohann     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0 * NUM_COMP);
2219e201c85SYohann     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
2229e201c85SYohann     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1 * NUM_COMP);
2239e201c85SYohann   }
2249e201c85SYohann }
2259e201c85SYohann 
2269e201c85SYohann //------------------------------------------------------------------------------
2279e201c85SYohann // 2D derivatives transpose
2289e201c85SYohann //------------------------------------------------------------------------------
2299e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
2302b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
2312b730f8bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
2329e201c85SYohann   CeedScalar r_t[1];
2339e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2349e201c85SYohann     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0 * NUM_COMP, c_B, r_t);
2359e201c85SYohann     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp);
2369e201c85SYohann     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1 * NUM_COMP, c_G, r_t);
2379e201c85SYohann     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
2389e201c85SYohann   }
2399e201c85SYohann }
2409e201c85SYohann 
2419e201c85SYohann //------------------------------------------------------------------------------
2429e201c85SYohann // 2D quadrature weights
2439e201c85SYohann //------------------------------------------------------------------------------
2449e201c85SYohann template <int Q_1D>
2459e201c85SYohann inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
2462b730f8bSJeremy L Thompson   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
2479e201c85SYohann }
2489e201c85SYohann 
2499e201c85SYohann //------------------------------------------------------------------------------
2509e201c85SYohann // 3D
2519e201c85SYohann //------------------------------------------------------------------------------
2529e201c85SYohann 
2539e201c85SYohann //------------------------------------------------------------------------------
2549e201c85SYohann // 3D tensor contract x
2559e201c85SYohann //------------------------------------------------------------------------------
2569e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
2579e201c85SYohann inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2589e201c85SYohann   CeedScalar r_B[P_1D];
2599e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
2609e201c85SYohann     r_B[i] = B[i + data.t_id_x * P_1D];
2619e201c85SYohann   }
2629e201c85SYohann 
2639e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
2649e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
2659e201c85SYohann     __syncthreads();
2669e201c85SYohann     V[k] = 0.0;
2679e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
2689e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
2699e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
2709e201c85SYohann       }
2719e201c85SYohann     }
2729e201c85SYohann     __syncthreads();
2739e201c85SYohann   }
2749e201c85SYohann }
2759e201c85SYohann 
2769e201c85SYohann //------------------------------------------------------------------------------
2779e201c85SYohann // 3D tensor contract y
2789e201c85SYohann //------------------------------------------------------------------------------
2799e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
2809e201c85SYohann inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2819e201c85SYohann   CeedScalar r_B[P_1D];
2829e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
2839e201c85SYohann     r_B[i] = B[i + data.t_id_y * P_1D];
2849e201c85SYohann   }
2859e201c85SYohann 
2869e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
2879e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
2889e201c85SYohann     __syncthreads();
2899e201c85SYohann     V[k] = 0.0;
2909e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
2919e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
2929e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
2939e201c85SYohann       }
2949e201c85SYohann     }
2959e201c85SYohann     __syncthreads();
2969e201c85SYohann   }
2979e201c85SYohann }
2989e201c85SYohann 
2999e201c85SYohann //------------------------------------------------------------------------------
3009e201c85SYohann // 3D tensor contract z
3019e201c85SYohann //------------------------------------------------------------------------------
3029e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
3039e201c85SYohann inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3049e201c85SYohann   for (CeedInt k = 0; k < Q_1D; k++) {
3059e201c85SYohann     V[k] = 0.0;
3069e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3079e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3089e201c85SYohann         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
3099e201c85SYohann       }
3109e201c85SYohann     }
3119e201c85SYohann   }
3129e201c85SYohann }
3139e201c85SYohann 
3149e201c85SYohann //------------------------------------------------------------------------------
3159e201c85SYohann // 3D transpose tensor contract z
3169e201c85SYohann //------------------------------------------------------------------------------
3179e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
3189e201c85SYohann inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3199e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3209e201c85SYohann     V[k] = 0.0;
3219e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3229e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3239e201c85SYohann         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
3249e201c85SYohann       }
3259e201c85SYohann     }
3269e201c85SYohann   }
3279e201c85SYohann }
3289e201c85SYohann 
3299e201c85SYohann //------------------------------------------------------------------------------
3309e201c85SYohann // 3D transpose tensor contract y
3319e201c85SYohann //------------------------------------------------------------------------------
3329e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
3339e201c85SYohann inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3349e201c85SYohann   CeedScalar r_B[Q_1D];
3359e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
3369e201c85SYohann     r_B[i] = B[data.t_id_y + i * P_1D];
3379e201c85SYohann   }
3389e201c85SYohann 
3399e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3409e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3419e201c85SYohann     __syncthreads();
3429e201c85SYohann     V[k] = 0.0;
3439e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
3449e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3459e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
3469e201c85SYohann       }
3479e201c85SYohann     }
3489e201c85SYohann     __syncthreads();
3499e201c85SYohann   }
3509e201c85SYohann }
3519e201c85SYohann 
3529e201c85SYohann //------------------------------------------------------------------------------
3539e201c85SYohann // 3D transpose tensor contract y
3549e201c85SYohann //------------------------------------------------------------------------------
3559e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
3569e201c85SYohann inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3579e201c85SYohann   CeedScalar r_B[Q_1D];
3589e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
3599e201c85SYohann     r_B[i] = B[data.t_id_y + i * P_1D];
3609e201c85SYohann   }
3619e201c85SYohann 
3629e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3639e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3649e201c85SYohann     __syncthreads();
3659e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
3669e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3679e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
3689e201c85SYohann       }
3699e201c85SYohann     }
3709e201c85SYohann     __syncthreads();
3719e201c85SYohann   }
3729e201c85SYohann }
3739e201c85SYohann 
3749e201c85SYohann //------------------------------------------------------------------------------
3759e201c85SYohann // 3D transpose tensor contract x
3769e201c85SYohann //------------------------------------------------------------------------------
3779e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
3789e201c85SYohann inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3799e201c85SYohann   CeedScalar r_B[Q_1D];
3809e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
3819e201c85SYohann     r_B[i] = B[data.t_id_x + i * P_1D];
3829e201c85SYohann   }
3839e201c85SYohann 
3849e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3859e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3869e201c85SYohann     __syncthreads();
3879e201c85SYohann     V[k] = 0.0;
3889e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
3899e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3909e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
3919e201c85SYohann       }
3929e201c85SYohann     }
3939e201c85SYohann     __syncthreads();
3949e201c85SYohann   }
3959e201c85SYohann }
3969e201c85SYohann 
3979e201c85SYohann //------------------------------------------------------------------------------
3989e201c85SYohann // 3D transpose tensor contract add x
3999e201c85SYohann //------------------------------------------------------------------------------
4009e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
4019e201c85SYohann inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4029e201c85SYohann   CeedScalar r_B[Q_1D];
4039e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4049e201c85SYohann     r_B[i] = B[data.t_id_x + i * P_1D];
4059e201c85SYohann   }
4069e201c85SYohann 
4079e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
4089e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4099e201c85SYohann     __syncthreads();
4109e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4119e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4129e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
4139e201c85SYohann       }
4149e201c85SYohann     }
4159e201c85SYohann     __syncthreads();
4169e201c85SYohann   }
4179e201c85SYohann }
4189e201c85SYohann 
4199e201c85SYohann //------------------------------------------------------------------------------
4209e201c85SYohann // 3D interpolate to quadrature points
4219e201c85SYohann //------------------------------------------------------------------------------
4229e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
4232b730f8bSJeremy L Thompson inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4242b730f8bSJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
4259e201c85SYohann   CeedScalar r_t1[T_1D];
4269e201c85SYohann   CeedScalar r_t2[T_1D];
4279e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4289e201c85SYohann     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
4299e201c85SYohann     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
4309e201c85SYohann     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D);
4319e201c85SYohann   }
4329e201c85SYohann }
4339e201c85SYohann 
4349e201c85SYohann //------------------------------------------------------------------------------
4359e201c85SYohann // 3D interpolate transpose
4369e201c85SYohann //------------------------------------------------------------------------------
4379e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
4382b730f8bSJeremy L Thompson inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4392b730f8bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
4409e201c85SYohann   CeedScalar r_t1[T_1D];
4419e201c85SYohann   CeedScalar r_t2[T_1D];
4429e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4439e201c85SYohann     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D, c_B, r_t1);
4449e201c85SYohann     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
4459e201c85SYohann     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
4469e201c85SYohann   }
4479e201c85SYohann }
4489e201c85SYohann 
4499e201c85SYohann //------------------------------------------------------------------------------
4509e201c85SYohann // 3D derivatives at quadrature points
4519e201c85SYohann //------------------------------------------------------------------------------
4529e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
4532b730f8bSJeremy L Thompson inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
4542b730f8bSJeremy L Thompson                                     CeedScalar *__restrict__ r_V) {
4559e201c85SYohann   CeedScalar r_t1[T_1D];
4569e201c85SYohann   CeedScalar r_t2[T_1D];
4579e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4589e201c85SYohann     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_G, r_t1);
4599e201c85SYohann     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
4609e201c85SYohann     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
4619e201c85SYohann     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
4629e201c85SYohann     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
4639e201c85SYohann     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
4649e201c85SYohann     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
4659e201c85SYohann     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
4669e201c85SYohann     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
4679e201c85SYohann   }
4689e201c85SYohann }
4699e201c85SYohann 
4709e201c85SYohann //------------------------------------------------------------------------------
4719e201c85SYohann // 3D derivatives transpose
4729e201c85SYohann //------------------------------------------------------------------------------
4739e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
4742b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
4752b730f8bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
4769e201c85SYohann   CeedScalar r_t1[T_1D];
4779e201c85SYohann   CeedScalar r_t2[T_1D];
4789e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4799e201c85SYohann     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1);
4809e201c85SYohann     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
4819e201c85SYohann     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * P_1D);
4829e201c85SYohann     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1);
4839e201c85SYohann     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
4849e201c85SYohann     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
4859e201c85SYohann     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1);
4869e201c85SYohann     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
4879e201c85SYohann     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
4889e201c85SYohann   }
4899e201c85SYohann }
4909e201c85SYohann 
4919e201c85SYohann //------------------------------------------------------------------------------
4929e201c85SYohann // 3D derivatives at quadrature points
4939e201c85SYohann //------------------------------------------------------------------------------
4949e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
4952b730f8bSJeremy L Thompson inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
4962b730f8bSJeremy L Thompson                                               CeedScalar *__restrict__ r_V) {
4979e201c85SYohann   CeedScalar r_t1[T_1D];
4989e201c85SYohann   CeedScalar r_t2[T_1D];
4999e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5009e201c85SYohann     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
5019e201c85SYohann     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
5029e201c85SYohann     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
5039e201c85SYohann     ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
5049e201c85SYohann     ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
5059e201c85SYohann     ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
5069e201c85SYohann   }
5079e201c85SYohann }
5089e201c85SYohann 
5099e201c85SYohann //------------------------------------------------------------------------------
5109e201c85SYohann // 3D derivatives transpose
5119e201c85SYohann //------------------------------------------------------------------------------
5129e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
5132b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5142b730f8bSJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
5159e201c85SYohann   CeedScalar r_t1[T_1D];
5169e201c85SYohann   CeedScalar r_t2[T_1D];
5179e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5189e201c85SYohann     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2);
5199e201c85SYohann     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2);
5209e201c85SYohann     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2);
5219e201c85SYohann     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
5229e201c85SYohann     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
5239e201c85SYohann     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
5249e201c85SYohann   }
5259e201c85SYohann }
5269e201c85SYohann 
5279e201c85SYohann //------------------------------------------------------------------------------
5289e201c85SYohann // 3D quadrature weights
5299e201c85SYohann //------------------------------------------------------------------------------
5309e201c85SYohann template <int Q_1D>
5319e201c85SYohann inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
5329e201c85SYohann   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
5339e201c85SYohann   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
5349e201c85SYohann   for (CeedInt q = 0; q < Q_1D; q++) {
5359e201c85SYohann     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
5369e201c85SYohann   }
5379e201c85SYohann }
5389e201c85SYohann 
5399e201c85SYohann //------------------------------------------------------------------------------
5409e201c85SYohann 
541*94b7b29bSJeremy L Thompson #endif  // CEED_CUDA_SHARED_BASIS_TENSOR_TEMPLATES_H
542