xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision 2129291034139a1db9ffe414bf8bc92a8e43e245)
1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for CUDA shared memory tensor product basis templates
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
119e201c85SYohann 
129e201c85SYohann //------------------------------------------------------------------------------
139e201c85SYohann // 1D
149e201c85SYohann //------------------------------------------------------------------------------
159e201c85SYohann 
169e201c85SYohann //------------------------------------------------------------------------------
179e201c85SYohann // 1D tensor contraction x
189e201c85SYohann //------------------------------------------------------------------------------
199e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
209e201c85SYohann inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21d6c19ee8SJeremy L Thompson   __syncthreads();
229e201c85SYohann   data.slice[data.t_id_x] = *U;
239e201c85SYohann   __syncthreads();
249e201c85SYohann   *V = 0.0;
259e201c85SYohann   if (data.t_id_x < Q_1D) {
269e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
279e201c85SYohann       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
289e201c85SYohann     }
299e201c85SYohann   }
309e201c85SYohann }
319e201c85SYohann 
329e201c85SYohann //------------------------------------------------------------------------------
339e201c85SYohann // 1D transpose tensor contraction x
349e201c85SYohann //------------------------------------------------------------------------------
359e201c85SYohann template <int NUM_COMP, int P_1D, int Q_1D>
369e201c85SYohann inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37d6c19ee8SJeremy L Thompson   __syncthreads();
389e201c85SYohann   data.slice[data.t_id_x] = *U;
399e201c85SYohann   __syncthreads();
409e201c85SYohann   *V = 0.0;
419e201c85SYohann   if (data.t_id_x < P_1D) {
429e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
439e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
449e201c85SYohann     }
459e201c85SYohann   }
469e201c85SYohann }
479e201c85SYohann 
489e201c85SYohann //------------------------------------------------------------------------------
499e201c85SYohann // 1D interpolate to quadrature points
509e201c85SYohann //------------------------------------------------------------------------------
5199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
529e201c85SYohann inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
539e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54db2becc9SJeremy L Thompson     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
559e201c85SYohann   }
569e201c85SYohann }
579e201c85SYohann 
589e201c85SYohann //------------------------------------------------------------------------------
599e201c85SYohann // 1D interpolate transpose
609e201c85SYohann //------------------------------------------------------------------------------
6199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
622b730f8bSJeremy L Thompson inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
632b730f8bSJeremy L Thompson                                          CeedScalar *__restrict__ r_V) {
649e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65db2becc9SJeremy L Thompson     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
669e201c85SYohann   }
679e201c85SYohann }
689e201c85SYohann 
699e201c85SYohann //------------------------------------------------------------------------------
709e201c85SYohann // 1D derivatives at quadrature points
719e201c85SYohann //------------------------------------------------------------------------------
7299421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
732b730f8bSJeremy L Thompson inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
742b730f8bSJeremy L Thompson                               CeedScalar *__restrict__ r_V) {
759e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76db2becc9SJeremy L Thompson     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
779e201c85SYohann   }
789e201c85SYohann }
799e201c85SYohann 
809e201c85SYohann //------------------------------------------------------------------------------
819e201c85SYohann // 1D derivatives transpose
829e201c85SYohann //------------------------------------------------------------------------------
8399421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
842b730f8bSJeremy L Thompson inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
852b730f8bSJeremy L Thompson                                        CeedScalar *__restrict__ r_V) {
869e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87db2becc9SJeremy L Thompson     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
889e201c85SYohann   }
899e201c85SYohann }
909e201c85SYohann 
919e201c85SYohann //------------------------------------------------------------------------------
929e201c85SYohann // 1D quadrature weights
939e201c85SYohann //------------------------------------------------------------------------------
94343e3094SJeremy L Thompson template <int P_1D, int Q_1D>
959e201c85SYohann inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
969e201c85SYohann   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
979e201c85SYohann }
989e201c85SYohann 
999e201c85SYohann //------------------------------------------------------------------------------
1009e201c85SYohann // 2D
1019e201c85SYohann //------------------------------------------------------------------------------
1029e201c85SYohann 
1039e201c85SYohann //------------------------------------------------------------------------------
1049e201c85SYohann // 2D tensor contraction x
1059e201c85SYohann //------------------------------------------------------------------------------
10699421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1079e201c85SYohann inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
108d6c19ee8SJeremy L Thompson   __syncthreads();
1099e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1109e201c85SYohann   __syncthreads();
1119e201c85SYohann   *V = 0.0;
1129e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
1139e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
1149e201c85SYohann       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
1159e201c85SYohann     }
1169e201c85SYohann   }
1179e201c85SYohann }
1189e201c85SYohann 
1199e201c85SYohann //------------------------------------------------------------------------------
1209e201c85SYohann // 2D tensor contract y
1219e201c85SYohann //------------------------------------------------------------------------------
12299421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1239e201c85SYohann inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
124d6c19ee8SJeremy L Thompson   __syncthreads();
1259e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1269e201c85SYohann   __syncthreads();
1279e201c85SYohann   *V = 0.0;
1289e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
1299e201c85SYohann     for (CeedInt i = 0; i < P_1D; i++) {
1309e201c85SYohann       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
1319e201c85SYohann     }
1329e201c85SYohann   }
1339e201c85SYohann }
1349e201c85SYohann 
1359e201c85SYohann //------------------------------------------------------------------------------
1369e201c85SYohann // 2D transpose tensor contract y
1379e201c85SYohann //------------------------------------------------------------------------------
13899421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1399e201c85SYohann inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
140d6c19ee8SJeremy L Thompson   __syncthreads();
1419e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1429e201c85SYohann   __syncthreads();
1439e201c85SYohann   *V = 0.0;
1449e201c85SYohann   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
1459e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
1469e201c85SYohann       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
1479e201c85SYohann     }
1489e201c85SYohann   }
1499e201c85SYohann }
1509e201c85SYohann 
1519e201c85SYohann //------------------------------------------------------------------------------
1529e201c85SYohann // 2D transpose tensor contract x
1539e201c85SYohann //------------------------------------------------------------------------------
15499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1559e201c85SYohann inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
156d6c19ee8SJeremy L Thompson   __syncthreads();
1579e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1589e201c85SYohann   __syncthreads();
1599e201c85SYohann   *V = 0.0;
1609e201c85SYohann   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
1619e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
1629e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
1639e201c85SYohann     }
1649e201c85SYohann   }
1659e201c85SYohann }
1669e201c85SYohann 
1679e201c85SYohann //------------------------------------------------------------------------------
1689e201c85SYohann // 2D transpose tensor contract and add x
1699e201c85SYohann //------------------------------------------------------------------------------
17099421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1719e201c85SYohann inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
172d6c19ee8SJeremy L Thompson   __syncthreads();
1739e201c85SYohann   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
1749e201c85SYohann   __syncthreads();
1759e201c85SYohann   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
1769e201c85SYohann     for (CeedInt i = 0; i < Q_1D; i++) {
1779e201c85SYohann       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
1789e201c85SYohann     }
1799e201c85SYohann   }
1809e201c85SYohann }
1819e201c85SYohann 
1829e201c85SYohann //------------------------------------------------------------------------------
1839e201c85SYohann // 2D interpolate to quadrature points
1849e201c85SYohann //------------------------------------------------------------------------------
18599421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1862b730f8bSJeremy L Thompson inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1872b730f8bSJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
1889e201c85SYohann   CeedScalar r_t[1];
1899e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
19099421279SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
19199421279SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
1929e201c85SYohann   }
1939e201c85SYohann }
1949e201c85SYohann 
1959e201c85SYohann //------------------------------------------------------------------------------
1969e201c85SYohann // 2D interpolate transpose
1979e201c85SYohann //------------------------------------------------------------------------------
19899421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
1992b730f8bSJeremy L Thompson inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2002b730f8bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
2019e201c85SYohann   CeedScalar r_t[1];
2029e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
20399421279SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
20499421279SJeremy L Thompson     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
2059e201c85SYohann   }
2069e201c85SYohann }
2079e201c85SYohann 
2089e201c85SYohann //------------------------------------------------------------------------------
2099e201c85SYohann // 2D derivatives at quadrature points
2109e201c85SYohann //------------------------------------------------------------------------------
21199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2122b730f8bSJeremy L Thompson inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
2132b730f8bSJeremy L Thompson                                     CeedScalar *__restrict__ r_V) {
2149e201c85SYohann   CeedScalar r_t[1];
2159e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
21699421279SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
21799421279SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
21899421279SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
21999421279SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
2209e201c85SYohann   }
2219e201c85SYohann }
2229e201c85SYohann 
2239e201c85SYohann //------------------------------------------------------------------------------
2249e201c85SYohann // 2D derivatives transpose
2259e201c85SYohann //------------------------------------------------------------------------------
22699421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2272b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
2282b730f8bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
2299e201c85SYohann   CeedScalar r_t[1];
2309e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
23199421279SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
23299421279SJeremy L Thompson     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
23399421279SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
23499421279SJeremy L Thompson     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
2359e201c85SYohann   }
2369e201c85SYohann }
2379e201c85SYohann 
2389e201c85SYohann //------------------------------------------------------------------------------
239*21292910SJeremy L Thompson // 2D derivatives at quadrature points, nodes and quadrature points collocated
240*21292910SJeremy L Thompson //------------------------------------------------------------------------------
241*21292910SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
242*21292910SJeremy L Thompson inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
243*21292910SJeremy L Thompson                                                    CeedScalar *__restrict__ r_V) {
244*21292910SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
245*21292910SJeremy L Thompson     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
246*21292910SJeremy L Thompson     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
247*21292910SJeremy L Thompson   }
248*21292910SJeremy L Thompson }
249*21292910SJeremy L Thompson 
250*21292910SJeremy L Thompson //------------------------------------------------------------------------------
251*21292910SJeremy L Thompson // 2D derivatives transpose, nodes and quadrature points collocated
252*21292910SJeremy L Thompson //------------------------------------------------------------------------------
253*21292910SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
254*21292910SJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
255*21292910SJeremy L Thompson                                                             CeedScalar *__restrict__ r_V) {
256*21292910SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
257*21292910SJeremy L Thompson     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
258*21292910SJeremy L Thompson     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
259*21292910SJeremy L Thompson   }
260*21292910SJeremy L Thompson }
261*21292910SJeremy L Thompson 
262*21292910SJeremy L Thompson //------------------------------------------------------------------------------
2639e201c85SYohann // 2D quadrature weights
2649e201c85SYohann //------------------------------------------------------------------------------
265343e3094SJeremy L Thompson template <int P_1D, int Q_1D>
2669e201c85SYohann inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
2672b730f8bSJeremy L Thompson   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
2689e201c85SYohann }
2699e201c85SYohann 
2709e201c85SYohann //------------------------------------------------------------------------------
2719e201c85SYohann // 3D
2729e201c85SYohann //------------------------------------------------------------------------------
2739e201c85SYohann 
2749e201c85SYohann //------------------------------------------------------------------------------
2759e201c85SYohann // 3D tensor contract x
2769e201c85SYohann //------------------------------------------------------------------------------
27799421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2789e201c85SYohann inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2799e201c85SYohann   CeedScalar r_B[P_1D];
2809e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
2819e201c85SYohann     r_B[i] = B[i + data.t_id_x * P_1D];
2829e201c85SYohann   }
2839e201c85SYohann 
2849e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
285d6c19ee8SJeremy L Thompson     __syncthreads();
2869e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
2879e201c85SYohann     __syncthreads();
2889e201c85SYohann     V[k] = 0.0;
2899e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
2909e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
2919e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
2929e201c85SYohann       }
2939e201c85SYohann     }
2949e201c85SYohann   }
2959e201c85SYohann }
2969e201c85SYohann 
2979e201c85SYohann //------------------------------------------------------------------------------
2989e201c85SYohann // 3D tensor contract y
2999e201c85SYohann //------------------------------------------------------------------------------
30099421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3019e201c85SYohann inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3029e201c85SYohann   CeedScalar r_B[P_1D];
3039e201c85SYohann   for (CeedInt i = 0; i < P_1D; i++) {
3049e201c85SYohann     r_B[i] = B[i + data.t_id_y * P_1D];
3059e201c85SYohann   }
3069e201c85SYohann 
3079e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
308d6c19ee8SJeremy L Thompson     __syncthreads();
3099e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3109e201c85SYohann     __syncthreads();
3119e201c85SYohann     V[k] = 0.0;
3129e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3139e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3149e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
3159e201c85SYohann       }
3169e201c85SYohann     }
3179e201c85SYohann   }
3189e201c85SYohann }
3199e201c85SYohann 
3209e201c85SYohann //------------------------------------------------------------------------------
3219e201c85SYohann // 3D tensor contract z
3229e201c85SYohann //------------------------------------------------------------------------------
32399421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3249e201c85SYohann inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3259e201c85SYohann   for (CeedInt k = 0; k < Q_1D; k++) {
3269e201c85SYohann     V[k] = 0.0;
3279e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3289e201c85SYohann       for (CeedInt i = 0; i < P_1D; i++) {
3299e201c85SYohann         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
3309e201c85SYohann       }
3319e201c85SYohann     }
3329e201c85SYohann   }
3339e201c85SYohann }
3349e201c85SYohann 
3359e201c85SYohann //------------------------------------------------------------------------------
3369e201c85SYohann // 3D transpose tensor contract z
3379e201c85SYohann //------------------------------------------------------------------------------
33899421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3399e201c85SYohann inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3409e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
3419e201c85SYohann     V[k] = 0.0;
3429e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3439e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3449e201c85SYohann         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
3459e201c85SYohann       }
3469e201c85SYohann     }
3479e201c85SYohann   }
3489e201c85SYohann }
3499e201c85SYohann 
3509e201c85SYohann //------------------------------------------------------------------------------
3519e201c85SYohann // 3D transpose tensor contract y
3529e201c85SYohann //------------------------------------------------------------------------------
35399421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3549e201c85SYohann inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3559e201c85SYohann   CeedScalar r_B[Q_1D];
3569e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
3579e201c85SYohann     r_B[i] = B[data.t_id_y + i * P_1D];
3589e201c85SYohann   }
3599e201c85SYohann 
3609e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
361d6c19ee8SJeremy L Thompson     __syncthreads();
3629e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3639e201c85SYohann     __syncthreads();
3649e201c85SYohann     V[k] = 0.0;
3659e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
3669e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3679e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
3689e201c85SYohann       }
3699e201c85SYohann     }
3709e201c85SYohann   }
3719e201c85SYohann }
3729e201c85SYohann 
3739e201c85SYohann //------------------------------------------------------------------------------
3749e201c85SYohann // 3D transpose tensor contract y
3759e201c85SYohann //------------------------------------------------------------------------------
37699421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3779e201c85SYohann inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3789e201c85SYohann   CeedScalar r_B[Q_1D];
3799e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
3809e201c85SYohann     r_B[i] = B[data.t_id_y + i * P_1D];
3819e201c85SYohann   }
3829e201c85SYohann 
3839e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
384d6c19ee8SJeremy L Thompson     __syncthreads();
3859e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
3869e201c85SYohann     __syncthreads();
3879e201c85SYohann     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
3889e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
3899e201c85SYohann         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
3909e201c85SYohann       }
3919e201c85SYohann     }
3929e201c85SYohann   }
3939e201c85SYohann }
3949e201c85SYohann 
3959e201c85SYohann //------------------------------------------------------------------------------
3969e201c85SYohann // 3D transpose tensor contract x
3979e201c85SYohann //------------------------------------------------------------------------------
39899421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3999e201c85SYohann inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4009e201c85SYohann   CeedScalar r_B[Q_1D];
4019e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4029e201c85SYohann     r_B[i] = B[data.t_id_x + i * P_1D];
4039e201c85SYohann   }
4049e201c85SYohann 
4059e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
406d6c19ee8SJeremy L Thompson     __syncthreads();
4079e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4089e201c85SYohann     __syncthreads();
4099e201c85SYohann     V[k] = 0.0;
4109e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4119e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4129e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
4139e201c85SYohann       }
4149e201c85SYohann     }
4159e201c85SYohann   }
4169e201c85SYohann }
4179e201c85SYohann 
4189e201c85SYohann //------------------------------------------------------------------------------
4199e201c85SYohann // 3D transpose tensor contract add x
4209e201c85SYohann //------------------------------------------------------------------------------
42199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4229e201c85SYohann inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4239e201c85SYohann   CeedScalar r_B[Q_1D];
4249e201c85SYohann   for (CeedInt i = 0; i < Q_1D; i++) {
4259e201c85SYohann     r_B[i] = B[data.t_id_x + i * P_1D];
4269e201c85SYohann   }
4279e201c85SYohann 
4289e201c85SYohann   for (CeedInt k = 0; k < P_1D; k++) {
429d6c19ee8SJeremy L Thompson     __syncthreads();
4309e201c85SYohann     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
4319e201c85SYohann     __syncthreads();
4329e201c85SYohann     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4339e201c85SYohann       for (CeedInt i = 0; i < Q_1D; i++) {
4349e201c85SYohann         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
4359e201c85SYohann       }
4369e201c85SYohann     }
4379e201c85SYohann   }
4389e201c85SYohann }
4399e201c85SYohann 
4409e201c85SYohann //------------------------------------------------------------------------------
4419e201c85SYohann // 3D interpolate to quadrature points
4429e201c85SYohann //------------------------------------------------------------------------------
44399421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4442b730f8bSJeremy L Thompson inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4452b730f8bSJeremy L Thompson                                       CeedScalar *__restrict__ r_V) {
4469e201c85SYohann   CeedScalar r_t1[T_1D];
4479e201c85SYohann   CeedScalar r_t2[T_1D];
4489e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
44999421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
45099421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
45199421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
4529e201c85SYohann   }
4539e201c85SYohann }
4549e201c85SYohann 
4559e201c85SYohann //------------------------------------------------------------------------------
4569e201c85SYohann // 3D interpolate transpose
4579e201c85SYohann //------------------------------------------------------------------------------
45899421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4592b730f8bSJeremy L Thompson inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4602b730f8bSJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
4619e201c85SYohann   CeedScalar r_t1[T_1D];
4629e201c85SYohann   CeedScalar r_t2[T_1D];
4639e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
46499421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
46599421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
46699421279SJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
4679e201c85SYohann   }
4689e201c85SYohann }
4699e201c85SYohann 
4709e201c85SYohann //------------------------------------------------------------------------------
4719e201c85SYohann // 3D derivatives at quadrature points
4729e201c85SYohann //------------------------------------------------------------------------------
47399421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4742b730f8bSJeremy L Thompson inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
4752b730f8bSJeremy L Thompson                                     CeedScalar *__restrict__ r_V) {
4769e201c85SYohann   CeedScalar r_t1[T_1D];
4779e201c85SYohann   CeedScalar r_t2[T_1D];
4789e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
47999421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
48099421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
48199421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
48299421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
48399421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
48499421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
48599421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
48699421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
48799421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
4889e201c85SYohann   }
4899e201c85SYohann }
4909e201c85SYohann 
4919e201c85SYohann //------------------------------------------------------------------------------
4929e201c85SYohann // 3D derivatives transpose
4939e201c85SYohann //------------------------------------------------------------------------------
49499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
4952b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
4962b730f8bSJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
4979e201c85SYohann   CeedScalar r_t1[T_1D];
4989e201c85SYohann   CeedScalar r_t2[T_1D];
4999e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
50099421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
50199421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
50299421279SJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
50399421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
50499421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
50599421279SJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
50699421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
50799421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
50899421279SJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5099e201c85SYohann   }
5109e201c85SYohann }
5119e201c85SYohann 
5129e201c85SYohann //------------------------------------------------------------------------------
5139e201c85SYohann // 3D derivatives at quadrature points
5149e201c85SYohann //------------------------------------------------------------------------------
51599421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5162b730f8bSJeremy L Thompson inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5172b730f8bSJeremy L Thompson                                               CeedScalar *__restrict__ r_V) {
5189e201c85SYohann   CeedScalar r_t1[T_1D];
5199e201c85SYohann   CeedScalar r_t2[T_1D];
5209e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
52199421279SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
52299421279SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
52399421279SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
52499421279SJeremy L Thompson     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
52599421279SJeremy L Thompson     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
52699421279SJeremy L Thompson     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
5279e201c85SYohann   }
5289e201c85SYohann }
5299e201c85SYohann 
5309e201c85SYohann //------------------------------------------------------------------------------
5319e201c85SYohann // 3D derivatives transpose
5329e201c85SYohann //------------------------------------------------------------------------------
53399421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5342b730f8bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5352b730f8bSJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
5369e201c85SYohann   CeedScalar r_t1[T_1D];
5379e201c85SYohann   CeedScalar r_t2[T_1D];
5389e201c85SYohann   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
53999421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
54099421279SJeremy L Thompson     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
54199421279SJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
54299421279SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
54399421279SJeremy L Thompson     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
54499421279SJeremy L Thompson     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
5459e201c85SYohann   }
5469e201c85SYohann }
5479e201c85SYohann 
5489e201c85SYohann //------------------------------------------------------------------------------
549*21292910SJeremy L Thompson // 3D derivatives at quadrature points, nodes and quadrature points collocated
550*21292910SJeremy L Thompson //------------------------------------------------------------------------------
551*21292910SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
552*21292910SJeremy L Thompson inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
553*21292910SJeremy L Thompson                                                    CeedScalar *__restrict__ r_V) {
554*21292910SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
555*21292910SJeremy L Thompson     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
556*21292910SJeremy L Thompson     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
557*21292910SJeremy L Thompson     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
558*21292910SJeremy L Thompson   }
559*21292910SJeremy L Thompson }
560*21292910SJeremy L Thompson 
561*21292910SJeremy L Thompson //------------------------------------------------------------------------------
562*21292910SJeremy L Thompson // 3D derivatives transpose, nodes and quadrature points collocated
563*21292910SJeremy L Thompson //------------------------------------------------------------------------------
564*21292910SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
565*21292910SJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
566*21292910SJeremy L Thompson                                                             CeedScalar *__restrict__ r_V) {
567*21292910SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
568*21292910SJeremy L Thompson     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
569*21292910SJeremy L Thompson     ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
570*21292910SJeremy L Thompson     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
571*21292910SJeremy L Thompson   }
572*21292910SJeremy L Thompson }
573*21292910SJeremy L Thompson 
574*21292910SJeremy L Thompson //------------------------------------------------------------------------------
5759e201c85SYohann // 3D quadrature weights
5769e201c85SYohann //------------------------------------------------------------------------------
577343e3094SJeremy L Thompson template <int P_1D, int Q_1D>
5789e201c85SYohann inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
5799e201c85SYohann   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
5809e201c85SYohann   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
5819e201c85SYohann   for (CeedInt q = 0; q < Q_1D; q++) {
5829e201c85SYohann     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
5839e201c85SYohann   }
5849e201c85SYohann }
585