xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h (revision d4cc18453651bd0f94c1a2e078b2646a92dafdcc)
1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2c8e372f0SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3c8e372f0SJeremy L Thompson //
4c8e372f0SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5c8e372f0SJeremy L Thompson //
6c8e372f0SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7c8e372f0SJeremy L Thompson 
8c8e372f0SJeremy L Thompson /// @file
9c8e372f0SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis templates
10c8e372f0SJeremy L Thompson #include <ceed/types.h>
11c8e372f0SJeremy L Thompson 
12c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
13c8e372f0SJeremy L Thompson // 2D
14c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
15c8e372f0SJeremy L Thompson 
16c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
17c8e372f0SJeremy L Thompson // 2D tensor contraction x
18c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
19c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractX2dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)20c8e372f0SJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
21c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
22d6c19ee8SJeremy L Thompson   __syncthreads();
23c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
24c8e372f0SJeremy L Thompson   __syncthreads();
25c8e372f0SJeremy L Thompson   *V = 0.0;
26c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
27c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
28c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
29c8e372f0SJeremy L Thompson     }
30c8e372f0SJeremy L Thompson   }
31c8e372f0SJeremy L Thompson }
32c8e372f0SJeremy L Thompson 
33c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
34c8e372f0SJeremy L Thompson // 2D tensor contract y
35c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
36c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractY2dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)37c8e372f0SJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
38c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
39d6c19ee8SJeremy L Thompson   __syncthreads();
40c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
41c8e372f0SJeremy L Thompson   __syncthreads();
42c8e372f0SJeremy L Thompson   *V = 0.0;
43c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
44c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
45c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
46c8e372f0SJeremy L Thompson     }
47c8e372f0SJeremy L Thompson   }
48c8e372f0SJeremy L Thompson }
49c8e372f0SJeremy L Thompson 
50c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
51c8e372f0SJeremy L Thompson // 2D transpose tensor contract y
52c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
53c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeY2dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)54c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
55c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
56d6c19ee8SJeremy L Thompson   __syncthreads();
57c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
58c8e372f0SJeremy L Thompson   __syncthreads();
59c8e372f0SJeremy L Thompson   *V = 0.0;
60c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
61c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
62c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
63c8e372f0SJeremy L Thompson     }
64c8e372f0SJeremy L Thompson   }
65c8e372f0SJeremy L Thompson }
66c8e372f0SJeremy L Thompson 
67c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
68c8e372f0SJeremy L Thompson // 2D transpose tensor contract x
69c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
70c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeX2dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)71c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
72c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
73d6c19ee8SJeremy L Thompson   __syncthreads();
74c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
75c8e372f0SJeremy L Thompson   __syncthreads();
76c8e372f0SJeremy L Thompson   *V = 0.0;
77c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
78c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
79c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
80c8e372f0SJeremy L Thompson     }
81c8e372f0SJeremy L Thompson   }
82c8e372f0SJeremy L Thompson }
83c8e372f0SJeremy L Thompson 
84c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
85c8e372f0SJeremy L Thompson // 2D transpose tensor contract and add x
86c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
87c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddX2dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)88c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
89c8e372f0SJeremy L Thompson                                                         const CeedScalar *B, CeedScalar *V) {
90d6c19ee8SJeremy L Thompson   __syncthreads();
91c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
92c8e372f0SJeremy L Thompson   __syncthreads();
93c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
94c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
95c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
96c8e372f0SJeremy L Thompson     }
97c8e372f0SJeremy L Thompson   }
98c8e372f0SJeremy L Thompson }
99c8e372f0SJeremy L Thompson 
100c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
101c8e372f0SJeremy L Thompson // 2D pack/unpack quadrature values
102c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
103c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
QPack2d(SharedData_Cuda & data,const int t_id_x,const int t_id_y,CeedScalar * U)104c8e372f0SJeremy L Thompson inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
105c8e372f0SJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
106c8e372f0SJeremy L Thompson 
107c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108d6c19ee8SJeremy L Thompson     __syncthreads();
109c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
110c8e372f0SJeremy L Thompson     __syncthreads();
111c8e372f0SJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
112c8e372f0SJeremy L Thompson   }
113c8e372f0SJeremy L Thompson }
114c8e372f0SJeremy L Thompson 
115c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
QUnpack2d(SharedData_Cuda & data,const int t_id_x,const int t_id_y,CeedScalar * U)116c8e372f0SJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
117c8e372f0SJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
118c8e372f0SJeremy L Thompson 
119c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120d6c19ee8SJeremy L Thompson     __syncthreads();
121c8e372f0SJeremy L Thompson     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
122c8e372f0SJeremy L Thompson     __syncthreads();
123c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
124c8e372f0SJeremy L Thompson   }
125c8e372f0SJeremy L Thompson }
126c8e372f0SJeremy L Thompson 
127c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
128c8e372f0SJeremy L Thompson // 2D interpolate to quadrature points
129c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
130c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensor2dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)131c8e372f0SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
132c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
133c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
134c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
135c8e372f0SJeremy L Thompson 
136ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
137c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
139c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
140c8e372f0SJeremy L Thompson   }
1413e2e790dSJeremy L Thompson   __syncthreads();
142ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
143ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
144c8e372f0SJeremy L Thompson }
145c8e372f0SJeremy L Thompson 
146c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
147c8e372f0SJeremy L Thompson // 2D interpolate transpose
148c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
149c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensor2dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)150c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
151c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
152c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
153c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
154c8e372f0SJeremy L Thompson 
155ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
156c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
157c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
158c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
159c8e372f0SJeremy L Thompson   }
1603e2e790dSJeremy L Thompson   __syncthreads();
161ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
162c8e372f0SJeremy L Thompson }
163c8e372f0SJeremy L Thompson 
164c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
1650ccda8ebSJeremy L Thompson // 2D interpolate to quadrature points, nodes and quadrature points collocated
1660ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
1670ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensorCollocatedNodes2dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)1680ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1690ccda8ebSJeremy L Thompson                                                               CeedScalar *__restrict__ r_V) {
1700ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1710ccda8ebSJeremy L Thompson 
1720ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1730ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1740ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
1750ccda8ebSJeremy L Thompson   }
1760ccda8ebSJeremy L Thompson   __syncthreads();
1770ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1780ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1790ccda8ebSJeremy L Thompson }
1800ccda8ebSJeremy L Thompson 
1810ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
1820ccda8ebSJeremy L Thompson // 2D interpolate transpose, nodes and quadrature points collocated
1830ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
1840ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)1850ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1860ccda8ebSJeremy L Thompson                                                                        CeedScalar *__restrict__ r_V) {
1870ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1880ccda8ebSJeremy L Thompson 
1890ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1900ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1910ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
1920ccda8ebSJeremy L Thompson   }
1930ccda8ebSJeremy L Thompson   __syncthreads();
1940ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1950ccda8ebSJeremy L Thompson }
1960ccda8ebSJeremy L Thompson 
1970ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
198c8e372f0SJeremy L Thompson // 2D derivatives at quadrature points
199c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
200c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensor2dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)201c8e372f0SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
202c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
203c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
204c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
205c8e372f0SJeremy L Thompson 
206ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
207c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
208c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
209c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
210c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
211c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
212c8e372f0SJeremy L Thompson   }
2133e2e790dSJeremy L Thompson   __syncthreads();
214ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
215ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
216c8e372f0SJeremy L Thompson }
217c8e372f0SJeremy L Thompson 
218c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
219c8e372f0SJeremy L Thompson // 2D derivatives transpose
220c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
221c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensor2dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)222c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
223c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
224c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
225c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
226c8e372f0SJeremy L Thompson 
227ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
228c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
229c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
230c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
231c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
232c8e372f0SJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
233c8e372f0SJeremy L Thompson   }
2343e2e790dSJeremy L Thompson   __syncthreads();
235ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
236c8e372f0SJeremy L Thompson }
237c8e372f0SJeremy L Thompson 
238c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
2390ccda8ebSJeremy L Thompson // 2D derivatives at quadrature points, nodes and quadrature points collocated
2400ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2410ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocatedNodes2dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)2420ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2430ccda8ebSJeremy L Thompson                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
2440ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
2450ccda8ebSJeremy L Thompson 
2460ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
2470ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2480ccda8ebSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
2490ccda8ebSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
2500ccda8ebSJeremy L Thompson   }
2510ccda8ebSJeremy L Thompson   __syncthreads();
2520ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
2530ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
2540ccda8ebSJeremy L Thompson }
2550ccda8ebSJeremy L Thompson 
2560ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2570ccda8ebSJeremy L Thompson // 2D derivatives transpose, nodes and quadrature points collocated
2580ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2590ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)2600ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2610ccda8ebSJeremy L Thompson                                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
2620ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
2630ccda8ebSJeremy L Thompson 
2640ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
2650ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2660ccda8ebSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
2670ccda8ebSJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
2680ccda8ebSJeremy L Thompson   }
2690ccda8ebSJeremy L Thompson   __syncthreads();
2700ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
2710ccda8ebSJeremy L Thompson }
2720ccda8ebSJeremy L Thompson 
2730ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
274c8e372f0SJeremy L Thompson // 2D quadrature weights
275c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
276c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
WeightTensor2dFlattened(SharedData_Cuda & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)277c8e372f0SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
278c8e372f0SJeremy L Thompson   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
279c8e372f0SJeremy L Thompson 
280c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
281c8e372f0SJeremy L Thompson }
282c8e372f0SJeremy L Thompson 
283c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
284c8e372f0SJeremy L Thompson // 3D
285c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
286c8e372f0SJeremy L Thompson 
287c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
288c8e372f0SJeremy L Thompson // 3D tensor contract x
289c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
290c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractX3dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)291c8e372f0SJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
292c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
293d6c19ee8SJeremy L Thompson   __syncthreads();
294c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
295c8e372f0SJeremy L Thompson   __syncthreads();
296c8e372f0SJeremy L Thompson   *V = 0.0;
297c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
298c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
299c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
300c8e372f0SJeremy L Thompson     }
301c8e372f0SJeremy L Thompson   }
302c8e372f0SJeremy L Thompson }
303c8e372f0SJeremy L Thompson 
304c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
305c8e372f0SJeremy L Thompson // 3D tensor contract y
306c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
307c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractY3dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)308c8e372f0SJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
309c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
310d6c19ee8SJeremy L Thompson   __syncthreads();
311c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
312c8e372f0SJeremy L Thompson   __syncthreads();
313c8e372f0SJeremy L Thompson   *V = 0.0;
314c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
315c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
316c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
317c8e372f0SJeremy L Thompson     }
318c8e372f0SJeremy L Thompson   }
319c8e372f0SJeremy L Thompson }
320c8e372f0SJeremy L Thompson 
321c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
322c8e372f0SJeremy L Thompson // 3D tensor contract z
323c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
324c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractZ3dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)325c8e372f0SJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
326c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
327d6c19ee8SJeremy L Thompson   __syncthreads();
328c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
329c8e372f0SJeremy L Thompson   __syncthreads();
330c8e372f0SJeremy L Thompson   *V = 0.0;
331c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
332c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
333c8e372f0SJeremy L Thompson       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
334c8e372f0SJeremy L Thompson     }
335c8e372f0SJeremy L Thompson   }
336c8e372f0SJeremy L Thompson }
337c8e372f0SJeremy L Thompson 
338c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
339c8e372f0SJeremy L Thompson // 3D tensor contract z
340c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
341c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeZ3dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)342c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
343c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
344d6c19ee8SJeremy L Thompson   __syncthreads();
345c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
346c8e372f0SJeremy L Thompson   __syncthreads();
347c8e372f0SJeremy L Thompson   *V = 0.0;
348c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
349c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
350c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
351c8e372f0SJeremy L Thompson     }
352c8e372f0SJeremy L Thompson   }
353c8e372f0SJeremy L Thompson }
354c8e372f0SJeremy L Thompson 
355c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
356c8e372f0SJeremy L Thompson // 3D transpose tensor contract z
357c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
358c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddZ3dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)359c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
360c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
361d6c19ee8SJeremy L Thompson   __syncthreads();
362c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
363c8e372f0SJeremy L Thompson   __syncthreads();
364c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
365c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
366c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
367c8e372f0SJeremy L Thompson     }
368c8e372f0SJeremy L Thompson   }
369c8e372f0SJeremy L Thompson }
370c8e372f0SJeremy L Thompson 
371c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
372c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
373c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
374c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeY3dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)375c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
376c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
377d6c19ee8SJeremy L Thompson   __syncthreads();
378c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
379c8e372f0SJeremy L Thompson   __syncthreads();
380c8e372f0SJeremy L Thompson   *V = 0.0;
381c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
382c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
383c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
384c8e372f0SJeremy L Thompson     }
385c8e372f0SJeremy L Thompson   }
386c8e372f0SJeremy L Thompson }
387c8e372f0SJeremy L Thompson 
388c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
389c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
390c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
391c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddY3dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)392c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
393c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
394d6c19ee8SJeremy L Thompson   __syncthreads();
395c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
396c8e372f0SJeremy L Thompson   __syncthreads();
397c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
398c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
399c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
400c8e372f0SJeremy L Thompson     }
401c8e372f0SJeremy L Thompson   }
402c8e372f0SJeremy L Thompson }
403c8e372f0SJeremy L Thompson 
404c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
405c8e372f0SJeremy L Thompson // 3D transpose tensor contract x
406c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
407c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeX3dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)408c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
409c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
410d6c19ee8SJeremy L Thompson   __syncthreads();
411c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
412c8e372f0SJeremy L Thompson   __syncthreads();
413c8e372f0SJeremy L Thompson   *V = 0.0;
414c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
415c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
416c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
417c8e372f0SJeremy L Thompson     }
418c8e372f0SJeremy L Thompson   }
419c8e372f0SJeremy L Thompson }
420c8e372f0SJeremy L Thompson 
421c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
422c8e372f0SJeremy L Thompson // 3D transpose tensor contract add x
423c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
424c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddX3dFlattened(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)425c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
426c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
427d6c19ee8SJeremy L Thompson   __syncthreads();
428c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
429c8e372f0SJeremy L Thompson   __syncthreads();
430c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
431c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
432c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
433c8e372f0SJeremy L Thompson     }
434c8e372f0SJeremy L Thompson   }
435c8e372f0SJeremy L Thompson }
436c8e372f0SJeremy L Thompson 
437c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
438c8e372f0SJeremy L Thompson // 3D pack/unpack quadrature values
439c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
440c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
QPack3d(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const int t_id_z,CeedScalar * U)441c8e372f0SJeremy L Thompson inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
442259057edSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
443c8e372f0SJeremy L Thompson 
444c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
445d6c19ee8SJeremy L Thompson     __syncthreads();
446c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
447c8e372f0SJeremy L Thompson     __syncthreads();
448259057edSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
449c8e372f0SJeremy L Thompson   }
450c8e372f0SJeremy L Thompson }
451c8e372f0SJeremy L Thompson 
452c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
QUnpack3d(SharedData_Cuda & data,const int t_id_x,const int t_id_y,const int t_id_z,CeedScalar * U)453c8e372f0SJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
454259057edSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
455c8e372f0SJeremy L Thompson 
456c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
457d6c19ee8SJeremy L Thompson     __syncthreads();
458259057edSJeremy L Thompson     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
459c8e372f0SJeremy L Thompson     __syncthreads();
460c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
461c8e372f0SJeremy L Thompson   }
462c8e372f0SJeremy L Thompson }
463c8e372f0SJeremy L Thompson 
464c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
465c8e372f0SJeremy L Thompson // 3D interpolate to quadrature points
466c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
467c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensor3dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)468c8e372f0SJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
469c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
470259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
471c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
472c8e372f0SJeremy L Thompson 
473ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
474c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
475c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
476c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
477c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
478c8e372f0SJeremy L Thompson   }
4793e2e790dSJeremy L Thompson   __syncthreads();
480ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
481ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
482c8e372f0SJeremy L Thompson }
483c8e372f0SJeremy L Thompson 
484c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
485c8e372f0SJeremy L Thompson // 3D interpolate transpose
486c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
487c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensor3dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)488c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
489c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
490259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
491c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
492c8e372f0SJeremy L Thompson 
493ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
494c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
495c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
496c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
497c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
498c8e372f0SJeremy L Thompson   }
4993e2e790dSJeremy L Thompson   __syncthreads();
500ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
501c8e372f0SJeremy L Thompson }
502c8e372f0SJeremy L Thompson 
503c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
5040ccda8ebSJeremy L Thompson // 3D interpolate to quadrature points, nodes and quadrature points collocated
5050ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5060ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensorCollocatedNodes3dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)5070ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5080ccda8ebSJeremy L Thompson                                                               CeedScalar *__restrict__ r_V) {
5090ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5100ccda8ebSJeremy L Thompson 
5110ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5120ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5130ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
5140ccda8ebSJeremy L Thompson   }
5150ccda8ebSJeremy L Thompson   __syncthreads();
5160ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5170ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5180ccda8ebSJeremy L Thompson }
5190ccda8ebSJeremy L Thompson 
5200ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5210ccda8ebSJeremy L Thompson // 3D interpolate transpose, nodes and quadrature points collocated
5220ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5230ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)5240ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5250ccda8ebSJeremy L Thompson                                                                        CeedScalar *__restrict__ r_V) {
5260ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5270ccda8ebSJeremy L Thompson 
5280ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5290ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5300ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
5310ccda8ebSJeremy L Thompson   }
5320ccda8ebSJeremy L Thompson   __syncthreads();
5330ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5340ccda8ebSJeremy L Thompson }
5350ccda8ebSJeremy L Thompson 
5360ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
537c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
538c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
539c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensor3dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)540c8e372f0SJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
541c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
542259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
543c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
544c8e372f0SJeremy L Thompson 
545ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
546c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
547c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
548c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
549c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
550c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
551c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
552c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
553c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
554c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
555c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
556c8e372f0SJeremy L Thompson   }
5573e2e790dSJeremy L Thompson   __syncthreads();
558ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
559ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
560c8e372f0SJeremy L Thompson }
561c8e372f0SJeremy L Thompson 
562c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
563c8e372f0SJeremy L Thompson // 3D derivatives transpose
564c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
565c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensor3dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)566c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
567c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
568259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
569c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
570c8e372f0SJeremy L Thompson 
571ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
572c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
573c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
574c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
575c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
576c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
577c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
578c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
579c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
580c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
581c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
582c8e372f0SJeremy L Thompson   }
5833e2e790dSJeremy L Thompson   __syncthreads();
584ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
585c8e372f0SJeremy L Thompson }
586c8e372f0SJeremy L Thompson 
587c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
588c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
589c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
590c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocated3dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)591c8e372f0SJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
592c8e372f0SJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
593259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
594c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
595c8e372f0SJeremy L Thompson 
596ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
597c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
598c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
599c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
600c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
601c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
602c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
603c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
604c8e372f0SJeremy L Thompson   }
6053e2e790dSJeremy L Thompson   __syncthreads();
606ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
607ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
608c8e372f0SJeremy L Thompson }
609c8e372f0SJeremy L Thompson 
610c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
611c8e372f0SJeremy L Thompson // 3D derivatives transpose
612c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
613c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensor3dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)6140ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
615c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
616259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
617c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
618c8e372f0SJeremy L Thompson 
619ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
620c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
6210ccda8ebSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
6220ccda8ebSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
6230ccda8ebSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
6240ccda8ebSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
6250ccda8ebSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
6260ccda8ebSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
6270ccda8ebSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
6280ccda8ebSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
6290ccda8ebSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
6300ccda8ebSJeremy L Thompson   }
6310ccda8ebSJeremy L Thompson   __syncthreads();
6320ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
6330ccda8ebSJeremy L Thompson }
6340ccda8ebSJeremy L Thompson 
6350ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
6360ccda8ebSJeremy L Thompson // 3D derivatives at quadrature points, nodes and quadrature points collocated
6370ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
6380ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocatedNodes3dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)6390ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6400ccda8ebSJeremy L Thompson                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6410ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
6420ccda8ebSJeremy L Thompson 
6430ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
6440ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
6450ccda8ebSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
6460ccda8ebSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
6470ccda8ebSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
6480ccda8ebSJeremy L Thompson   }
6490ccda8ebSJeremy L Thompson   __syncthreads();
6500ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
6510ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
6520ccda8ebSJeremy L Thompson }
6530ccda8ebSJeremy L Thompson 
6540ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
6550ccda8ebSJeremy L Thompson // 3D derivatives transpose, nodes and quadrature points collocated
6560ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
6570ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)6580ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6590ccda8ebSJeremy L Thompson                                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6600ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
6610ccda8ebSJeremy L Thompson 
6620ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
6630ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
6640ccda8ebSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]);
6650ccda8ebSJeremy L Thompson     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
6660ccda8ebSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
667c8e372f0SJeremy L Thompson   }
6683e2e790dSJeremy L Thompson   __syncthreads();
669ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
670c8e372f0SJeremy L Thompson }
671c8e372f0SJeremy L Thompson 
672c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
673c8e372f0SJeremy L Thompson // 3D quadrature weights
674c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
675c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
WeightTensor3dFlattened(SharedData_Cuda & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)676c8e372f0SJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
677259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
678c8e372f0SJeremy L Thompson 
679c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
680c8e372f0SJeremy L Thompson }
681