xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h (revision 0ccda8ebe42db3fb90cdb724a58e4e5d2aedf1a1)
1c8e372f0SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2c8e372f0SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3c8e372f0SJeremy L Thompson //
4c8e372f0SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5c8e372f0SJeremy L Thompson //
6c8e372f0SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7c8e372f0SJeremy L Thompson 
8c8e372f0SJeremy L Thompson /// @file
9c8e372f0SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis templates
10c8e372f0SJeremy L Thompson #include <ceed/types.h>
11c8e372f0SJeremy L Thompson 
12c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
13c8e372f0SJeremy L Thompson // 2D
14c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
15c8e372f0SJeremy L Thompson 
16c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
17c8e372f0SJeremy L Thompson // 2D tensor contraction x
18c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
19c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
20c8e372f0SJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
21c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
22d6c19ee8SJeremy L Thompson   __syncthreads();
23c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
24c8e372f0SJeremy L Thompson   __syncthreads();
25c8e372f0SJeremy L Thompson   *V = 0.0;
26c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
27c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
28c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
29c8e372f0SJeremy L Thompson     }
30c8e372f0SJeremy L Thompson   }
31c8e372f0SJeremy L Thompson }
32c8e372f0SJeremy L Thompson 
33c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
34c8e372f0SJeremy L Thompson // 2D tensor contract y
35c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
36c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
37c8e372f0SJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
38c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
39d6c19ee8SJeremy L Thompson   __syncthreads();
40c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
41c8e372f0SJeremy L Thompson   __syncthreads();
42c8e372f0SJeremy L Thompson   *V = 0.0;
43c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
44c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
45c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
46c8e372f0SJeremy L Thompson     }
47c8e372f0SJeremy L Thompson   }
48c8e372f0SJeremy L Thompson }
49c8e372f0SJeremy L Thompson 
50c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
51c8e372f0SJeremy L Thompson // 2D transpose tensor contract y
52c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
53c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
54c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
55c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
56d6c19ee8SJeremy L Thompson   __syncthreads();
57c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
58c8e372f0SJeremy L Thompson   __syncthreads();
59c8e372f0SJeremy L Thompson   *V = 0.0;
60c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
61c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
62c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
63c8e372f0SJeremy L Thompson     }
64c8e372f0SJeremy L Thompson   }
65c8e372f0SJeremy L Thompson }
66c8e372f0SJeremy L Thompson 
67c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
68c8e372f0SJeremy L Thompson // 2D transpose tensor contract x
69c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
70c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
71c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
72c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
73d6c19ee8SJeremy L Thompson   __syncthreads();
74c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
75c8e372f0SJeremy L Thompson   __syncthreads();
76c8e372f0SJeremy L Thompson   *V = 0.0;
77c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
78c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
79c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
80c8e372f0SJeremy L Thompson     }
81c8e372f0SJeremy L Thompson   }
82c8e372f0SJeremy L Thompson }
83c8e372f0SJeremy L Thompson 
84c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
85c8e372f0SJeremy L Thompson // 2D transpose tensor contract and add x
86c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
87c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
88c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
89c8e372f0SJeremy L Thompson                                                         const CeedScalar *B, CeedScalar *V) {
90d6c19ee8SJeremy L Thompson   __syncthreads();
91c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
92c8e372f0SJeremy L Thompson   __syncthreads();
93c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
94c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
95c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
96c8e372f0SJeremy L Thompson     }
97c8e372f0SJeremy L Thompson   }
98c8e372f0SJeremy L Thompson }
99c8e372f0SJeremy L Thompson 
100c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
101c8e372f0SJeremy L Thompson // 2D pack/unpack quadrature values
102c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
103c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
104c8e372f0SJeremy L Thompson inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
105c8e372f0SJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
106c8e372f0SJeremy L Thompson 
107c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108d6c19ee8SJeremy L Thompson     __syncthreads();
109c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
110c8e372f0SJeremy L Thompson     __syncthreads();
111c8e372f0SJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
112c8e372f0SJeremy L Thompson   }
113c8e372f0SJeremy L Thompson }
114c8e372f0SJeremy L Thompson 
115c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
116c8e372f0SJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
117c8e372f0SJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
118c8e372f0SJeremy L Thompson 
119c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120d6c19ee8SJeremy L Thompson     __syncthreads();
121c8e372f0SJeremy L Thompson     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
122c8e372f0SJeremy L Thompson     __syncthreads();
123c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
124c8e372f0SJeremy L Thompson   }
125c8e372f0SJeremy L Thompson }
126c8e372f0SJeremy L Thompson 
127c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
128c8e372f0SJeremy L Thompson // 2D interpolate to quadrature points
129c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
130c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
131c8e372f0SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
132c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
133c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
134c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
135c8e372f0SJeremy L Thompson 
136ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
137c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
139c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
140c8e372f0SJeremy L Thompson   }
1413e2e790dSJeremy L Thompson   __syncthreads();
142ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
143ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
144c8e372f0SJeremy L Thompson }
145c8e372f0SJeremy L Thompson 
146c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
147c8e372f0SJeremy L Thompson // 2D interpolate transpose
148c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
149c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
150c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
151c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
152c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
153c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
154c8e372f0SJeremy L Thompson 
155ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
156c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
157c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
158c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
159c8e372f0SJeremy L Thompson   }
1603e2e790dSJeremy L Thompson   __syncthreads();
161ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
162c8e372f0SJeremy L Thompson }
163c8e372f0SJeremy L Thompson 
164c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
165*0ccda8ebSJeremy L Thompson // 2D interpolate to quadrature points, nodes and quadrature points collocated
166*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
167*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
168*0ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
169*0ccda8ebSJeremy L Thompson                                                               CeedScalar *__restrict__ r_V) {
170*0ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
171*0ccda8ebSJeremy L Thompson 
172*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
173*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
174*0ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
175*0ccda8ebSJeremy L Thompson   }
176*0ccda8ebSJeremy L Thompson   __syncthreads();
177*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
178*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
179*0ccda8ebSJeremy L Thompson }
180*0ccda8ebSJeremy L Thompson 
181*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
182*0ccda8ebSJeremy L Thompson // 2D interpolate transpose, nodes and quadrature points collocated
183*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
184*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
185*0ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
186*0ccda8ebSJeremy L Thompson                                                                        CeedScalar *__restrict__ r_V) {
187*0ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
188*0ccda8ebSJeremy L Thompson 
189*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
190*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
191*0ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
192*0ccda8ebSJeremy L Thompson   }
193*0ccda8ebSJeremy L Thompson   __syncthreads();
194*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
195*0ccda8ebSJeremy L Thompson }
196*0ccda8ebSJeremy L Thompson 
197*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
198c8e372f0SJeremy L Thompson // 2D derivatives at quadrature points
199c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
200c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
201c8e372f0SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
202c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
203c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
204c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
205c8e372f0SJeremy L Thompson 
206ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
207c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
208c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
209c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
210c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
211c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
212c8e372f0SJeremy L Thompson   }
2133e2e790dSJeremy L Thompson   __syncthreads();
214ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
215ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
216c8e372f0SJeremy L Thompson }
217c8e372f0SJeremy L Thompson 
218c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
219c8e372f0SJeremy L Thompson // 2D derivatives transpose
220c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
221c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
222c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
223c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
224c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
225c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
226c8e372f0SJeremy L Thompson 
227ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
228c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
229c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
230c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
231c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
232c8e372f0SJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
233c8e372f0SJeremy L Thompson   }
2343e2e790dSJeremy L Thompson   __syncthreads();
235ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
236c8e372f0SJeremy L Thompson }
237c8e372f0SJeremy L Thompson 
238c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
239*0ccda8ebSJeremy L Thompson // 2D derivatives at quadrature points, nodes and quadrature points collocated
240*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
241*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
242*0ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
243*0ccda8ebSJeremy L Thompson                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
244*0ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
245*0ccda8ebSJeremy L Thompson 
246*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
247*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
248*0ccda8ebSJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
249*0ccda8ebSJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
250*0ccda8ebSJeremy L Thompson   }
251*0ccda8ebSJeremy L Thompson   __syncthreads();
252*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
253*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
254*0ccda8ebSJeremy L Thompson }
255*0ccda8ebSJeremy L Thompson 
256*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
257*0ccda8ebSJeremy L Thompson // 2D derivatives transpose, nodes and quadrature points collocated
258*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
259*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
260*0ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
261*0ccda8ebSJeremy L Thompson                                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
262*0ccda8ebSJeremy L Thompson   const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
263*0ccda8ebSJeremy L Thompson 
264*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
265*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
266*0ccda8ebSJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
267*0ccda8ebSJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
268*0ccda8ebSJeremy L Thompson   }
269*0ccda8ebSJeremy L Thompson   __syncthreads();
270*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
271*0ccda8ebSJeremy L Thompson }
272*0ccda8ebSJeremy L Thompson 
273*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
274c8e372f0SJeremy L Thompson // 2D quadrature weights
275c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
276c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
277c8e372f0SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
278c8e372f0SJeremy L Thompson   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
279c8e372f0SJeremy L Thompson 
280c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
281c8e372f0SJeremy L Thompson }
282c8e372f0SJeremy L Thompson 
283c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
284c8e372f0SJeremy L Thompson // 3D
285c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
286c8e372f0SJeremy L Thompson 
287c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
288c8e372f0SJeremy L Thompson // 3D tensor contract x
289c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
290c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
291c8e372f0SJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
292c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
293d6c19ee8SJeremy L Thompson   __syncthreads();
294c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
295c8e372f0SJeremy L Thompson   __syncthreads();
296c8e372f0SJeremy L Thompson   *V = 0.0;
297c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
298c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
299c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
300c8e372f0SJeremy L Thompson     }
301c8e372f0SJeremy L Thompson   }
302c8e372f0SJeremy L Thompson }
303c8e372f0SJeremy L Thompson 
304c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
305c8e372f0SJeremy L Thompson // 3D tensor contract y
306c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
307c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
308c8e372f0SJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
309c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
310d6c19ee8SJeremy L Thompson   __syncthreads();
311c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
312c8e372f0SJeremy L Thompson   __syncthreads();
313c8e372f0SJeremy L Thompson   *V = 0.0;
314c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
315c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
316c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
317c8e372f0SJeremy L Thompson     }
318c8e372f0SJeremy L Thompson   }
319c8e372f0SJeremy L Thompson }
320c8e372f0SJeremy L Thompson 
321c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
322c8e372f0SJeremy L Thompson // 3D tensor contract z
323c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
324c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
325c8e372f0SJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
326c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
327d6c19ee8SJeremy L Thompson   __syncthreads();
328c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
329c8e372f0SJeremy L Thompson   __syncthreads();
330c8e372f0SJeremy L Thompson   *V = 0.0;
331c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
332c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
333c8e372f0SJeremy L Thompson       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
334c8e372f0SJeremy L Thompson     }
335c8e372f0SJeremy L Thompson   }
336c8e372f0SJeremy L Thompson }
337c8e372f0SJeremy L Thompson 
338c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
339c8e372f0SJeremy L Thompson // 3D tensor contract z
340c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
341c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
342c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
343c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
344d6c19ee8SJeremy L Thompson   __syncthreads();
345c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
346c8e372f0SJeremy L Thompson   __syncthreads();
347c8e372f0SJeremy L Thompson   *V = 0.0;
348c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
349c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
350c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
351c8e372f0SJeremy L Thompson     }
352c8e372f0SJeremy L Thompson   }
353c8e372f0SJeremy L Thompson }
354c8e372f0SJeremy L Thompson 
355c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
356c8e372f0SJeremy L Thompson // 3D transpose tensor contract z
357c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
358c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
359c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
360c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
361d6c19ee8SJeremy L Thompson   __syncthreads();
362c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
363c8e372f0SJeremy L Thompson   __syncthreads();
364c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
365c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
366c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
367c8e372f0SJeremy L Thompson     }
368c8e372f0SJeremy L Thompson   }
369c8e372f0SJeremy L Thompson }
370c8e372f0SJeremy L Thompson 
371c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
372c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
373c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
374c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
375c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
376c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
377d6c19ee8SJeremy L Thompson   __syncthreads();
378c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
379c8e372f0SJeremy L Thompson   __syncthreads();
380c8e372f0SJeremy L Thompson   *V = 0.0;
381c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
382c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
383c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
384c8e372f0SJeremy L Thompson     }
385c8e372f0SJeremy L Thompson   }
386c8e372f0SJeremy L Thompson }
387c8e372f0SJeremy L Thompson 
388c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
389c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
390c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
391c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
392c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
393c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
394d6c19ee8SJeremy L Thompson   __syncthreads();
395c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
396c8e372f0SJeremy L Thompson   __syncthreads();
397c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
398c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
399c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
400c8e372f0SJeremy L Thompson     }
401c8e372f0SJeremy L Thompson   }
402c8e372f0SJeremy L Thompson }
403c8e372f0SJeremy L Thompson 
404c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
405c8e372f0SJeremy L Thompson // 3D transpose tensor contract x
406c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
407c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
408c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
409c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
410d6c19ee8SJeremy L Thompson   __syncthreads();
411c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
412c8e372f0SJeremy L Thompson   __syncthreads();
413c8e372f0SJeremy L Thompson   *V = 0.0;
414c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
415c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
416c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
417c8e372f0SJeremy L Thompson     }
418c8e372f0SJeremy L Thompson   }
419c8e372f0SJeremy L Thompson }
420c8e372f0SJeremy L Thompson 
421c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
422c8e372f0SJeremy L Thompson // 3D transpose tensor contract add x
423c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
424c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
425c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
426c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
427d6c19ee8SJeremy L Thompson   __syncthreads();
428c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
429c8e372f0SJeremy L Thompson   __syncthreads();
430c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
431c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
432c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
433c8e372f0SJeremy L Thompson     }
434c8e372f0SJeremy L Thompson   }
435c8e372f0SJeremy L Thompson }
436c8e372f0SJeremy L Thompson 
437c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
438c8e372f0SJeremy L Thompson // 3D pack/unpack quadrature values
439c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
440c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
441c8e372f0SJeremy L Thompson inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
442259057edSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
443c8e372f0SJeremy L Thompson 
444c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
445d6c19ee8SJeremy L Thompson     __syncthreads();
446c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
447c8e372f0SJeremy L Thompson     __syncthreads();
448259057edSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
449c8e372f0SJeremy L Thompson   }
450c8e372f0SJeremy L Thompson }
451c8e372f0SJeremy L Thompson 
452c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
453c8e372f0SJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
454259057edSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
455c8e372f0SJeremy L Thompson 
456c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
457d6c19ee8SJeremy L Thompson     __syncthreads();
458259057edSJeremy L Thompson     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
459c8e372f0SJeremy L Thompson     __syncthreads();
460c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
461c8e372f0SJeremy L Thompson   }
462c8e372f0SJeremy L Thompson }
463c8e372f0SJeremy L Thompson 
464c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
465c8e372f0SJeremy L Thompson // 3D interpolate to quadrature points
466c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
467c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
468c8e372f0SJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
469c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
470259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
471c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
472c8e372f0SJeremy L Thompson 
473ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
474c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
475c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
476c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
477c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
478c8e372f0SJeremy L Thompson   }
4793e2e790dSJeremy L Thompson   __syncthreads();
480ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
481ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
482c8e372f0SJeremy L Thompson }
483c8e372f0SJeremy L Thompson 
484c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
485c8e372f0SJeremy L Thompson // 3D interpolate transpose
486c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
487c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
488c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
489c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
490259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
491c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
492c8e372f0SJeremy L Thompson 
493ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
494c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
495c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
496c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
497c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
498c8e372f0SJeremy L Thompson   }
4993e2e790dSJeremy L Thompson   __syncthreads();
500ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
501c8e372f0SJeremy L Thompson }
502c8e372f0SJeremy L Thompson 
503c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
504*0ccda8ebSJeremy L Thompson // 3D interpolate to quadrature points, nodes and quadrature points collocated
505*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
506*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
507*0ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
508*0ccda8ebSJeremy L Thompson                                                               CeedScalar *__restrict__ r_V) {
509*0ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
510*0ccda8ebSJeremy L Thompson 
511*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
512*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
513*0ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
514*0ccda8ebSJeremy L Thompson   }
515*0ccda8ebSJeremy L Thompson   __syncthreads();
516*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
517*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
518*0ccda8ebSJeremy L Thompson }
519*0ccda8ebSJeremy L Thompson 
520*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
521*0ccda8ebSJeremy L Thompson // 3D interpolate transpose, nodes and quadrature points collocated
522*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
523*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
524*0ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
525*0ccda8ebSJeremy L Thompson                                                                        CeedScalar *__restrict__ r_V) {
526*0ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
527*0ccda8ebSJeremy L Thompson 
528*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
529*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
530*0ccda8ebSJeremy L Thompson     r_V[comp] = r_U[comp];
531*0ccda8ebSJeremy L Thompson   }
532*0ccda8ebSJeremy L Thompson   __syncthreads();
533*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
534*0ccda8ebSJeremy L Thompson }
535*0ccda8ebSJeremy L Thompson 
536*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
537c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
538c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
539c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
540c8e372f0SJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
541c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
542259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
543c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
544c8e372f0SJeremy L Thompson 
545ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
546c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
547c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
548c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
549c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
550c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
551c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
552c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
553c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
554c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
555c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
556c8e372f0SJeremy L Thompson   }
5573e2e790dSJeremy L Thompson   __syncthreads();
558ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
559ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
560c8e372f0SJeremy L Thompson }
561c8e372f0SJeremy L Thompson 
562c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
563c8e372f0SJeremy L Thompson // 3D derivatives transpose
564c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
565c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
566c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
567c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
568259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
569c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
570c8e372f0SJeremy L Thompson 
571ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
572c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
573c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
574c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
575c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
576c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
577c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
578c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
579c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
580c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
581c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
582c8e372f0SJeremy L Thompson   }
5833e2e790dSJeremy L Thompson   __syncthreads();
584ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
585c8e372f0SJeremy L Thompson }
586c8e372f0SJeremy L Thompson 
587c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
588c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
589c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
590c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
591c8e372f0SJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
592c8e372f0SJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
593259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
594c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
595c8e372f0SJeremy L Thompson 
596ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
597c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
598c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
599c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
600c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
601c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
602c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
603c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
604c8e372f0SJeremy L Thompson   }
6053e2e790dSJeremy L Thompson   __syncthreads();
606ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
607ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
608c8e372f0SJeremy L Thompson }
609c8e372f0SJeremy L Thompson 
610c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
611c8e372f0SJeremy L Thompson // 3D derivatives transpose
612c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
613c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
614*0ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
615c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
616259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
617c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
618c8e372f0SJeremy L Thompson 
619ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
620c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
621*0ccda8ebSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
622*0ccda8ebSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
623*0ccda8ebSJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
624*0ccda8ebSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
625*0ccda8ebSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
626*0ccda8ebSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
627*0ccda8ebSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
628*0ccda8ebSJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
629*0ccda8ebSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
630*0ccda8ebSJeremy L Thompson   }
631*0ccda8ebSJeremy L Thompson   __syncthreads();
632*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
633*0ccda8ebSJeremy L Thompson }
634*0ccda8ebSJeremy L Thompson 
635*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
636*0ccda8ebSJeremy L Thompson // 3D derivatives at quadrature points, nodes and quadrature points collocated
637*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
638*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
639*0ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
640*0ccda8ebSJeremy L Thompson                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
641*0ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
642*0ccda8ebSJeremy L Thompson 
643*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
644*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
645*0ccda8ebSJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
646*0ccda8ebSJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
647*0ccda8ebSJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
648*0ccda8ebSJeremy L Thompson   }
649*0ccda8ebSJeremy L Thompson   __syncthreads();
650*0ccda8ebSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
651*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
652*0ccda8ebSJeremy L Thompson }
653*0ccda8ebSJeremy L Thompson 
654*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
655*0ccda8ebSJeremy L Thompson // 3D derivatives transpose, nodes and quadrature points collocated
656*0ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
657*0ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
658*0ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
659*0ccda8ebSJeremy L Thompson                                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
660*0ccda8ebSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
661*0ccda8ebSJeremy L Thompson 
662*0ccda8ebSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
663*0ccda8ebSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
664*0ccda8ebSJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]);
665*0ccda8ebSJeremy L Thompson     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
666*0ccda8ebSJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
667c8e372f0SJeremy L Thompson   }
6683e2e790dSJeremy L Thompson   __syncthreads();
669ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
670c8e372f0SJeremy L Thompson }
671c8e372f0SJeremy L Thompson 
672c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
673c8e372f0SJeremy L Thompson // 3D quadrature weights
674c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
675c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
676c8e372f0SJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
677259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
678c8e372f0SJeremy L Thompson 
679c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
680c8e372f0SJeremy L Thompson }
681