xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h (revision 3e2e790d55559d3802e94c4d2ca1c8066babe5db)
1c8e372f0SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2c8e372f0SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3c8e372f0SJeremy L Thompson //
4c8e372f0SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5c8e372f0SJeremy L Thompson //
6c8e372f0SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7c8e372f0SJeremy L Thompson 
8c8e372f0SJeremy L Thompson /// @file
9c8e372f0SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis templates
10c8e372f0SJeremy L Thompson #include <ceed/types.h>
11c8e372f0SJeremy L Thompson 
12c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
13c8e372f0SJeremy L Thompson // 2D
14c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
15c8e372f0SJeremy L Thompson 
16c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
17c8e372f0SJeremy L Thompson // 2D tensor contraction x
18c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
19c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
20c8e372f0SJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
21c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
22d6c19ee8SJeremy L Thompson   __syncthreads();
23c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
24c8e372f0SJeremy L Thompson   __syncthreads();
25c8e372f0SJeremy L Thompson   *V = 0.0;
26c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
27c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
28c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
29c8e372f0SJeremy L Thompson     }
30c8e372f0SJeremy L Thompson   }
31c8e372f0SJeremy L Thompson }
32c8e372f0SJeremy L Thompson 
33c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
34c8e372f0SJeremy L Thompson // 2D tensor contract y
35c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
36c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
37c8e372f0SJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
38c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
39d6c19ee8SJeremy L Thompson   __syncthreads();
40c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
41c8e372f0SJeremy L Thompson   __syncthreads();
42c8e372f0SJeremy L Thompson   *V = 0.0;
43c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
44c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
45c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
46c8e372f0SJeremy L Thompson     }
47c8e372f0SJeremy L Thompson   }
48c8e372f0SJeremy L Thompson }
49c8e372f0SJeremy L Thompson 
50c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
51c8e372f0SJeremy L Thompson // 2D transpose tensor contract y
52c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
53c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
54c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
55c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
56d6c19ee8SJeremy L Thompson   __syncthreads();
57c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
58c8e372f0SJeremy L Thompson   __syncthreads();
59c8e372f0SJeremy L Thompson   *V = 0.0;
60c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
61c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
62c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
63c8e372f0SJeremy L Thompson     }
64c8e372f0SJeremy L Thompson   }
65c8e372f0SJeremy L Thompson }
66c8e372f0SJeremy L Thompson 
67c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
68c8e372f0SJeremy L Thompson // 2D transpose tensor contract x
69c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
70c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
71c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
72c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
73d6c19ee8SJeremy L Thompson   __syncthreads();
74c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
75c8e372f0SJeremy L Thompson   __syncthreads();
76c8e372f0SJeremy L Thompson   *V = 0.0;
77c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
78c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
79c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
80c8e372f0SJeremy L Thompson     }
81c8e372f0SJeremy L Thompson   }
82c8e372f0SJeremy L Thompson }
83c8e372f0SJeremy L Thompson 
84c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
85c8e372f0SJeremy L Thompson // 2D transpose tensor contract and add x
86c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
87c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
88c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
89c8e372f0SJeremy L Thompson                                                         const CeedScalar *B, CeedScalar *V) {
90d6c19ee8SJeremy L Thompson   __syncthreads();
91c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
92c8e372f0SJeremy L Thompson   __syncthreads();
93c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
94c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
95c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
96c8e372f0SJeremy L Thompson     }
97c8e372f0SJeremy L Thompson   }
98c8e372f0SJeremy L Thompson }
99c8e372f0SJeremy L Thompson 
100c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
101c8e372f0SJeremy L Thompson // 2D pack/unpack quadrature values
102c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
103c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
104c8e372f0SJeremy L Thompson inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
105c8e372f0SJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
106c8e372f0SJeremy L Thompson 
107c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108d6c19ee8SJeremy L Thompson     __syncthreads();
109c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
110c8e372f0SJeremy L Thompson     __syncthreads();
111c8e372f0SJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
112c8e372f0SJeremy L Thompson   }
113c8e372f0SJeremy L Thompson }
114c8e372f0SJeremy L Thompson 
115c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
116c8e372f0SJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
117c8e372f0SJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
118c8e372f0SJeremy L Thompson 
119c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120d6c19ee8SJeremy L Thompson     __syncthreads();
121c8e372f0SJeremy L Thompson     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
122c8e372f0SJeremy L Thompson     __syncthreads();
123c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
124c8e372f0SJeremy L Thompson   }
125c8e372f0SJeremy L Thompson }
126c8e372f0SJeremy L Thompson 
127c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
128c8e372f0SJeremy L Thompson // 2D interpolate to quadrature points
129c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
130c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
131c8e372f0SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
132c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
133c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
134c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
135c8e372f0SJeremy L Thompson 
136ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
137c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
139c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
140c8e372f0SJeremy L Thompson   }
141*3e2e790dSJeremy L Thompson   __syncthreads();
142ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
143ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
144c8e372f0SJeremy L Thompson }
145c8e372f0SJeremy L Thompson 
146c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
147c8e372f0SJeremy L Thompson // 2D interpolate transpose
148c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
149c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
150c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
151c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
152c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
153c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
154c8e372f0SJeremy L Thompson 
155ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
156c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
157c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
158c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
159c8e372f0SJeremy L Thompson   }
160*3e2e790dSJeremy L Thompson   __syncthreads();
161ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
162c8e372f0SJeremy L Thompson }
163c8e372f0SJeremy L Thompson 
164c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
165c8e372f0SJeremy L Thompson // 2D derivatives at quadrature points
166c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
167c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
168c8e372f0SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
169c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
170c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
171c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
172c8e372f0SJeremy L Thompson 
173ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
174c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
175c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
176c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
177c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
178c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
179c8e372f0SJeremy L Thompson   }
180*3e2e790dSJeremy L Thompson   __syncthreads();
181ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
182ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
183c8e372f0SJeremy L Thompson }
184c8e372f0SJeremy L Thompson 
185c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
186c8e372f0SJeremy L Thompson // 2D derivatives transpose
187c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
188c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
189c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
190c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
191c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
192c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
193c8e372f0SJeremy L Thompson 
194ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
195c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
196c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
197c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
198c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
199c8e372f0SJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
200c8e372f0SJeremy L Thompson   }
201*3e2e790dSJeremy L Thompson   __syncthreads();
202ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
203c8e372f0SJeremy L Thompson }
204c8e372f0SJeremy L Thompson 
205c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
206c8e372f0SJeremy L Thompson // 2D quadrature weights
207c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
208c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
209c8e372f0SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
210c8e372f0SJeremy L Thompson   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
211c8e372f0SJeremy L Thompson 
212c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
213c8e372f0SJeremy L Thompson }
214c8e372f0SJeremy L Thompson 
215c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
216c8e372f0SJeremy L Thompson // 3D
217c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
218c8e372f0SJeremy L Thompson 
219c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
220c8e372f0SJeremy L Thompson // 3D tensor contract x
221c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
222c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
223c8e372f0SJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
224c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
225d6c19ee8SJeremy L Thompson   __syncthreads();
226c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
227c8e372f0SJeremy L Thompson   __syncthreads();
228c8e372f0SJeremy L Thompson   *V = 0.0;
229c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
230c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
231c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
232c8e372f0SJeremy L Thompson     }
233c8e372f0SJeremy L Thompson   }
234c8e372f0SJeremy L Thompson }
235c8e372f0SJeremy L Thompson 
236c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
237c8e372f0SJeremy L Thompson // 3D tensor contract y
238c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
239c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
240c8e372f0SJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
241c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
242d6c19ee8SJeremy L Thompson   __syncthreads();
243c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
244c8e372f0SJeremy L Thompson   __syncthreads();
245c8e372f0SJeremy L Thompson   *V = 0.0;
246c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
247c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
248c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
249c8e372f0SJeremy L Thompson     }
250c8e372f0SJeremy L Thompson   }
251c8e372f0SJeremy L Thompson }
252c8e372f0SJeremy L Thompson 
253c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
254c8e372f0SJeremy L Thompson // 3D tensor contract z
255c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
256c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
257c8e372f0SJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
258c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
259d6c19ee8SJeremy L Thompson   __syncthreads();
260c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
261c8e372f0SJeremy L Thompson   __syncthreads();
262c8e372f0SJeremy L Thompson   *V = 0.0;
263c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
264c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
265c8e372f0SJeremy L Thompson       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
266c8e372f0SJeremy L Thompson     }
267c8e372f0SJeremy L Thompson   }
268c8e372f0SJeremy L Thompson }
269c8e372f0SJeremy L Thompson 
270c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
271c8e372f0SJeremy L Thompson // 3D tensor contract z
272c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
273c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
274c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
275c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
276d6c19ee8SJeremy L Thompson   __syncthreads();
277c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
278c8e372f0SJeremy L Thompson   __syncthreads();
279c8e372f0SJeremy L Thompson   *V = 0.0;
280c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
281c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
282c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
283c8e372f0SJeremy L Thompson     }
284c8e372f0SJeremy L Thompson   }
285c8e372f0SJeremy L Thompson }
286c8e372f0SJeremy L Thompson 
287c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
288c8e372f0SJeremy L Thompson // 3D transpose tensor contract z
289c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
290c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
291c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
292c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
293d6c19ee8SJeremy L Thompson   __syncthreads();
294c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
295c8e372f0SJeremy L Thompson   __syncthreads();
296c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
297c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
298c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
299c8e372f0SJeremy L Thompson     }
300c8e372f0SJeremy L Thompson   }
301c8e372f0SJeremy L Thompson }
302c8e372f0SJeremy L Thompson 
303c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
304c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
305c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
306c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
307c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
308c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
309d6c19ee8SJeremy L Thompson   __syncthreads();
310c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
311c8e372f0SJeremy L Thompson   __syncthreads();
312c8e372f0SJeremy L Thompson   *V = 0.0;
313c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
314c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
315c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
316c8e372f0SJeremy L Thompson     }
317c8e372f0SJeremy L Thompson   }
318c8e372f0SJeremy L Thompson }
319c8e372f0SJeremy L Thompson 
320c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
321c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
322c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
323c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
324c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
325c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
326d6c19ee8SJeremy L Thompson   __syncthreads();
327c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
328c8e372f0SJeremy L Thompson   __syncthreads();
329c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
330c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
331c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
332c8e372f0SJeremy L Thompson     }
333c8e372f0SJeremy L Thompson   }
334c8e372f0SJeremy L Thompson }
335c8e372f0SJeremy L Thompson 
336c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
337c8e372f0SJeremy L Thompson // 3D transpose tensor contract x
338c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
339c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
340c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
341c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
342d6c19ee8SJeremy L Thompson   __syncthreads();
343c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
344c8e372f0SJeremy L Thompson   __syncthreads();
345c8e372f0SJeremy L Thompson   *V = 0.0;
346c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
347c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
348c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
349c8e372f0SJeremy L Thompson     }
350c8e372f0SJeremy L Thompson   }
351c8e372f0SJeremy L Thompson }
352c8e372f0SJeremy L Thompson 
353c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
354c8e372f0SJeremy L Thompson // 3D transpose tensor contract add x
355c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
356c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
357c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
358c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
359d6c19ee8SJeremy L Thompson   __syncthreads();
360c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
361c8e372f0SJeremy L Thompson   __syncthreads();
362c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
363c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
364c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
365c8e372f0SJeremy L Thompson     }
366c8e372f0SJeremy L Thompson   }
367c8e372f0SJeremy L Thompson }
368c8e372f0SJeremy L Thompson 
369c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
370c8e372f0SJeremy L Thompson // 3D pack/unpack quadrature values
371c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
372c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
373c8e372f0SJeremy L Thompson inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
374259057edSJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
375c8e372f0SJeremy L Thompson 
376c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
377d6c19ee8SJeremy L Thompson     __syncthreads();
378c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
379c8e372f0SJeremy L Thompson     __syncthreads();
380259057edSJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
381c8e372f0SJeremy L Thompson   }
382c8e372f0SJeremy L Thompson }
383c8e372f0SJeremy L Thompson 
384c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
385c8e372f0SJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
386259057edSJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
387c8e372f0SJeremy L Thompson 
388c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
389d6c19ee8SJeremy L Thompson     __syncthreads();
390259057edSJeremy L Thompson     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
391c8e372f0SJeremy L Thompson     __syncthreads();
392c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
393c8e372f0SJeremy L Thompson   }
394c8e372f0SJeremy L Thompson }
395c8e372f0SJeremy L Thompson 
396c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
397c8e372f0SJeremy L Thompson // 3D interpolate to quadrature points
398c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
399c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
400c8e372f0SJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
401c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
402259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
403c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
404c8e372f0SJeremy L Thompson 
405ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
406c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
407c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
408c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
409c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
410c8e372f0SJeremy L Thompson   }
411*3e2e790dSJeremy L Thompson   __syncthreads();
412ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
413ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
414c8e372f0SJeremy L Thompson }
415c8e372f0SJeremy L Thompson 
416c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
417c8e372f0SJeremy L Thompson // 3D interpolate transpose
418c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
419c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
420c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
421c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
422259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
423c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
424c8e372f0SJeremy L Thompson 
425ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
426c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
427c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
428c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
429c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
430c8e372f0SJeremy L Thompson   }
431*3e2e790dSJeremy L Thompson   __syncthreads();
432ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
433c8e372f0SJeremy L Thompson }
434c8e372f0SJeremy L Thompson 
435c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
436c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
437c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
438c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
439c8e372f0SJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
440c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
441259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
442c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
443c8e372f0SJeremy L Thompson 
444ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
445c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
446c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
447c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
448c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
449c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
450c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
451c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
452c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
453c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
454c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
455c8e372f0SJeremy L Thompson   }
456*3e2e790dSJeremy L Thompson   __syncthreads();
457ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
458ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
459c8e372f0SJeremy L Thompson }
460c8e372f0SJeremy L Thompson 
461c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
462c8e372f0SJeremy L Thompson // 3D derivatives transpose
463c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
464c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
465c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
466c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
467259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
468c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
469c8e372f0SJeremy L Thompson 
470ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
471c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
472c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
473c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
474c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
475c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
476c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
477c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
478c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
479c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
480c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
481c8e372f0SJeremy L Thompson   }
482*3e2e790dSJeremy L Thompson   __syncthreads();
483ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
484c8e372f0SJeremy L Thompson }
485c8e372f0SJeremy L Thompson 
486c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
487c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
488c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
489c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
490c8e372f0SJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
491c8e372f0SJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
492259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
493c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
494c8e372f0SJeremy L Thompson 
495ce44184cSJeremy L Thompson   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
496c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
497c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
498c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
499c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
500c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
501c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
502c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
503c8e372f0SJeremy L Thompson   }
504*3e2e790dSJeremy L Thompson   __syncthreads();
505ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
506ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
507c8e372f0SJeremy L Thompson }
508c8e372f0SJeremy L Thompson 
509c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
510c8e372f0SJeremy L Thompson // 3D derivatives transpose
511c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
512c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
513c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
514c8e372f0SJeremy L Thompson                                                                 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
515259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
516c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
517c8e372f0SJeremy L Thompson 
518ce44184cSJeremy L Thompson   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
519c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
520c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
521c8e372f0SJeremy L Thompson     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
522c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
523c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
524c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
525c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
526c8e372f0SJeremy L Thompson   }
527*3e2e790dSJeremy L Thompson   __syncthreads();
528ce44184cSJeremy L Thompson   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
529c8e372f0SJeremy L Thompson }
530c8e372f0SJeremy L Thompson 
531c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
532c8e372f0SJeremy L Thompson // 3D quadrature weights
533c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
534c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
535c8e372f0SJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
536259057edSJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
537c8e372f0SJeremy L Thompson 
538c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
539c8e372f0SJeremy L Thompson }
540