xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h (revision c8e372f0bd7b7998d8acb114f0b7d6adab16132f)
1*c8e372f0SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2*c8e372f0SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*c8e372f0SJeremy L Thompson //
4*c8e372f0SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5*c8e372f0SJeremy L Thompson //
6*c8e372f0SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7*c8e372f0SJeremy L Thompson 
8*c8e372f0SJeremy L Thompson /// @file
9*c8e372f0SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis templates
10*c8e372f0SJeremy L Thompson #include <ceed/types.h>
11*c8e372f0SJeremy L Thompson 
12*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
13*c8e372f0SJeremy L Thompson // 2D
14*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
15*c8e372f0SJeremy L Thompson 
16*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
17*c8e372f0SJeremy L Thompson // 2D tensor contraction x
18*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
19*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
20*c8e372f0SJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
21*c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
22*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
23*c8e372f0SJeremy L Thompson   __syncthreads();
24*c8e372f0SJeremy L Thompson   *V = 0.0;
25*c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
26*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
27*c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
28*c8e372f0SJeremy L Thompson     }
29*c8e372f0SJeremy L Thompson   }
30*c8e372f0SJeremy L Thompson   __syncthreads();
31*c8e372f0SJeremy L Thompson }
32*c8e372f0SJeremy L Thompson 
33*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
34*c8e372f0SJeremy L Thompson // 2D tensor contract y
35*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
36*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
37*c8e372f0SJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
38*c8e372f0SJeremy L Thompson                                             CeedScalar *V) {
39*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
40*c8e372f0SJeremy L Thompson   __syncthreads();
41*c8e372f0SJeremy L Thompson   *V = 0.0;
42*c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D) {
43*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
44*c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
45*c8e372f0SJeremy L Thompson     }
46*c8e372f0SJeremy L Thompson   }
47*c8e372f0SJeremy L Thompson   __syncthreads();
48*c8e372f0SJeremy L Thompson }
49*c8e372f0SJeremy L Thompson 
50*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
51*c8e372f0SJeremy L Thompson // 2D transpose tensor contract y
52*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
53*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
54*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
55*c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
56*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
57*c8e372f0SJeremy L Thompson   __syncthreads();
58*c8e372f0SJeremy L Thompson   *V = 0.0;
59*c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D) {
60*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
61*c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
62*c8e372f0SJeremy L Thompson     }
63*c8e372f0SJeremy L Thompson   }
64*c8e372f0SJeremy L Thompson   __syncthreads();
65*c8e372f0SJeremy L Thompson }
66*c8e372f0SJeremy L Thompson 
67*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
68*c8e372f0SJeremy L Thompson // 2D transpose tensor contract x
69*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
70*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
71*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
72*c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
73*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
74*c8e372f0SJeremy L Thompson   __syncthreads();
75*c8e372f0SJeremy L Thompson   *V = 0.0;
76*c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
77*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
78*c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
79*c8e372f0SJeremy L Thompson     }
80*c8e372f0SJeremy L Thompson   }
81*c8e372f0SJeremy L Thompson   __syncthreads();
82*c8e372f0SJeremy L Thompson }
83*c8e372f0SJeremy L Thompson 
84*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
85*c8e372f0SJeremy L Thompson // 2D transpose tensor contract and add x
86*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
87*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
88*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
89*c8e372f0SJeremy L Thompson                                                         const CeedScalar *B, CeedScalar *V) {
90*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D] = *U;
91*c8e372f0SJeremy L Thompson   __syncthreads();
92*c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D) {
93*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
94*c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
95*c8e372f0SJeremy L Thompson     }
96*c8e372f0SJeremy L Thompson   }
97*c8e372f0SJeremy L Thompson   __syncthreads();
98*c8e372f0SJeremy L Thompson }
99*c8e372f0SJeremy L Thompson 
100*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
101*c8e372f0SJeremy L Thompson // 2D pack/unpack quadrature values
102*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
103*c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
104*c8e372f0SJeremy L Thompson inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
105*c8e372f0SJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
106*c8e372f0SJeremy L Thompson 
107*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108*c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
109*c8e372f0SJeremy L Thompson     __syncthreads();
110*c8e372f0SJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
111*c8e372f0SJeremy L Thompson     __syncthreads();
112*c8e372f0SJeremy L Thompson   }
113*c8e372f0SJeremy L Thompson }
114*c8e372f0SJeremy L Thompson 
115*c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
116*c8e372f0SJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
117*c8e372f0SJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
118*c8e372f0SJeremy L Thompson 
119*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120*c8e372f0SJeremy L Thompson     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
121*c8e372f0SJeremy L Thompson     __syncthreads();
122*c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
123*c8e372f0SJeremy L Thompson     __syncthreads();
124*c8e372f0SJeremy L Thompson   }
125*c8e372f0SJeremy L Thompson }
126*c8e372f0SJeremy L Thompson 
127*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
128*c8e372f0SJeremy L Thompson // 2D interpolate to quadrature points
129*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
130*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
131*c8e372f0SJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
132*c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
133*c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
134*c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
135*c8e372f0SJeremy L Thompson 
136*c8e372f0SJeremy L Thompson   QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
137*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138*c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
139*c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
140*c8e372f0SJeremy L Thompson   }
141*c8e372f0SJeremy L Thompson   QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
142*c8e372f0SJeremy L Thompson }
143*c8e372f0SJeremy L Thompson 
144*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
145*c8e372f0SJeremy L Thompson // 2D interpolate transpose
146*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
147*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
148*c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
149*c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
150*c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
151*c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
152*c8e372f0SJeremy L Thompson 
153*c8e372f0SJeremy L Thompson   QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
154*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
155*c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
156*c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
157*c8e372f0SJeremy L Thompson   }
158*c8e372f0SJeremy L Thompson   QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
159*c8e372f0SJeremy L Thompson }
160*c8e372f0SJeremy L Thompson 
161*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
162*c8e372f0SJeremy L Thompson // 2D derivatives at quadrature points
163*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
164*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
165*c8e372f0SJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
166*c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
167*c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
168*c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
169*c8e372f0SJeremy L Thompson 
170*c8e372f0SJeremy L Thompson   QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
171*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
172*c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
173*c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
174*c8e372f0SJeremy L Thompson     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
175*c8e372f0SJeremy L Thompson     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
176*c8e372f0SJeremy L Thompson   }
177*c8e372f0SJeremy L Thompson   QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
178*c8e372f0SJeremy L Thompson }
179*c8e372f0SJeremy L Thompson 
180*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
181*c8e372f0SJeremy L Thompson // 2D derivatives transpose
182*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
183*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
184*c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
185*c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
186*c8e372f0SJeremy L Thompson   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
187*c8e372f0SJeremy L Thompson   CeedScalar r_t[1];
188*c8e372f0SJeremy L Thompson 
189*c8e372f0SJeremy L Thompson   QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
190*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
191*c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
192*c8e372f0SJeremy L Thompson     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
193*c8e372f0SJeremy L Thompson     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
194*c8e372f0SJeremy L Thompson     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
195*c8e372f0SJeremy L Thompson   }
196*c8e372f0SJeremy L Thompson   QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
197*c8e372f0SJeremy L Thompson }
198*c8e372f0SJeremy L Thompson 
199*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
200*c8e372f0SJeremy L Thompson // 2D quadrature weights
201*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
202*c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
203*c8e372f0SJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
204*c8e372f0SJeremy L Thompson   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
205*c8e372f0SJeremy L Thompson 
206*c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
207*c8e372f0SJeremy L Thompson }
208*c8e372f0SJeremy L Thompson 
209*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
210*c8e372f0SJeremy L Thompson // 3D
211*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
212*c8e372f0SJeremy L Thompson 
213*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
214*c8e372f0SJeremy L Thompson // 3D tensor contract x
215*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
216*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
217*c8e372f0SJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
218*c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
219*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
220*c8e372f0SJeremy L Thompson   __syncthreads();
221*c8e372f0SJeremy L Thompson   *V = 0.0;
222*c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
223*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
224*c8e372f0SJeremy L Thompson       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
225*c8e372f0SJeremy L Thompson     }
226*c8e372f0SJeremy L Thompson   }
227*c8e372f0SJeremy L Thompson   __syncthreads();
228*c8e372f0SJeremy L Thompson }
229*c8e372f0SJeremy L Thompson 
230*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
231*c8e372f0SJeremy L Thompson // 3D tensor contract y
232*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
233*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
234*c8e372f0SJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
235*c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
236*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
237*c8e372f0SJeremy L Thompson   __syncthreads();
238*c8e372f0SJeremy L Thompson   *V = 0.0;
239*c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
240*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
241*c8e372f0SJeremy L Thompson       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
242*c8e372f0SJeremy L Thompson     }
243*c8e372f0SJeremy L Thompson   }
244*c8e372f0SJeremy L Thompson   __syncthreads();
245*c8e372f0SJeremy L Thompson }
246*c8e372f0SJeremy L Thompson 
247*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
248*c8e372f0SJeremy L Thompson // 3D tensor contract z
249*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
250*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
251*c8e372f0SJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
252*c8e372f0SJeremy L Thompson                                             const CeedScalar *B, CeedScalar *V) {
253*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
254*c8e372f0SJeremy L Thompson   __syncthreads();
255*c8e372f0SJeremy L Thompson   *V = 0.0;
256*c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
257*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < P_1D; i++) {
258*c8e372f0SJeremy L Thompson       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
259*c8e372f0SJeremy L Thompson     }
260*c8e372f0SJeremy L Thompson   }
261*c8e372f0SJeremy L Thompson   __syncthreads();
262*c8e372f0SJeremy L Thompson }
263*c8e372f0SJeremy L Thompson 
264*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
265*c8e372f0SJeremy L Thompson // 3D tensor contract z
266*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
267*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
268*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
269*c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
270*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
271*c8e372f0SJeremy L Thompson   __syncthreads();
272*c8e372f0SJeremy L Thompson   *V = 0.0;
273*c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
274*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
275*c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
276*c8e372f0SJeremy L Thompson     }
277*c8e372f0SJeremy L Thompson   }
278*c8e372f0SJeremy L Thompson   __syncthreads();
279*c8e372f0SJeremy L Thompson }
280*c8e372f0SJeremy L Thompson 
281*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
282*c8e372f0SJeremy L Thompson // 3D transpose tensor contract z
283*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
284*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
285*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
286*c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
287*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
288*c8e372f0SJeremy L Thompson   __syncthreads();
289*c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
290*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
291*c8e372f0SJeremy L Thompson       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
292*c8e372f0SJeremy L Thompson     }
293*c8e372f0SJeremy L Thompson   }
294*c8e372f0SJeremy L Thompson   __syncthreads();
295*c8e372f0SJeremy L Thompson }
296*c8e372f0SJeremy L Thompson 
297*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
298*c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
299*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
300*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
301*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
302*c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
303*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
304*c8e372f0SJeremy L Thompson   __syncthreads();
305*c8e372f0SJeremy L Thompson   *V = 0.0;
306*c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
307*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
308*c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
309*c8e372f0SJeremy L Thompson     }
310*c8e372f0SJeremy L Thompson   }
311*c8e372f0SJeremy L Thompson   __syncthreads();
312*c8e372f0SJeremy L Thompson }
313*c8e372f0SJeremy L Thompson 
314*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
315*c8e372f0SJeremy L Thompson // 3D transpose tensor contract y
316*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
317*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
318*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
319*c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
320*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
321*c8e372f0SJeremy L Thompson   __syncthreads();
322*c8e372f0SJeremy L Thompson   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
323*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
324*c8e372f0SJeremy L Thompson       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
325*c8e372f0SJeremy L Thompson     }
326*c8e372f0SJeremy L Thompson   }
327*c8e372f0SJeremy L Thompson   __syncthreads();
328*c8e372f0SJeremy L Thompson }
329*c8e372f0SJeremy L Thompson 
330*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
331*c8e372f0SJeremy L Thompson // 3D transpose tensor contract x
332*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
333*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
334*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
335*c8e372f0SJeremy L Thompson                                                      const CeedScalar *B, CeedScalar *V) {
336*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
337*c8e372f0SJeremy L Thompson   __syncthreads();
338*c8e372f0SJeremy L Thompson   *V = 0.0;
339*c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
340*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
341*c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
342*c8e372f0SJeremy L Thompson     }
343*c8e372f0SJeremy L Thompson   }
344*c8e372f0SJeremy L Thompson   __syncthreads();
345*c8e372f0SJeremy L Thompson }
346*c8e372f0SJeremy L Thompson 
347*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
348*c8e372f0SJeremy L Thompson // 3D transpose tensor contract add x
349*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
350*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
351*c8e372f0SJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
352*c8e372f0SJeremy L Thompson                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
353*c8e372f0SJeremy L Thompson   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
354*c8e372f0SJeremy L Thompson   __syncthreads();
355*c8e372f0SJeremy L Thompson   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
356*c8e372f0SJeremy L Thompson     for (CeedInt i = 0; i < Q_1D; i++) {
357*c8e372f0SJeremy L Thompson       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
358*c8e372f0SJeremy L Thompson     }
359*c8e372f0SJeremy L Thompson   }
360*c8e372f0SJeremy L Thompson   __syncthreads();
361*c8e372f0SJeremy L Thompson }
362*c8e372f0SJeremy L Thompson 
363*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
364*c8e372f0SJeremy L Thompson // 3D pack/unpack quadrature values
365*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
366*c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
367*c8e372f0SJeremy L Thompson inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
368*c8e372f0SJeremy L Thompson   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
369*c8e372f0SJeremy L Thompson 
370*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
371*c8e372f0SJeremy L Thompson     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
372*c8e372f0SJeremy L Thompson     __syncthreads();
373*c8e372f0SJeremy L Thompson     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
374*c8e372f0SJeremy L Thompson     __syncthreads();
375*c8e372f0SJeremy L Thompson   }
376*c8e372f0SJeremy L Thompson }
377*c8e372f0SJeremy L Thompson 
378*c8e372f0SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
379*c8e372f0SJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
380*c8e372f0SJeremy L Thompson   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
381*c8e372f0SJeremy L Thompson 
382*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
383*c8e372f0SJeremy L Thompson     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
384*c8e372f0SJeremy L Thompson     __syncthreads();
385*c8e372f0SJeremy L Thompson     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
386*c8e372f0SJeremy L Thompson     __syncthreads();
387*c8e372f0SJeremy L Thompson   }
388*c8e372f0SJeremy L Thompson }
389*c8e372f0SJeremy L Thompson 
390*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
391*c8e372f0SJeremy L Thompson // 3D interpolate to quadrature points
392*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
393*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
394*c8e372f0SJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
395*c8e372f0SJeremy L Thompson                                                CeedScalar *__restrict__ r_V) {
396*c8e372f0SJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
397*c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
398*c8e372f0SJeremy L Thompson 
399*c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
400*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
401*c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
402*c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
403*c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
404*c8e372f0SJeremy L Thompson   }
405*c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
406*c8e372f0SJeremy L Thompson }
407*c8e372f0SJeremy L Thompson 
408*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
409*c8e372f0SJeremy L Thompson // 3D interpolate transpose
410*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
411*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
412*c8e372f0SJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
413*c8e372f0SJeremy L Thompson                                                         CeedScalar *__restrict__ r_V) {
414*c8e372f0SJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
415*c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
416*c8e372f0SJeremy L Thompson 
417*c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
418*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
419*c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
420*c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
421*c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
422*c8e372f0SJeremy L Thompson   }
423*c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
424*c8e372f0SJeremy L Thompson }
425*c8e372f0SJeremy L Thompson 
426*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
427*c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
428*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
429*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
430*c8e372f0SJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
431*c8e372f0SJeremy L Thompson                                              CeedScalar *__restrict__ r_V) {
432*c8e372f0SJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
433*c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
434*c8e372f0SJeremy L Thompson 
435*c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
436*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
437*c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
438*c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
439*c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
440*c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
441*c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
442*c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
443*c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
444*c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
445*c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
446*c8e372f0SJeremy L Thompson   }
447*c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
448*c8e372f0SJeremy L Thompson }
449*c8e372f0SJeremy L Thompson 
450*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
451*c8e372f0SJeremy L Thompson // 3D derivatives transpose
452*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
453*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
454*c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
455*c8e372f0SJeremy L Thompson                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
456*c8e372f0SJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
457*c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
458*c8e372f0SJeremy L Thompson 
459*c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
460*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
461*c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
462*c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
463*c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
464*c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
465*c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
466*c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
467*c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
468*c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
469*c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
470*c8e372f0SJeremy L Thompson   }
471*c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
472*c8e372f0SJeremy L Thompson }
473*c8e372f0SJeremy L Thompson 
474*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
475*c8e372f0SJeremy L Thompson // 3D derivatives at quadrature points
476*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
477*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
478*c8e372f0SJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
479*c8e372f0SJeremy L Thompson                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
480*c8e372f0SJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
481*c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
482*c8e372f0SJeremy L Thompson 
483*c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
484*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
485*c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
486*c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
487*c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
488*c8e372f0SJeremy L Thompson     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
489*c8e372f0SJeremy L Thompson     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
490*c8e372f0SJeremy L Thompson     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
491*c8e372f0SJeremy L Thompson   }
492*c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
493*c8e372f0SJeremy L Thompson }
494*c8e372f0SJeremy L Thompson 
495*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
496*c8e372f0SJeremy L Thompson // 3D derivatives transpose
497*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
498*c8e372f0SJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
499*c8e372f0SJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
500*c8e372f0SJeremy L Thompson                                                                 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
501*c8e372f0SJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
502*c8e372f0SJeremy L Thompson   CeedScalar    r_t1[1], r_t2[1];
503*c8e372f0SJeremy L Thompson 
504*c8e372f0SJeremy L Thompson   QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
505*c8e372f0SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
506*c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
507*c8e372f0SJeremy L Thompson     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
508*c8e372f0SJeremy L Thompson     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
509*c8e372f0SJeremy L Thompson     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
510*c8e372f0SJeremy L Thompson     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
511*c8e372f0SJeremy L Thompson     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
512*c8e372f0SJeremy L Thompson   }
513*c8e372f0SJeremy L Thompson   QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
514*c8e372f0SJeremy L Thompson }
515*c8e372f0SJeremy L Thompson 
516*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
517*c8e372f0SJeremy L Thompson // 3D quadrature weights
518*c8e372f0SJeremy L Thompson //------------------------------------------------------------------------------
519*c8e372f0SJeremy L Thompson template <int P_1D, int Q_1D>
520*c8e372f0SJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
521*c8e372f0SJeremy L Thompson   const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
522*c8e372f0SJeremy L Thompson 
523*c8e372f0SJeremy L Thompson   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
524*c8e372f0SJeremy L Thompson }
525