xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h (revision 5daefc96c3d6c1b0bbb656215f0640792d88e993)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 2D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 2D tensor contraction x
18 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  // Stage each thread's value into the shared slice, then contract along x:
  // V(q_x, j) = sum_i B(i, q_x) * U(i, j). Barriers guard the shared write/read.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt k = 0; k < P_1D; k++) sum += B[k + t_id_x * P_1D] * data.slice[k + t_id_y * T_1D];  // Contract x direction
  }
  *V = sum;
}
32 
33 //------------------------------------------------------------------------------
34 // 2D tensor contract y
35 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  // Stage each thread's value into the shared slice, then contract along y:
  // V(i, q_y) = sum_j B(j, q_y) * U(i, j). Barriers guard the shared write/read.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt k = 0; k < P_1D; k++) sum += B[k + t_id_y * P_1D] * data.slice[t_id_x + k * T_1D];  // Contract y direction
  }
  *V = sum;
}
49 
50 //------------------------------------------------------------------------------
51 // 2D transpose tensor contract y
52 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Transpose contraction along y: V(i, p_y) = sum_q B(p_y, q) * U(i, q),
  // staged through the shared slice with barriers around the write/read.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_y + k * P_1D] * data.slice[t_id_x + k * T_1D];  // Contract y direction
  }
  *V = sum;
}
66 
67 //------------------------------------------------------------------------------
68 // 2D transpose tensor contract x
69 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Transpose contraction along x: V(p_x, j) = sum_q B(p_x, q) * U(q, j),
  // staged through the shared slice with barriers around the write/read.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_x + k * P_1D] * data.slice[k + t_id_y * T_1D];  // Contract x direction
  }
  *V = sum;
}
83 
84 //------------------------------------------------------------------------------
85 // 2D transpose tensor contract and add x
86 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                        const CeedScalar *B, CeedScalar *V) {
  // Same as ContractTransposeX2dFlattened, but accumulates into V instead of overwriting it.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D) {
    CeedScalar sum = *V;
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_x + k * P_1D] * data.slice[k + t_id_y * T_1D];  // Contract x direction
    *V = sum;
  }
}
99 
100 //------------------------------------------------------------------------------
101 // 2D pack/unpack quadrature values
102 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  // Repack per-component values from the padded T_1D x T_1D thread layout into a
  // contiguous Q_1D x Q_1D layout, so the first Q_1D*Q_1D threads each hold one value
  // and the remaining threads hold 0.0. Used when Q_1D != T_1D.
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    // Write under the old (padded) layout
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
    __syncthreads();
    // Read back under the new (contiguous) layout; out-of-range threads get 0.0
    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
  }
}
114 
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  // Inverse of QPack2d: expand per-component values from the contiguous Q_1D x Q_1D
  // layout (first Q_1D*Q_1D threads) back into the padded T_1D x T_1D thread layout.
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    // Write under the old (contiguous) layout
    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
    __syncthreads();
    // Read back under the new (padded) layout; out-of-range threads get 0.0
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
  }
}
126 
127 //------------------------------------------------------------------------------
128 // 2D interpolate to quadrature points
129 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Interpolate nodal values r_U (one value per thread per component) to quadrature
  // values r_V via successive 1D contractions with the interpolation matrix c_B.
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register carrying the x-contraction result into the y-contraction

  // Expand packed input to the padded T_1D layout when needed
  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  __syncthreads();
  // Restore the packed layout of the (unmodified) input and pack the output
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
145 
146 //------------------------------------------------------------------------------
147 // 2D interpolate transpose
148 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Transpose interpolation: map quadrature values r_U back to nodal values r_V
  // via successive transpose 1D contractions with c_B.
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register between the y and x transpose contractions

  // Expand packed quadrature input to the padded T_1D layout when needed
  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  __syncthreads();
  // Pack the nodal output when needed
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
163 
164 //------------------------------------------------------------------------------
165 // 2D derivatives at quadrature points
166 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Evaluate derivatives at quadrature points: for each component, apply the
  // derivative matrix c_G in one direction and the interp matrix c_B in the other.
  // Output layout: r_V[comp + 0*NUM_COMP] = d/dx, r_V[comp + 1*NUM_COMP] = d/dy.
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register between the x and y contractions

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // d/dx: G in x, B in y
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    // d/dy: B in x, G in y
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
  __syncthreads();
  // Restore packed input layout; pack both derivative components of the output
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
184 
185 //------------------------------------------------------------------------------
186 // 2D derivatives transpose
187 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose gradient: accumulate the transpose-x and transpose-y derivative
  // contributions of r_U (layout: [comp + dim*NUM_COMP]) into nodal values r_V[comp].
  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register between the y and x transpose contractions

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Transpose of d/dx: B^T in y, G^T in x (overwrites r_V[comp])
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    // Transpose of d/dy: G^T in y, B^T in x (adds into r_V[comp])
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  __syncthreads();
  // Pack the nodal output when needed
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
204 
205 //------------------------------------------------------------------------------
206 // 2D quadrature weights
207 //------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Tensor-product quadrature weight for this thread's (x, y) point; threads past
  // the Q_1D x Q_1D grid get 0.0.
  const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;

  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    *w = q_weight_1d[t_id_x] * q_weight_1d[t_id_y];
  } else {
    *w = 0.0;
  }
}
214 
215 //------------------------------------------------------------------------------
216 // 3D
217 //------------------------------------------------------------------------------
218 
219 //------------------------------------------------------------------------------
220 // 3D tensor contract x
221 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Stage each thread's value into the shared slice, then contract along x:
  // V(q_x, j, l) = sum_i B(i, q_x) * U(i, j, l). Barriers guard the shared write/read.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt k = 0; k < P_1D; k++) sum += B[k + t_id_x * P_1D] * data.slice[k + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
  }
  *V = sum;
}
235 
236 //------------------------------------------------------------------------------
237 // 3D tensor contract y
238 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Stage each thread's value into the shared slice, then contract along y:
  // V(i, q_y, l) = sum_j B(j, q_y) * U(i, j, l). Barriers guard the shared write/read.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt k = 0; k < P_1D; k++) sum += B[k + t_id_y * P_1D] * data.slice[t_id_x + k * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
  }
  *V = sum;
}
252 
253 //------------------------------------------------------------------------------
254 // 3D tensor contract z
255 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Stage each thread's value into the shared slice, then contract along z:
  // V(i, j, q_z) = sum_l B(l, q_z) * U(i, j, l). Barriers guard the shared write/read.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
    for (CeedInt k = 0; k < P_1D; k++) sum += B[k + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + k * T_1D * T_1D];  // Contract z direction
  }
  *V = sum;
}
269 
270 //------------------------------------------------------------------------------
// 3D transpose tensor contract z
272 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Transpose contraction along z: V(i, j, p_z) = sum_q B(p_z, q) * U(i, j, q),
  // staged through the shared slice with barriers around the write/read.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_z + k * P_1D] * data.slice[t_id_x + t_id_y * T_1D + k * T_1D * T_1D];  // Contract z direction
  }
  *V = sum;
}
286 
287 //------------------------------------------------------------------------------
// 3D transpose tensor contract and add z
289 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Same as ContractTransposeZ3dFlattened, but accumulates into V instead of overwriting it.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    CeedScalar sum = *V;
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_z + k * P_1D] * data.slice[t_id_x + t_id_y * T_1D + k * T_1D * T_1D];  // Contract z direction
    *V = sum;
  }
}
302 
303 //------------------------------------------------------------------------------
304 // 3D transpose tensor contract y
305 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Transpose contraction along y: V(i, p_y, l) = sum_q B(p_y, q) * U(i, q, l),
  // staged through the shared slice with barriers around the write/read.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_y + k * P_1D] * data.slice[t_id_x + k * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
  }
  *V = sum;
}
319 
320 //------------------------------------------------------------------------------
// 3D transpose tensor contract and add y
322 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Same as ContractTransposeY3dFlattened, but accumulates into V instead of overwriting it.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    CeedScalar sum = *V;
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_y + k * P_1D] * data.slice[t_id_x + k * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    *V = sum;
  }
}
335 
336 //------------------------------------------------------------------------------
337 // 3D transpose tensor contract x
338 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Transpose contraction along x: V(p_x, j, l) = sum_q B(p_x, q) * U(q, j, l),
  // staged through the shared slice with barriers around the write/read.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_x + k * P_1D] * data.slice[k + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
  }
  *V = sum;
}
352 
353 //------------------------------------------------------------------------------
354 // 3D transpose tensor contract add x
355 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Same as ContractTransposeX3dFlattened, but accumulates into V instead of overwriting it.
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    CeedScalar sum = *V;
    for (CeedInt k = 0; k < Q_1D; k++) sum += B[t_id_x + k * P_1D] * data.slice[k + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    *V = sum;
  }
}
368 
369 //------------------------------------------------------------------------------
370 // 3D pack/unpack quadrature values
371 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  // Repack per-component values from the padded T_1D^3 thread layout into a
  // contiguous Q_1D^3 layout, so the first Q_1D^3 threads each hold one value
  // and the remaining threads hold 0.0. Used when Q_1D != T_1D.
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    // Write under the old (padded) layout.
    // NOTE(review): unlike the 2D version, t_id_z is not tested here; writes with
    // t_id_z >= Q_1D land in slice rows the guarded read below never touches — confirm intended.
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    // Read back under the new (contiguous) layout; out-of-range threads get 0.0
    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
  }
}
383 
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  // Inverse of QPack3d: expand per-component values from the contiguous Q_1D^3
  // layout (first Q_1D^3 threads) back into the padded T_1D^3 thread layout.
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    // Write under the old (contiguous) layout
    if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    // Read back under the new (padded) layout.
    // NOTE(review): the read guard does not test t_id_z; threads with t_id_z >= Q_1D
    // load unwritten slice entries, which downstream contractions never consume — confirm intended.
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
  }
}
395 
396 //------------------------------------------------------------------------------
397 // 3D interpolate to quadrature points
398 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Interpolate nodal values r_U to quadrature values r_V via successive
  // 1D contractions with the interpolation matrix c_B in x, y, then z.
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // scratch registers chaining the three contractions

  // Expand packed input to the padded T_1D layout when needed
  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  // Restore the packed layout of the (unmodified) input and pack the output
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
415 
416 //------------------------------------------------------------------------------
417 // 3D interpolate transpose
418 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Transpose interpolation: map quadrature values r_U back to nodal values r_V
  // via successive transpose 1D contractions with c_B in z, y, then x.
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // scratch registers chaining the three transpose contractions

  // Expand packed quadrature input to the padded T_1D layout when needed
  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  // Pack the nodal output when needed
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
434 
435 //------------------------------------------------------------------------------
436 // 3D derivatives at quadrature points
437 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Evaluate derivatives at quadrature points: for each component, apply the
  // derivative matrix c_G in one direction and the interp matrix c_B in the others.
  // Output layout: r_V[comp + dim*NUM_COMP] for dim = 0 (d/dx), 1 (d/dy), 2 (d/dz).
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // scratch registers chaining the three contractions

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // d/dx: G in x, B in y and z
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
    // d/dy: G in y, B in x and z
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
    // d/dz: G in z, B in x and y
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  __syncthreads();
  // Restore packed input layout; pack all three derivative components of the output
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
460 
461 //------------------------------------------------------------------------------
462 // 3D derivatives transpose
463 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose gradient: accumulate the transpose-x, -y, and -z derivative
  // contributions of r_U (layout: [comp + dim*NUM_COMP]) into nodal values r_V[comp].
  // Bug fix: the thread indices t_id_x/t_id_y/t_id_z were previously passed as
  // *template* arguments to the contraction helpers; non-type template arguments
  // must be compile-time constants and the helpers take only four template
  // parameters, so this could not compile. They are now passed as the function
  // arguments, matching every other *3dFlattened caller in this file.
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // scratch registers chaining the three transpose contractions

  // Expand packed quadrature input (three derivative components) when needed
  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Transpose of d/dx: B^T in z and y, G^T in x (overwrites r_V[comp])
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);
    // Transpose of d/dy: G^T in y, B^T in z and x (adds into r_V[comp])
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
    // Transpose of d/dz: G^T in z, B^T in y and x (adds into r_V[comp])
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  // Pack the nodal output when needed
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
485 
486 //------------------------------------------------------------------------------
487 // 3D derivatives at quadrature points
488 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Collocated gradient: first interpolate r_U to quadrature points with c_B,
  // then apply the collocated derivative matrix c_G (Q_1D x Q_1D, hence the
  // Q_1D, Q_1D template arguments) independently in each direction.
  // Output layout: r_V[comp + dim*NUM_COMP] for dim = 0 (d/dx), 1 (d/dy), 2 (d/dz).
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // scratch registers; r_t1 ends up holding the interpolated value

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Interpolate to quadrature points: B in x, y, z (result in r_t1)
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    // Collocated derivatives from the interpolated values
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  __syncthreads();
  // Restore packed input layout; pack all three derivative components of the output
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
508 
509 //------------------------------------------------------------------------------
510 // 3D derivatives transpose
511 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose collocated gradient: accumulate the transpose collocated derivatives
  // G^T of the three components of r_U (layout: [comp + dim*NUM_COMP]) into one
  // quadrature-point value, then apply the transpose interpolation B^T in z, y, x
  // to produce nodal values r_V[comp]. The Q_1D, Q_1D template arguments select
  // the collocated (Q_1D x Q_1D) guards for the G^T contractions.
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];  // r_t2 accumulates the summed derivative contributions

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // r_t2 = G_z^T U_z + G_y^T U_y + G_x^T U_x (add variants accumulate into r_t2)
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
    // Transpose interpolation back to nodes: B^T in z, y, x
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  // Pack the nodal output when needed
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
530 
531 //------------------------------------------------------------------------------
532 // 3D quadrature weights
533 //------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Tensor-product quadrature weight for this thread's (x, y, z) point; threads
  // past the Q_1D^3 grid get 0.0.
  const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);

  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
    *w = q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z];
  } else {
    *w = 0.0;
  }
}
540