xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h (revision bdcc27286a8034df1dd97bd8aefef85a0efa7b00)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 2D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 2D tensor contraction x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
20 inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
21                                             CeedScalar *V) {
22   __syncthreads();
23   data.slice[t_id_x + t_id_y * T_1D] = *U;
24   __syncthreads();
25   *V = 0.0;
26   if (t_id_x < Q_1D && t_id_y < P_1D) {
27     for (CeedInt i = 0; i < P_1D; i++) {
28       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
29     }
30   }
31 }
32 
33 //------------------------------------------------------------------------------
34 // 2D tensor contract y
35 //------------------------------------------------------------------------------
36 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
37 inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
38                                             CeedScalar *V) {
39   __syncthreads();
40   data.slice[t_id_x + t_id_y * T_1D] = *U;
41   __syncthreads();
42   *V = 0.0;
43   if (t_id_x < Q_1D && t_id_y < Q_1D) {
44     for (CeedInt i = 0; i < P_1D; i++) {
45       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
46     }
47   }
48 }
49 
50 //------------------------------------------------------------------------------
51 // 2D transpose tensor contract y
52 //------------------------------------------------------------------------------
53 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
54 inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
55                                                      const CeedScalar *B, CeedScalar *V) {
56   __syncthreads();
57   data.slice[t_id_x + t_id_y * T_1D] = *U;
58   __syncthreads();
59   *V = 0.0;
60   if (t_id_x < Q_1D && t_id_y < P_1D) {
61     for (CeedInt i = 0; i < Q_1D; i++) {
62       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
63     }
64   }
65 }
66 
67 //------------------------------------------------------------------------------
68 // 2D transpose tensor contract x
69 //------------------------------------------------------------------------------
70 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
71 inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
72                                                      const CeedScalar *B, CeedScalar *V) {
73   __syncthreads();
74   data.slice[t_id_x + t_id_y * T_1D] = *U;
75   __syncthreads();
76   *V = 0.0;
77   if (t_id_x < P_1D && t_id_y < P_1D) {
78     for (CeedInt i = 0; i < Q_1D; i++) {
79       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
80     }
81   }
82 }
83 
84 //------------------------------------------------------------------------------
85 // 2D transpose tensor contract and add x
86 //------------------------------------------------------------------------------
87 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
88 inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
89                                                         const CeedScalar *B, CeedScalar *V) {
90   __syncthreads();
91   data.slice[t_id_x + t_id_y * T_1D] = *U;
92   __syncthreads();
93   if (t_id_x < P_1D && t_id_y < P_1D) {
94     for (CeedInt i = 0; i < Q_1D; i++) {
95       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
96     }
97   }
98 }
99 
100 //------------------------------------------------------------------------------
101 // 2D pack/unpack quadrature values
102 //------------------------------------------------------------------------------
103 template <int NUM_COMP, int Q_1D, int T_1D>
104 inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
105   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
106 
107   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108     __syncthreads();
109     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
110     __syncthreads();
111     U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
112   }
113 }
114 
115 template <int NUM_COMP, int Q_1D, int T_1D>
116 inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
117   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
118 
119   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120     __syncthreads();
121     if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
122     __syncthreads();
123     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
124   }
125 }
126 
127 //------------------------------------------------------------------------------
128 // 2D interpolate to quadrature points
129 //------------------------------------------------------------------------------
130 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
131 inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
132                                                CeedScalar *__restrict__ r_V) {
133   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
134   CeedScalar r_t[1];
135 
136   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
137   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
139     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
140   }
141   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
142   if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
143 }
144 
145 //------------------------------------------------------------------------------
146 // 2D interpolate transpose
147 //------------------------------------------------------------------------------
148 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
149 inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
150                                                         CeedScalar *__restrict__ r_V) {
151   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
152   CeedScalar r_t[1];
153 
154   if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
155   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
156     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
157     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
158   }
159   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
160 }
161 
162 //------------------------------------------------------------------------------
163 // 2D derivatives at quadrature points
164 //------------------------------------------------------------------------------
165 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
166 inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
167                                              CeedScalar *__restrict__ r_V) {
168   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
169   CeedScalar r_t[1];
170 
171   if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
172   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
173     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
174     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
175     ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
176     ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
177   }
178   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
179   if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
180 }
181 
182 //------------------------------------------------------------------------------
183 // 2D derivatives transpose
184 //------------------------------------------------------------------------------
185 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
186 inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
187                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
188   const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
189   CeedScalar r_t[1];
190 
191   if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
192   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
193     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
194     ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
195     ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
196     ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
197   }
198   if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
199 }
200 
201 //------------------------------------------------------------------------------
202 // 2D quadrature weights
203 //------------------------------------------------------------------------------
204 template <int P_1D, int Q_1D>
205 inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
206   const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
207 
208   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
209 }
210 
211 //------------------------------------------------------------------------------
212 // 3D
213 //------------------------------------------------------------------------------
214 
215 //------------------------------------------------------------------------------
216 // 3D tensor contract x
217 //------------------------------------------------------------------------------
218 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
219 inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
220                                             const CeedScalar *B, CeedScalar *V) {
221   __syncthreads();
222   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
223   __syncthreads();
224   *V = 0.0;
225   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
226     for (CeedInt i = 0; i < P_1D; i++) {
227       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
228     }
229   }
230 }
231 
232 //------------------------------------------------------------------------------
233 // 3D tensor contract y
234 //------------------------------------------------------------------------------
235 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
236 inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
237                                             const CeedScalar *B, CeedScalar *V) {
238   __syncthreads();
239   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
240   __syncthreads();
241   *V = 0.0;
242   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
243     for (CeedInt i = 0; i < P_1D; i++) {
244       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
245     }
246   }
247 }
248 
249 //------------------------------------------------------------------------------
250 // 3D tensor contract z
251 //------------------------------------------------------------------------------
252 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
253 inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
254                                             const CeedScalar *B, CeedScalar *V) {
255   __syncthreads();
256   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
257   __syncthreads();
258   *V = 0.0;
259   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
260     for (CeedInt i = 0; i < P_1D; i++) {
261       *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
262     }
263   }
264 }
265 
266 //------------------------------------------------------------------------------
267 // 3D tensor contract z
268 //------------------------------------------------------------------------------
269 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
270 inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
271                                                      const CeedScalar *B, CeedScalar *V) {
272   __syncthreads();
273   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
274   __syncthreads();
275   *V = 0.0;
276   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
277     for (CeedInt i = 0; i < Q_1D; i++) {
278       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
279     }
280   }
281 }
282 
283 //------------------------------------------------------------------------------
284 // 3D transpose tensor contract z
285 //------------------------------------------------------------------------------
286 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
287 inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
288                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
289   __syncthreads();
290   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
291   __syncthreads();
292   if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
293     for (CeedInt i = 0; i < Q_1D; i++) {
294       *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
295     }
296   }
297 }
298 
299 //------------------------------------------------------------------------------
300 // 3D transpose tensor contract y
301 //------------------------------------------------------------------------------
302 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
303 inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
304                                                      const CeedScalar *B, CeedScalar *V) {
305   __syncthreads();
306   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
307   __syncthreads();
308   *V = 0.0;
309   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
310     for (CeedInt i = 0; i < Q_1D; i++) {
311       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
312     }
313   }
314 }
315 
316 //------------------------------------------------------------------------------
317 // 3D transpose tensor contract y
318 //------------------------------------------------------------------------------
319 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
320 inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
321                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
322   __syncthreads();
323   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
324   __syncthreads();
325   if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
326     for (CeedInt i = 0; i < Q_1D; i++) {
327       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
328     }
329   }
330 }
331 
332 //------------------------------------------------------------------------------
333 // 3D transpose tensor contract x
334 //------------------------------------------------------------------------------
335 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
336 inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
337                                                      const CeedScalar *B, CeedScalar *V) {
338   __syncthreads();
339   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
340   __syncthreads();
341   *V = 0.0;
342   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
343     for (CeedInt i = 0; i < Q_1D; i++) {
344       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
345     }
346   }
347 }
348 
349 //------------------------------------------------------------------------------
350 // 3D transpose tensor contract add x
351 //------------------------------------------------------------------------------
352 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
353 inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
354                                                         const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
355   __syncthreads();
356   data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
357   __syncthreads();
358   if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
359     for (CeedInt i = 0; i < Q_1D; i++) {
360       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
361     }
362   }
363 }
364 
365 //------------------------------------------------------------------------------
366 // 3D pack/unpack quadrature values
367 //------------------------------------------------------------------------------
368 template <int NUM_COMP, int Q_1D, int T_1D>
369 inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
370   const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
371 
372   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
373     __syncthreads();
374     if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
375     __syncthreads();
376     U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
377   }
378 }
379 
380 template <int NUM_COMP, int Q_1D, int T_1D>
381 inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
382   const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
383 
384   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
385     __syncthreads();
386     if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
387     __syncthreads();
388     U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
389   }
390 }
391 
392 //------------------------------------------------------------------------------
393 // 3D interpolate to quadrature points
394 //------------------------------------------------------------------------------
395 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
396 inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
397                                                CeedScalar *__restrict__ r_V) {
398   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
399   CeedScalar    r_t1[1], r_t2[1];
400 
401   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
402   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
403     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
404     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
405     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
406   }
407   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
408   if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
409 }
410 
411 //------------------------------------------------------------------------------
412 // 3D interpolate transpose
413 //------------------------------------------------------------------------------
414 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
415 inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
416                                                         CeedScalar *__restrict__ r_V) {
417   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
418   CeedScalar    r_t1[1], r_t2[1];
419 
420   if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
421   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
422     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
423     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
424     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
425   }
426   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
427 }
428 
429 //------------------------------------------------------------------------------
430 // 3D derivatives at quadrature points
431 //------------------------------------------------------------------------------
432 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
433 inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
434                                              CeedScalar *__restrict__ r_V) {
435   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
436   CeedScalar    r_t1[1], r_t2[1];
437 
438   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
439   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
440     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
441     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
442     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
443     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
444     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
445     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
446     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
447     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
448     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
449   }
450   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
451   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
452 }
453 
454 //------------------------------------------------------------------------------
455 // 3D derivatives transpose
456 //------------------------------------------------------------------------------
457 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
458 inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
459                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
460   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
461   CeedScalar    r_t1[1], r_t2[1];
462 
463   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
464   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
465     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
466     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
467     ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
468     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
469     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
470     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
471     ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
472     ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
473     ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
474   }
475   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
476 }
477 
478 //------------------------------------------------------------------------------
479 // 3D derivatives at quadrature points
480 //------------------------------------------------------------------------------
481 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
482 inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
483                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
484   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
485   CeedScalar    r_t1[1], r_t2[1];
486 
487   if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
488   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
489     ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
490     ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
491     ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
492     ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
493     ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
494     ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
495   }
496   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
497   if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
498 }
499 
500 //------------------------------------------------------------------------------
501 // 3D derivatives transpose
502 //------------------------------------------------------------------------------
503 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
504 inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
505                                                                 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
506   const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
507   CeedScalar    r_t1[1], r_t2[1];
508 
509   if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
510   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
511     ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
512     ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
513     ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
514     ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
515     ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
516     ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
517   }
518   if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
519 }
520 
521 //------------------------------------------------------------------------------
522 // 3D quadrature weights
523 //------------------------------------------------------------------------------
524 template <int P_1D, int Q_1D>
525 inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
526   const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
527 
528   *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
529 }
530