xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h (revision 02219a082eb38cf2d3edc97fbfe55fa395a4dc99)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D>
20 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21   __syncthreads();
22   data.slice[data.t_id_x] = *U;
23   __syncthreads();
24   *V = 0.0;
25   if (data.t_id_x < Q_1D) {
26     for (CeedInt i = 0; i < P_1D; i++) {
27       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
28     }
29   }
30 }
31 
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
35 template <int NUM_COMP, int P_1D, int Q_1D>
36 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37   __syncthreads();
38   data.slice[data.t_id_x] = *U;
39   __syncthreads();
40   *V = 0.0;
41   if (data.t_id_x < P_1D) {
42     for (CeedInt i = 0; i < Q_1D; i++) {
43       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
44     }
45   }
46 }
47 
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
52 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
53   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
55   }
56 }
57 
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
62 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
63                                          CeedScalar *__restrict__ r_V) {
64   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
66   }
67 }
68 
69 //------------------------------------------------------------------------------
70 // 1D derivatives at quadrature points
71 //------------------------------------------------------------------------------
72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
73 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
74                               CeedScalar *__restrict__ r_V) {
75   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
77   }
78 }
79 
80 //------------------------------------------------------------------------------
81 // 1D derivatives transpose
82 //------------------------------------------------------------------------------
83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
84 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
85                                        CeedScalar *__restrict__ r_V) {
86   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // 1D quadrature weights
93 //------------------------------------------------------------------------------
94 template <int P_1D, int Q_1D>
95 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
96   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
97 }
98 
99 //------------------------------------------------------------------------------
100 // 2D
101 //------------------------------------------------------------------------------
102 
103 //------------------------------------------------------------------------------
104 // 2D tensor contraction x
105 //------------------------------------------------------------------------------
106 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
107 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
108   __syncthreads();
109   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
110   __syncthreads();
111   *V = 0.0;
112   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
113     for (CeedInt i = 0; i < P_1D; i++) {
114       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
115     }
116   }
117 }
118 
119 //------------------------------------------------------------------------------
120 // 2D tensor contract y
121 //------------------------------------------------------------------------------
122 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
123 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
124   __syncthreads();
125   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
126   __syncthreads();
127   *V = 0.0;
128   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
129     for (CeedInt i = 0; i < P_1D; i++) {
130       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
131     }
132   }
133 }
134 
135 //------------------------------------------------------------------------------
136 // 2D transpose tensor contract y
137 //------------------------------------------------------------------------------
138 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
139 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
140   __syncthreads();
141   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
142   __syncthreads();
143   *V = 0.0;
144   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
145     for (CeedInt i = 0; i < Q_1D; i++) {
146       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
147     }
148   }
149 }
150 
151 //------------------------------------------------------------------------------
152 // 2D transpose tensor contract x
153 //------------------------------------------------------------------------------
154 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
155 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
156   __syncthreads();
157   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
158   __syncthreads();
159   *V = 0.0;
160   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
161     for (CeedInt i = 0; i < Q_1D; i++) {
162       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
163     }
164   }
165 }
166 
167 //------------------------------------------------------------------------------
168 // 2D transpose tensor contract and add x
169 //------------------------------------------------------------------------------
170 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
171 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
172   __syncthreads();
173   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
174   __syncthreads();
175   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
176     for (CeedInt i = 0; i < Q_1D; i++) {
177       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
178     }
179   }
180 }
181 
182 //------------------------------------------------------------------------------
183 // 2D interpolate to quadrature points
184 //------------------------------------------------------------------------------
185 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
186 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
187   CeedScalar r_t[1];
188   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
189     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
190     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
191   }
192 }
193 
194 //------------------------------------------------------------------------------
195 // 2D interpolate transpose
196 //------------------------------------------------------------------------------
197 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
198 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
199                                                CeedScalar *__restrict__ r_V) {
200   CeedScalar r_t[1];
201   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
202     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
203     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
204   }
205 }
206 
207 //------------------------------------------------------------------------------
208 // 2D derivatives at quadrature points
209 //------------------------------------------------------------------------------
210 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
211 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
212                                     CeedScalar *__restrict__ r_V) {
213   CeedScalar r_t[1];
214   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
215     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
216     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
217     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
218     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
219   }
220 }
221 
222 //------------------------------------------------------------------------------
223 // 2D derivatives transpose
224 //------------------------------------------------------------------------------
225 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
226 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
227                                              CeedScalar *__restrict__ r_V) {
228   CeedScalar r_t[1];
229   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
230     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
231     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
232     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
233     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
234   }
235 }
236 
237 //------------------------------------------------------------------------------
238 // 2D derivatives at quadrature points, nodes and quadrature points collocated
239 //------------------------------------------------------------------------------
240 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
241 inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
242                                                    CeedScalar *__restrict__ r_V) {
243   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
244     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
245     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
246   }
247 }
248 
249 //------------------------------------------------------------------------------
250 // 2D derivatives transpose, nodes and quadrature points collocated
251 //------------------------------------------------------------------------------
252 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
253 inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
254                                                             CeedScalar *__restrict__ r_V) {
255   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
256     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
257     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
258   }
259 }
260 
261 //------------------------------------------------------------------------------
262 // 2D quadrature weights
263 //------------------------------------------------------------------------------
264 template <int P_1D, int Q_1D>
265 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
266   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
267 }
268 
269 //------------------------------------------------------------------------------
270 // 3D
271 //------------------------------------------------------------------------------
272 
273 //------------------------------------------------------------------------------
274 // 3D tensor contract x
275 //------------------------------------------------------------------------------
276 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
277 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
278   CeedScalar r_B[P_1D];
279   for (CeedInt i = 0; i < P_1D; i++) {
280     r_B[i] = B[i + data.t_id_x * P_1D];
281   }
282 
283   for (CeedInt k = 0; k < P_1D; k++) {
284     __syncthreads();
285     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
286     __syncthreads();
287     V[k] = 0.0;
288     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
289       for (CeedInt i = 0; i < P_1D; i++) {
290         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
291       }
292     }
293   }
294 }
295 
296 //------------------------------------------------------------------------------
297 // 3D tensor contract y
298 //------------------------------------------------------------------------------
299 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
300 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
301   CeedScalar r_B[P_1D];
302   for (CeedInt i = 0; i < P_1D; i++) {
303     r_B[i] = B[i + data.t_id_y * P_1D];
304   }
305 
306   for (CeedInt k = 0; k < P_1D; k++) {
307     __syncthreads();
308     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
309     __syncthreads();
310     V[k] = 0.0;
311     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
312       for (CeedInt i = 0; i < P_1D; i++) {
313         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
314       }
315     }
316   }
317 }
318 
319 //------------------------------------------------------------------------------
320 // 3D tensor contract z
321 //------------------------------------------------------------------------------
322 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
323 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
324   for (CeedInt k = 0; k < Q_1D; k++) {
325     V[k] = 0.0;
326     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
327       for (CeedInt i = 0; i < P_1D; i++) {
328         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
329       }
330     }
331   }
332 }
333 
334 //------------------------------------------------------------------------------
335 // 3D transpose tensor contract z
336 //------------------------------------------------------------------------------
337 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
338 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
339   for (CeedInt k = 0; k < P_1D; k++) {
340     V[k] = 0.0;
341     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
342       for (CeedInt i = 0; i < Q_1D; i++) {
343         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
344       }
345     }
346   }
347 }
348 
349 //------------------------------------------------------------------------------
350 // 3D transpose tensor contract y
351 //------------------------------------------------------------------------------
352 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
353 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
354   CeedScalar r_B[Q_1D];
355   for (CeedInt i = 0; i < Q_1D; i++) {
356     r_B[i] = B[data.t_id_y + i * P_1D];
357   }
358 
359   for (CeedInt k = 0; k < P_1D; k++) {
360     __syncthreads();
361     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
362     __syncthreads();
363     V[k] = 0.0;
364     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
365       for (CeedInt i = 0; i < Q_1D; i++) {
366         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
367       }
368     }
369   }
370 }
371 
372 //------------------------------------------------------------------------------
373 // 3D transpose tensor contract y
374 //------------------------------------------------------------------------------
375 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
376 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
377   CeedScalar r_B[Q_1D];
378   for (CeedInt i = 0; i < Q_1D; i++) {
379     r_B[i] = B[data.t_id_y + i * P_1D];
380   }
381 
382   for (CeedInt k = 0; k < P_1D; k++) {
383     __syncthreads();
384     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
385     __syncthreads();
386     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
387       for (CeedInt i = 0; i < Q_1D; i++) {
388         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
389       }
390     }
391   }
392 }
393 
394 //------------------------------------------------------------------------------
395 // 3D transpose tensor contract x
396 //------------------------------------------------------------------------------
397 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
398 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
399   CeedScalar r_B[Q_1D];
400   for (CeedInt i = 0; i < Q_1D; i++) {
401     r_B[i] = B[data.t_id_x + i * P_1D];
402   }
403 
404   for (CeedInt k = 0; k < P_1D; k++) {
405     __syncthreads();
406     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
407     __syncthreads();
408     V[k] = 0.0;
409     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
410       for (CeedInt i = 0; i < Q_1D; i++) {
411         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
412       }
413     }
414   }
415 }
416 
417 //------------------------------------------------------------------------------
418 // 3D transpose tensor contract add x
419 //------------------------------------------------------------------------------
420 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
421 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
422   CeedScalar r_B[Q_1D];
423   for (CeedInt i = 0; i < Q_1D; i++) {
424     r_B[i] = B[data.t_id_x + i * P_1D];
425   }
426 
427   for (CeedInt k = 0; k < P_1D; k++) {
428     __syncthreads();
429     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
430     __syncthreads();
431     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
432       for (CeedInt i = 0; i < Q_1D; i++) {
433         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
434       }
435     }
436   }
437 }
438 
439 //------------------------------------------------------------------------------
440 // 3D interpolate to quadrature points
441 //------------------------------------------------------------------------------
442 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
443 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
444   CeedScalar r_t1[T_1D];
445   CeedScalar r_t2[T_1D];
446   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
447     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
448     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
449     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
450   }
451 }
452 
453 //------------------------------------------------------------------------------
454 // 3D interpolate transpose
455 //------------------------------------------------------------------------------
456 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
457 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
458                                                CeedScalar *__restrict__ r_V) {
459   CeedScalar r_t1[T_1D];
460   CeedScalar r_t2[T_1D];
461   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
462     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
463     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
464     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
465   }
466 }
467 
468 //------------------------------------------------------------------------------
469 // 3D derivatives at quadrature points
470 //------------------------------------------------------------------------------
471 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
472 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
473                                     CeedScalar *__restrict__ r_V) {
474   CeedScalar r_t1[T_1D];
475   CeedScalar r_t2[T_1D];
476   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
477     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
478     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
479     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
480     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
481     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
482     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
483     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
484     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
485     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
486   }
487 }
488 
489 //------------------------------------------------------------------------------
490 // 3D derivatives transpose
491 //------------------------------------------------------------------------------
492 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
493 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
494                                              CeedScalar *__restrict__ r_V) {
495   CeedScalar r_t1[T_1D];
496   CeedScalar r_t2[T_1D];
497   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
498     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
499     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
500     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
501     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
502     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
503     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
504     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
505     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
506     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
507   }
508 }
509 
510 //------------------------------------------------------------------------------
511 // 3D derivatives at quadrature points
512 //------------------------------------------------------------------------------
513 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
514 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
515                                               CeedScalar *__restrict__ r_V) {
516   CeedScalar r_t1[T_1D];
517   CeedScalar r_t2[T_1D];
518   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
519     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
520     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
521     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
522     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
523     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
524     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
525   }
526 }
527 
528 //------------------------------------------------------------------------------
529 // 3D derivatives transpose
530 //------------------------------------------------------------------------------
531 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
532 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
533                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
534   CeedScalar r_t1[T_1D];
535   CeedScalar r_t2[T_1D];
536   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
537     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
538     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
539     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
540     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
541     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
542     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
543   }
544 }
545 
546 //------------------------------------------------------------------------------
547 // 3D derivatives at quadrature points, nodes and quadrature points collocated
548 //------------------------------------------------------------------------------
549 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
550 inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
551                                                    CeedScalar *__restrict__ r_V) {
552   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
553     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
554     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
555     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
556   }
557 }
558 
559 //------------------------------------------------------------------------------
560 // 3D derivatives transpose, nodes and quadrature points collocated
561 //------------------------------------------------------------------------------
562 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
563 inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
564                                                             CeedScalar *__restrict__ r_V) {
565   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
566     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
567     ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
568     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
569   }
570 }
571 
572 //------------------------------------------------------------------------------
573 // 3D quadrature weights
574 //------------------------------------------------------------------------------
575 template <int P_1D, int Q_1D>
576 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
577   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
578   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
579   for (CeedInt q = 0; q < Q_1D; q++) {
580     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
581   }
582 }
583