xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h (revision f8fffaefe2b0bf336e4f45347da405b61e8c02d8)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D>
20 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21   data.slice[data.t_id_x] = *U;
22   __syncthreads();
23   *V = 0.0;
24   if (data.t_id_x < Q_1D) {
25     for (CeedInt i = 0; i < P_1D; i++) {
26       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
27     }
28   }
29   __syncthreads();
30 }
31 
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
35 template <int NUM_COMP, int P_1D, int Q_1D>
36 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37   data.slice[data.t_id_x] = *U;
38   __syncthreads();
39   *V = 0.0;
40   if (data.t_id_x < P_1D) {
41     for (CeedInt i = 0; i < Q_1D; i++) {
42       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
43     }
44   }
45   __syncthreads();
46 }
47 
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
52 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
53   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
55   }
56 }
57 
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
62 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
63                                          CeedScalar *__restrict__ r_V) {
64   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
66   }
67 }
68 
69 //------------------------------------------------------------------------------
70 // 1D derivatives at quadrature points
71 //------------------------------------------------------------------------------
72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
73 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
74                               CeedScalar *__restrict__ r_V) {
75   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
77   }
78 }
79 
80 //------------------------------------------------------------------------------
81 // 1D derivatives transpose
82 //------------------------------------------------------------------------------
83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
84 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
85                                        CeedScalar *__restrict__ r_V) {
86   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // 1D quadrature weights
93 //------------------------------------------------------------------------------
94 template <int P_1D, int Q_1D>
95 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
96   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
97 }
98 
99 //------------------------------------------------------------------------------
100 // 2D
101 //------------------------------------------------------------------------------
102 
103 //------------------------------------------------------------------------------
104 // 2D tensor contraction x
105 //------------------------------------------------------------------------------
106 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
107 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
108   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
109   __syncthreads();
110   *V = 0.0;
111   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
112     for (CeedInt i = 0; i < P_1D; i++) {
113       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
114     }
115   }
116   __syncthreads();
117 }
118 
119 //------------------------------------------------------------------------------
120 // 2D tensor contract y
121 //------------------------------------------------------------------------------
122 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
123 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
124   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
125   __syncthreads();
126   *V = 0.0;
127   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
128     for (CeedInt i = 0; i < P_1D; i++) {
129       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
130     }
131   }
132   __syncthreads();
133 }
134 
135 //------------------------------------------------------------------------------
136 // 2D transpose tensor contract y
137 //------------------------------------------------------------------------------
138 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
139 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
140   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
141   __syncthreads();
142   *V = 0.0;
143   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
144     for (CeedInt i = 0; i < Q_1D; i++) {
145       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
146     }
147   }
148   __syncthreads();
149 }
150 
151 //------------------------------------------------------------------------------
152 // 2D transpose tensor contract x
153 //------------------------------------------------------------------------------
154 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
155 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
156   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
157   __syncthreads();
158   *V = 0.0;
159   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
160     for (CeedInt i = 0; i < Q_1D; i++) {
161       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
162     }
163   }
164   __syncthreads();
165 }
166 
167 //------------------------------------------------------------------------------
168 // 2D transpose tensor contract and add x
169 //------------------------------------------------------------------------------
170 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
171 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
172   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
173   __syncthreads();
174   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
175     for (CeedInt i = 0; i < Q_1D; i++) {
176       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
177     }
178   }
179   __syncthreads();
180 }
181 
182 //------------------------------------------------------------------------------
183 // 2D interpolate to quadrature points
184 //------------------------------------------------------------------------------
185 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
186 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
187   CeedScalar r_t[1];
188   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
189     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
190     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
191   }
192 }
193 
194 //------------------------------------------------------------------------------
195 // 2D interpolate transpose
196 //------------------------------------------------------------------------------
197 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
198 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
199                                                CeedScalar *__restrict__ r_V) {
200   CeedScalar r_t[1];
201   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
202     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
203     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
204   }
205 }
206 
207 //------------------------------------------------------------------------------
208 // 2D derivatives at quadrature points
209 //------------------------------------------------------------------------------
210 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
211 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
212                                     CeedScalar *__restrict__ r_V) {
213   CeedScalar r_t[1];
214   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
215     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
216     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
217     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
218     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
219   }
220 }
221 
222 //------------------------------------------------------------------------------
223 // 2D derivatives transpose
224 //------------------------------------------------------------------------------
225 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
226 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
227                                              CeedScalar *__restrict__ r_V) {
228   CeedScalar r_t[1];
229   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
230     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
231     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
232     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
233     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
234   }
235 }
236 
237 //------------------------------------------------------------------------------
238 // 2D quadrature weights
239 //------------------------------------------------------------------------------
240 template <int P_1D, int Q_1D>
241 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
242   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
243 }
244 
245 //------------------------------------------------------------------------------
246 // 3D
247 //------------------------------------------------------------------------------
248 
249 //------------------------------------------------------------------------------
250 // 3D tensor contract x
251 //------------------------------------------------------------------------------
252 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
253 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
254   CeedScalar r_B[P_1D];
255   for (CeedInt i = 0; i < P_1D; i++) {
256     r_B[i] = B[i + data.t_id_x * P_1D];
257   }
258 
259   for (CeedInt k = 0; k < P_1D; k++) {
260     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
261     __syncthreads();
262     V[k] = 0.0;
263     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
264       for (CeedInt i = 0; i < P_1D; i++) {
265         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
266       }
267     }
268     __syncthreads();
269   }
270 }
271 
272 //------------------------------------------------------------------------------
273 // 3D tensor contract y
274 //------------------------------------------------------------------------------
275 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
276 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
277   CeedScalar r_B[P_1D];
278   for (CeedInt i = 0; i < P_1D; i++) {
279     r_B[i] = B[i + data.t_id_y * P_1D];
280   }
281 
282   for (CeedInt k = 0; k < P_1D; k++) {
283     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
284     __syncthreads();
285     V[k] = 0.0;
286     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
287       for (CeedInt i = 0; i < P_1D; i++) {
288         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
289       }
290     }
291     __syncthreads();
292   }
293 }
294 
295 //------------------------------------------------------------------------------
296 // 3D tensor contract z
297 //------------------------------------------------------------------------------
298 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
299 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
300   for (CeedInt k = 0; k < Q_1D; k++) {
301     V[k] = 0.0;
302     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
303       for (CeedInt i = 0; i < P_1D; i++) {
304         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
305       }
306     }
307   }
308 }
309 
310 //------------------------------------------------------------------------------
311 // 3D transpose tensor contract z
312 //------------------------------------------------------------------------------
313 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
314 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
315   for (CeedInt k = 0; k < P_1D; k++) {
316     V[k] = 0.0;
317     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
318       for (CeedInt i = 0; i < Q_1D; i++) {
319         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
320       }
321     }
322   }
323 }
324 
325 //------------------------------------------------------------------------------
326 // 3D transpose tensor contract y
327 //------------------------------------------------------------------------------
328 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
329 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
330   CeedScalar r_B[Q_1D];
331   for (CeedInt i = 0; i < Q_1D; i++) {
332     r_B[i] = B[data.t_id_y + i * P_1D];
333   }
334 
335   for (CeedInt k = 0; k < P_1D; k++) {
336     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
337     __syncthreads();
338     V[k] = 0.0;
339     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
340       for (CeedInt i = 0; i < Q_1D; i++) {
341         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
342       }
343     }
344     __syncthreads();
345   }
346 }
347 
348 //------------------------------------------------------------------------------
349 // 3D transpose tensor contract y
350 //------------------------------------------------------------------------------
351 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
352 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
353   CeedScalar r_B[Q_1D];
354   for (CeedInt i = 0; i < Q_1D; i++) {
355     r_B[i] = B[data.t_id_y + i * P_1D];
356   }
357 
358   for (CeedInt k = 0; k < P_1D; k++) {
359     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
360     __syncthreads();
361     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
362       for (CeedInt i = 0; i < Q_1D; i++) {
363         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
364       }
365     }
366     __syncthreads();
367   }
368 }
369 
370 //------------------------------------------------------------------------------
371 // 3D transpose tensor contract x
372 //------------------------------------------------------------------------------
373 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
374 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
375   CeedScalar r_B[Q_1D];
376   for (CeedInt i = 0; i < Q_1D; i++) {
377     r_B[i] = B[data.t_id_x + i * P_1D];
378   }
379 
380   for (CeedInt k = 0; k < P_1D; k++) {
381     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
382     __syncthreads();
383     V[k] = 0.0;
384     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
385       for (CeedInt i = 0; i < Q_1D; i++) {
386         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
387       }
388     }
389     __syncthreads();
390   }
391 }
392 
393 //------------------------------------------------------------------------------
394 // 3D transpose tensor contract add x
395 //------------------------------------------------------------------------------
396 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
397 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
398   CeedScalar r_B[Q_1D];
399   for (CeedInt i = 0; i < Q_1D; i++) {
400     r_B[i] = B[data.t_id_x + i * P_1D];
401   }
402 
403   for (CeedInt k = 0; k < P_1D; k++) {
404     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
405     __syncthreads();
406     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
407       for (CeedInt i = 0; i < Q_1D; i++) {
408         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
409       }
410     }
411     __syncthreads();
412   }
413 }
414 
415 //------------------------------------------------------------------------------
416 // 3D interpolate to quadrature points
417 //------------------------------------------------------------------------------
418 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
419 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
420   CeedScalar r_t1[T_1D];
421   CeedScalar r_t2[T_1D];
422   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
423     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
424     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
425     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
426   }
427 }
428 
429 //------------------------------------------------------------------------------
430 // 3D interpolate transpose
431 //------------------------------------------------------------------------------
432 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
433 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
434                                                CeedScalar *__restrict__ r_V) {
435   CeedScalar r_t1[T_1D];
436   CeedScalar r_t2[T_1D];
437   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
438     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
439     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
440     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
441   }
442 }
443 
444 //------------------------------------------------------------------------------
445 // 3D derivatives at quadrature points
446 //------------------------------------------------------------------------------
447 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
448 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
449                                     CeedScalar *__restrict__ r_V) {
450   CeedScalar r_t1[T_1D];
451   CeedScalar r_t2[T_1D];
452   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
453     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
454     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
455     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
456     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
457     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
458     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
459     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
460     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
461     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
462   }
463 }
464 
465 //------------------------------------------------------------------------------
466 // 3D derivatives transpose
467 //------------------------------------------------------------------------------
468 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
469 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
470                                              CeedScalar *__restrict__ r_V) {
471   CeedScalar r_t1[T_1D];
472   CeedScalar r_t2[T_1D];
473   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
474     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
475     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
476     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
477     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
478     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
479     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
480     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
481     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
482     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
483   }
484 }
485 
486 //------------------------------------------------------------------------------
487 // 3D derivatives at quadrature points
488 //------------------------------------------------------------------------------
489 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
490 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
491                                               CeedScalar *__restrict__ r_V) {
492   CeedScalar r_t1[T_1D];
493   CeedScalar r_t2[T_1D];
494   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
495     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
496     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
497     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
498     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
499     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
500     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
501   }
502 }
503 
504 //------------------------------------------------------------------------------
505 // 3D derivatives transpose
506 //------------------------------------------------------------------------------
507 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
508 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
509                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
510   CeedScalar r_t1[T_1D];
511   CeedScalar r_t2[T_1D];
512   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
513     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
514     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
515     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
516     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
517     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
518     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
519   }
520 }
521 
522 //------------------------------------------------------------------------------
523 // 3D quadrature weights
524 //------------------------------------------------------------------------------
525 template <int P_1D, int Q_1D>
526 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
527   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
528   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
529   for (CeedInt q = 0; q < Q_1D; q++) {
530     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
531   }
532 }
533