xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h (revision d1931fc83dfaa61549375a0461a8efe2c16b442e)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis templates
10 
11 #include <ceed.h>
12 
13 //------------------------------------------------------------------------------
14 // 1D
15 //------------------------------------------------------------------------------
16 
17 //------------------------------------------------------------------------------
18 // 1D tensor contraction x
19 //------------------------------------------------------------------------------
20 template <int NUM_COMP, int P_1D, int Q_1D>
21 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
22   data.slice[data.t_id_x] = *U;
23   __syncthreads();
24   *V = 0.0;
25   if (data.t_id_x < Q_1D) {
26     for (CeedInt i = 0; i < P_1D; i++) {
27       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
28     }
29   }
30   __syncthreads();
31 }
32 
33 //------------------------------------------------------------------------------
34 // 1D transpose tensor contraction x
35 //------------------------------------------------------------------------------
36 template <int NUM_COMP, int P_1D, int Q_1D>
37 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
38   data.slice[data.t_id_x] = *U;
39   __syncthreads();
40   *V = 0.0;
41   if (data.t_id_x < P_1D) {
42     for (CeedInt i = 0; i < Q_1D; i++) {
43       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
44     }
45   }
46   __syncthreads();
47 }
48 
49 //------------------------------------------------------------------------------
50 // 1D interpolate to quadrature points
51 //------------------------------------------------------------------------------
52 template <int NUM_COMP, int P_1D, int Q_1D>
53 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
54   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
55     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
56   }
57 }
58 
59 //------------------------------------------------------------------------------
60 // 1D interpolate transpose
61 //------------------------------------------------------------------------------
62 template <int NUM_COMP, int P_1D, int Q_1D>
63 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
64                                          CeedScalar *__restrict__ r_V) {
65   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
66     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
67   }
68 }
69 
70 //------------------------------------------------------------------------------
71 // 1D derivatives at quadrature points
72 //------------------------------------------------------------------------------
73 template <int NUM_COMP, int P_1D, int Q_1D>
74 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
75                               CeedScalar *__restrict__ r_V) {
76   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
77     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
78   }
79 }
80 
81 //------------------------------------------------------------------------------
82 // 1D derivatives transpose
83 //------------------------------------------------------------------------------
84 template <int NUM_COMP, int P_1D, int Q_1D>
85 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
86                                        CeedScalar *__restrict__ r_V) {
87   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
88     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
89   }
90 }
91 
92 //------------------------------------------------------------------------------
93 // 1D quadrature weights
94 //------------------------------------------------------------------------------
95 template <int Q_1D>
96 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
97   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
98 }
99 
100 //------------------------------------------------------------------------------
101 // 2D
102 //------------------------------------------------------------------------------
103 
104 //------------------------------------------------------------------------------
105 // 2D tensor contraction x
106 //------------------------------------------------------------------------------
107 template <int NUM_COMP, int P_1D, int Q_1D>
108 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
109   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
110   __syncthreads();
111   *V = 0.0;
112   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
113     for (CeedInt i = 0; i < P_1D; i++) {
114       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
115     }
116   }
117   __syncthreads();
118 }
119 
120 //------------------------------------------------------------------------------
121 // 2D tensor contract y
122 //------------------------------------------------------------------------------
123 template <int NUM_COMP, int P_1D, int Q_1D>
124 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
125   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
126   __syncthreads();
127   *V = 0.0;
128   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
129     for (CeedInt i = 0; i < P_1D; i++) {
130       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
131     }
132   }
133   __syncthreads();
134 }
135 
136 //------------------------------------------------------------------------------
137 // 2D transpose tensor contract y
138 //------------------------------------------------------------------------------
139 template <int NUM_COMP, int P_1D, int Q_1D>
140 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
141   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
142   __syncthreads();
143   *V = 0.0;
144   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
145     for (CeedInt i = 0; i < Q_1D; i++) {
146       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
147     }
148   }
149   __syncthreads();
150 }
151 
152 //------------------------------------------------------------------------------
153 // 2D transpose tensor contract x
154 //------------------------------------------------------------------------------
155 template <int NUM_COMP, int P_1D, int Q_1D>
156 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
157   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
158   __syncthreads();
159   *V = 0.0;
160   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
161     for (CeedInt i = 0; i < Q_1D; i++) {
162       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
163     }
164   }
165   __syncthreads();
166 }
167 
168 //------------------------------------------------------------------------------
169 // 2D transpose tensor contract and add x
170 //------------------------------------------------------------------------------
171 template <int NUM_COMP, int P_1D, int Q_1D>
172 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
173   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
174   __syncthreads();
175   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
176     for (CeedInt i = 0; i < Q_1D; i++) {
177       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
178     }
179   }
180   __syncthreads();
181 }
182 
183 //------------------------------------------------------------------------------
184 // 2D interpolate to quadrature points
185 //------------------------------------------------------------------------------
186 template <int NUM_COMP, int P_1D, int Q_1D>
187 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
188   CeedScalar r_t[1];
189   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
190     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
191     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
192   }
193 }
194 
195 //------------------------------------------------------------------------------
196 // 2D interpolate transpose
197 //------------------------------------------------------------------------------
198 template <int NUM_COMP, int P_1D, int Q_1D>
199 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
200                                                CeedScalar *__restrict__ r_V) {
201   CeedScalar r_t[1];
202   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
203     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
204     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
205   }
206 }
207 
208 //------------------------------------------------------------------------------
209 // 2D derivatives at quadrature points
210 //------------------------------------------------------------------------------
211 template <int NUM_COMP, int P_1D, int Q_1D>
212 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
213                                     CeedScalar *__restrict__ r_V) {
214   CeedScalar r_t[1];
215   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
216     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, r_t);
217     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
218     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
219     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
220   }
221 }
222 
223 //------------------------------------------------------------------------------
224 // 2D derivatives transpose
225 //------------------------------------------------------------------------------
226 template <int NUM_COMP, int P_1D, int Q_1D>
227 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
228                                              CeedScalar *__restrict__ r_V) {
229   CeedScalar r_t[1];
230   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
231     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
232     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp]);
233     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
234     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
235   }
236 }
237 
238 //------------------------------------------------------------------------------
239 // 2D quadrature weights
240 //------------------------------------------------------------------------------
241 template <int Q_1D>
242 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
243   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
244 }
245 
246 //------------------------------------------------------------------------------
247 // 3D
248 //------------------------------------------------------------------------------
249 
250 //------------------------------------------------------------------------------
251 // 3D tensor contract x
252 //------------------------------------------------------------------------------
253 template <int NUM_COMP, int P_1D, int Q_1D>
254 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
255   CeedScalar r_B[P_1D];
256   for (CeedInt i = 0; i < P_1D; i++) {
257     r_B[i] = B[i + data.t_id_x * P_1D];
258   }
259 
260   for (CeedInt k = 0; k < P_1D; k++) {
261     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
262     __syncthreads();
263     V[k] = 0.0;
264     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
265       for (CeedInt i = 0; i < P_1D; i++) {
266         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
267       }
268     }
269     __syncthreads();
270   }
271 }
272 
273 //------------------------------------------------------------------------------
274 // 3D tensor contract y
275 //------------------------------------------------------------------------------
276 template <int NUM_COMP, int P_1D, int Q_1D>
277 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
278   CeedScalar r_B[P_1D];
279   for (CeedInt i = 0; i < P_1D; i++) {
280     r_B[i] = B[i + data.t_id_y * P_1D];
281   }
282 
283   for (CeedInt k = 0; k < P_1D; k++) {
284     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
285     __syncthreads();
286     V[k] = 0.0;
287     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
288       for (CeedInt i = 0; i < P_1D; i++) {
289         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
290       }
291     }
292     __syncthreads();
293   }
294 }
295 
296 //------------------------------------------------------------------------------
297 // 3D tensor contract z
298 //------------------------------------------------------------------------------
299 template <int NUM_COMP, int P_1D, int Q_1D>
300 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
301   for (CeedInt k = 0; k < Q_1D; k++) {
302     V[k] = 0.0;
303     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
304       for (CeedInt i = 0; i < P_1D; i++) {
305         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
306       }
307     }
308   }
309 }
310 
311 //------------------------------------------------------------------------------
312 // 3D transpose tensor contract z
313 //------------------------------------------------------------------------------
314 template <int NUM_COMP, int P_1D, int Q_1D>
315 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
316   for (CeedInt k = 0; k < P_1D; k++) {
317     V[k] = 0.0;
318     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
319       for (CeedInt i = 0; i < Q_1D; i++) {
320         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
321       }
322     }
323   }
324 }
325 
326 //------------------------------------------------------------------------------
327 // 3D transpose tensor contract y
328 //------------------------------------------------------------------------------
329 template <int NUM_COMP, int P_1D, int Q_1D>
330 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
331   CeedScalar r_B[Q_1D];
332   for (CeedInt i = 0; i < Q_1D; i++) {
333     r_B[i] = B[data.t_id_y + i * P_1D];
334   }
335 
336   for (CeedInt k = 0; k < P_1D; k++) {
337     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
338     __syncthreads();
339     V[k] = 0.0;
340     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
341       for (CeedInt i = 0; i < Q_1D; i++) {
342         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
343       }
344     }
345     __syncthreads();
346   }
347 }
348 
349 //------------------------------------------------------------------------------
350 // 3D transpose tensor contract y
351 //------------------------------------------------------------------------------
352 template <int NUM_COMP, int P_1D, int Q_1D>
353 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
354   CeedScalar r_B[Q_1D];
355   for (CeedInt i = 0; i < Q_1D; i++) {
356     r_B[i] = B[data.t_id_y + i * P_1D];
357   }
358 
359   for (CeedInt k = 0; k < P_1D; k++) {
360     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
361     __syncthreads();
362     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
363       for (CeedInt i = 0; i < Q_1D; i++) {
364         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
365       }
366     }
367     __syncthreads();
368   }
369 }
370 
371 //------------------------------------------------------------------------------
372 // 3D transpose tensor contract x
373 //------------------------------------------------------------------------------
374 template <int NUM_COMP, int P_1D, int Q_1D>
375 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
376   CeedScalar r_B[Q_1D];
377   for (CeedInt i = 0; i < Q_1D; i++) {
378     r_B[i] = B[data.t_id_x + i * P_1D];
379   }
380 
381   for (CeedInt k = 0; k < P_1D; k++) {
382     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
383     __syncthreads();
384     V[k] = 0.0;
385     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
386       for (CeedInt i = 0; i < Q_1D; i++) {
387         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
388       }
389     }
390     __syncthreads();
391   }
392 }
393 
394 //------------------------------------------------------------------------------
395 // 3D transpose tensor contract add x
396 //------------------------------------------------------------------------------
397 template <int NUM_COMP, int P_1D, int Q_1D>
398 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
399   CeedScalar r_B[Q_1D];
400   for (CeedInt i = 0; i < Q_1D; i++) {
401     r_B[i] = B[data.t_id_x + i * P_1D];
402   }
403 
404   for (CeedInt k = 0; k < P_1D; k++) {
405     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
406     __syncthreads();
407     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
408       for (CeedInt i = 0; i < Q_1D; i++) {
409         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
410       }
411     }
412     __syncthreads();
413   }
414 }
415 
416 //------------------------------------------------------------------------------
417 // 3D interpolate to quadrature points
418 //------------------------------------------------------------------------------
419 template <int NUM_COMP, int P_1D, int Q_1D>
420 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
421   CeedScalar r_t1[T_1D];
422   CeedScalar r_t2[T_1D];
423   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
424     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
425     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
426     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
427   }
428 }
429 
430 //------------------------------------------------------------------------------
431 // 3D interpolate transpose
432 //------------------------------------------------------------------------------
433 template <int NUM_COMP, int P_1D, int Q_1D>
434 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
435                                                CeedScalar *__restrict__ r_V) {
436   CeedScalar r_t1[T_1D];
437   CeedScalar r_t2[T_1D];
438   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
439     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
440     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
441     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
442   }
443 }
444 
445 //------------------------------------------------------------------------------
446 // 3D derivatives at quadrature points
447 //------------------------------------------------------------------------------
448 template <int NUM_COMP, int P_1D, int Q_1D>
449 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
450                                     CeedScalar *__restrict__ r_V) {
451   CeedScalar r_t1[T_1D];
452   CeedScalar r_t2[T_1D];
453   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
454     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
455     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
456     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
457     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
458     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
459     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
460     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
461     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
462     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
463   }
464 }
465 
466 //------------------------------------------------------------------------------
467 // 3D derivatives transpose
468 //------------------------------------------------------------------------------
469 template <int NUM_COMP, int P_1D, int Q_1D>
470 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
471                                              CeedScalar *__restrict__ r_V) {
472   CeedScalar r_t1[T_1D];
473   CeedScalar r_t2[T_1D];
474   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
475     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
476     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
477     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
478     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
479     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
480     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
481     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
482     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
483     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
484   }
485 }
486 
487 //------------------------------------------------------------------------------
488 // 3D derivatives at quadrature points
489 //------------------------------------------------------------------------------
490 template <int NUM_COMP, int P_1D, int Q_1D>
491 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
492                                               CeedScalar *__restrict__ r_V) {
493   CeedScalar r_t1[T_1D];
494   CeedScalar r_t2[T_1D];
495   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
496     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
497     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
498     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
499     ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
500     ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
501     ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
502   }
503 }
504 
505 //------------------------------------------------------------------------------
506 // 3D derivatives transpose
507 //------------------------------------------------------------------------------
508 template <int NUM_COMP, int P_1D, int Q_1D>
509 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
510                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
511   CeedScalar r_t1[T_1D];
512   CeedScalar r_t2[T_1D];
513   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
514     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
515     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
516     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
517     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
518     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
519     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
520   }
521 }
522 
523 //------------------------------------------------------------------------------
524 // 3D quadrature weights
525 //------------------------------------------------------------------------------
526 template <int Q_1D>
527 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
528   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
529   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
530   for (CeedInt q = 0; q < Q_1D; q++) {
531     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
532   }
533 }
534