xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h (revision 24a65d3da2f623912f26b42c0b9ba6f37de25307)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis templates
10 #ifndef CEED_HIP_SHARED_BASIS_TENSOR_TEMPLATES_H
11 #define CEED_HIP_SHARED_BASIS_TENSOR_TEMPLATES_H
12 
13 #include <ceed.h>
14 
15 //------------------------------------------------------------------------------
16 // 1D
17 //------------------------------------------------------------------------------
18 
19 //------------------------------------------------------------------------------
20 // 1D tensor contraction x
21 //------------------------------------------------------------------------------
22 template <int NUM_COMP, int P_1D, int Q_1D>
23 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
24   data.slice[data.t_id_x] = *U;
25   __syncthreads();
26   *V = 0.0;
27   if (data.t_id_x < Q_1D) {
28     for (CeedInt i = 0; i < P_1D; i++) {
29       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
30     }
31   }
32   __syncthreads();
33 }
34 
35 //------------------------------------------------------------------------------
36 // 1D transpose tensor contraction x
37 //------------------------------------------------------------------------------
38 template <int NUM_COMP, int P_1D, int Q_1D>
39 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
40   data.slice[data.t_id_x] = *U;
41   __syncthreads();
42   *V = 0.0;
43   if (data.t_id_x < P_1D) {
44     for (CeedInt i = 0; i < Q_1D; i++) {
45       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
46     }
47   }
48   __syncthreads();
49 }
50 
51 //------------------------------------------------------------------------------
52 // 1D interpolate to quadrature points
53 //------------------------------------------------------------------------------
54 template <int NUM_COMP, int P_1D, int Q_1D>
55 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
56   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
57     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
58   }
59 }
60 
61 //------------------------------------------------------------------------------
62 // 1D interpolate transpose
63 //------------------------------------------------------------------------------
64 template <int NUM_COMP, int P_1D, int Q_1D>
65 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
66                                          CeedScalar *__restrict__ r_V) {
67   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
68     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
69   }
70 }
71 
72 //------------------------------------------------------------------------------
73 // 1D derivatives at quadrature points
74 //------------------------------------------------------------------------------
75 template <int NUM_COMP, int P_1D, int Q_1D>
76 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
77                               CeedScalar *__restrict__ r_V) {
78   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
79     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
80   }
81 }
82 
83 //------------------------------------------------------------------------------
84 // 1D derivatives transpose
85 //------------------------------------------------------------------------------
86 template <int NUM_COMP, int P_1D, int Q_1D>
87 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
88                                        CeedScalar *__restrict__ r_V) {
89   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
90     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
91   }
92 }
93 
94 //------------------------------------------------------------------------------
95 // 1D quadrature weights
96 //------------------------------------------------------------------------------
97 template <int Q_1D>
98 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
99   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
100 }
101 
102 //------------------------------------------------------------------------------
103 // 2D
104 //------------------------------------------------------------------------------
105 
106 //------------------------------------------------------------------------------
107 // 2D tensor contraction x
108 //------------------------------------------------------------------------------
109 template <int NUM_COMP, int P_1D, int Q_1D>
110 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
111   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
112   __syncthreads();
113   *V = 0.0;
114   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
115     for (CeedInt i = 0; i < P_1D; i++) {
116       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
117     }
118   }
119   __syncthreads();
120 }
121 
122 //------------------------------------------------------------------------------
123 // 2D tensor contract y
124 //------------------------------------------------------------------------------
125 template <int NUM_COMP, int P_1D, int Q_1D>
126 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
127   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
128   __syncthreads();
129   *V = 0.0;
130   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
131     for (CeedInt i = 0; i < P_1D; i++) {
132       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
133     }
134   }
135   __syncthreads();
136 }
137 
138 //------------------------------------------------------------------------------
139 // 2D transpose tensor contract y
140 //------------------------------------------------------------------------------
141 template <int NUM_COMP, int P_1D, int Q_1D>
142 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
143   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
144   __syncthreads();
145   *V = 0.0;
146   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
147     for (CeedInt i = 0; i < Q_1D; i++) {
148       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
149     }
150   }
151   __syncthreads();
152 }
153 
154 //------------------------------------------------------------------------------
155 // 2D transpose tensor contract x
156 //------------------------------------------------------------------------------
157 template <int NUM_COMP, int P_1D, int Q_1D>
158 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
159   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
160   __syncthreads();
161   *V = 0.0;
162   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
163     for (CeedInt i = 0; i < Q_1D; i++) {
164       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
165     }
166   }
167   __syncthreads();
168 }
169 
170 //------------------------------------------------------------------------------
171 // 2D transpose tensor contract and add x
172 //------------------------------------------------------------------------------
173 template <int NUM_COMP, int P_1D, int Q_1D>
174 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
175   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
176   __syncthreads();
177   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
178     for (CeedInt i = 0; i < Q_1D; i++) {
179       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
180     }
181   }
182   __syncthreads();
183 }
184 
185 //------------------------------------------------------------------------------
186 // 2D interpolate to quadrature points
187 //------------------------------------------------------------------------------
188 template <int NUM_COMP, int P_1D, int Q_1D>
189 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
190   CeedScalar r_t[1];
191   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
192     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
193     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
194   }
195 }
196 
197 //------------------------------------------------------------------------------
198 // 2D interpolate transpose
199 //------------------------------------------------------------------------------
200 template <int NUM_COMP, int P_1D, int Q_1D>
201 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
202                                                CeedScalar *__restrict__ r_V) {
203   CeedScalar r_t[1];
204   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
205     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
206     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
207   }
208 }
209 
210 //------------------------------------------------------------------------------
211 // 2D derivatives at quadrature points
212 //------------------------------------------------------------------------------
213 template <int NUM_COMP, int P_1D, int Q_1D>
214 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
215                                     CeedScalar *__restrict__ r_V) {
216   CeedScalar r_t[1];
217   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
218     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t);
219     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0 * NUM_COMP);
220     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
221     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1 * NUM_COMP);
222   }
223 }
224 
225 //------------------------------------------------------------------------------
226 // 2D derivatives transpose
227 //------------------------------------------------------------------------------
228 template <int NUM_COMP, int P_1D, int Q_1D>
229 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
230                                              CeedScalar *__restrict__ r_V) {
231   CeedScalar r_t[1];
232   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
233     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0 * NUM_COMP, c_B, r_t);
234     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp);
235     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1 * NUM_COMP, c_G, r_t);
236     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
237   }
238 }
239 
240 //------------------------------------------------------------------------------
241 // 2D quadrature weights
242 //------------------------------------------------------------------------------
243 template <int Q_1D>
244 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
245   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
246 }
247 
248 //------------------------------------------------------------------------------
249 // 3D
250 //------------------------------------------------------------------------------
251 
252 //------------------------------------------------------------------------------
253 // 3D tensor contract x
254 //------------------------------------------------------------------------------
255 template <int NUM_COMP, int P_1D, int Q_1D>
256 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
257   CeedScalar r_B[P_1D];
258   for (CeedInt i = 0; i < P_1D; i++) {
259     r_B[i] = B[i + data.t_id_x * P_1D];
260   }
261 
262   for (CeedInt k = 0; k < P_1D; k++) {
263     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
264     __syncthreads();
265     V[k] = 0.0;
266     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
267       for (CeedInt i = 0; i < P_1D; i++) {
268         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
269       }
270     }
271     __syncthreads();
272   }
273 }
274 
275 //------------------------------------------------------------------------------
276 // 3D tensor contract y
277 //------------------------------------------------------------------------------
278 template <int NUM_COMP, int P_1D, int Q_1D>
279 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
280   CeedScalar r_B[P_1D];
281   for (CeedInt i = 0; i < P_1D; i++) {
282     r_B[i] = B[i + data.t_id_y * P_1D];
283   }
284 
285   for (CeedInt k = 0; k < P_1D; k++) {
286     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
287     __syncthreads();
288     V[k] = 0.0;
289     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
290       for (CeedInt i = 0; i < P_1D; i++) {
291         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
292       }
293     }
294     __syncthreads();
295   }
296 }
297 
298 //------------------------------------------------------------------------------
299 // 3D tensor contract z
300 //------------------------------------------------------------------------------
301 template <int NUM_COMP, int P_1D, int Q_1D>
302 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
303   for (CeedInt k = 0; k < Q_1D; k++) {
304     V[k] = 0.0;
305     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
306       for (CeedInt i = 0; i < P_1D; i++) {
307         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
308       }
309     }
310   }
311 }
312 
313 //------------------------------------------------------------------------------
314 // 3D transpose tensor contract z
315 //------------------------------------------------------------------------------
316 template <int NUM_COMP, int P_1D, int Q_1D>
317 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
318   for (CeedInt k = 0; k < P_1D; k++) {
319     V[k] = 0.0;
320     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
321       for (CeedInt i = 0; i < Q_1D; i++) {
322         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
323       }
324     }
325   }
326 }
327 
328 //------------------------------------------------------------------------------
329 // 3D transpose tensor contract y
330 //------------------------------------------------------------------------------
331 template <int NUM_COMP, int P_1D, int Q_1D>
332 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
333   CeedScalar r_B[Q_1D];
334   for (CeedInt i = 0; i < Q_1D; i++) {
335     r_B[i] = B[data.t_id_y + i * P_1D];
336   }
337 
338   for (CeedInt k = 0; k < P_1D; k++) {
339     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
340     __syncthreads();
341     V[k] = 0.0;
342     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
343       for (CeedInt i = 0; i < Q_1D; i++) {
344         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
345       }
346     }
347     __syncthreads();
348   }
349 }
350 
351 //------------------------------------------------------------------------------
352 // 3D transpose tensor contract y
353 //------------------------------------------------------------------------------
354 template <int NUM_COMP, int P_1D, int Q_1D>
355 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
356   CeedScalar r_B[Q_1D];
357   for (CeedInt i = 0; i < Q_1D; i++) {
358     r_B[i] = B[data.t_id_y + i * P_1D];
359   }
360 
361   for (CeedInt k = 0; k < P_1D; k++) {
362     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
363     __syncthreads();
364     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
365       for (CeedInt i = 0; i < Q_1D; i++) {
366         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
367       }
368     }
369     __syncthreads();
370   }
371 }
372 
373 //------------------------------------------------------------------------------
374 // 3D transpose tensor contract x
375 //------------------------------------------------------------------------------
376 template <int NUM_COMP, int P_1D, int Q_1D>
377 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
378   CeedScalar r_B[Q_1D];
379   for (CeedInt i = 0; i < Q_1D; i++) {
380     r_B[i] = B[data.t_id_x + i * P_1D];
381   }
382 
383   for (CeedInt k = 0; k < P_1D; k++) {
384     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
385     __syncthreads();
386     V[k] = 0.0;
387     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
388       for (CeedInt i = 0; i < Q_1D; i++) {
389         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
390       }
391     }
392     __syncthreads();
393   }
394 }
395 
396 //------------------------------------------------------------------------------
397 // 3D transpose tensor contract add x
398 //------------------------------------------------------------------------------
399 template <int NUM_COMP, int P_1D, int Q_1D>
400 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
401   CeedScalar r_B[Q_1D];
402   for (CeedInt i = 0; i < Q_1D; i++) {
403     r_B[i] = B[data.t_id_x + i * P_1D];
404   }
405 
406   for (CeedInt k = 0; k < P_1D; k++) {
407     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
408     __syncthreads();
409     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
410       for (CeedInt i = 0; i < Q_1D; i++) {
411         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
412       }
413     }
414     __syncthreads();
415   }
416 }
417 
418 //------------------------------------------------------------------------------
419 // 3D interpolate to quadrature points
420 //------------------------------------------------------------------------------
421 template <int NUM_COMP, int P_1D, int Q_1D>
422 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
423   CeedScalar r_t1[T_1D];
424   CeedScalar r_t2[T_1D];
425   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
426     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
427     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
428     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D);
429   }
430 }
431 
432 //------------------------------------------------------------------------------
433 // 3D interpolate transpose
434 //------------------------------------------------------------------------------
435 template <int NUM_COMP, int P_1D, int Q_1D>
436 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
437                                                CeedScalar *__restrict__ r_V) {
438   CeedScalar r_t1[T_1D];
439   CeedScalar r_t2[T_1D];
440   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
441     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D, c_B, r_t1);
442     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
443     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
444   }
445 }
446 
447 //------------------------------------------------------------------------------
448 // 3D derivatives at quadrature points
449 //------------------------------------------------------------------------------
450 template <int NUM_COMP, int P_1D, int Q_1D>
451 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
452                                     CeedScalar *__restrict__ r_V) {
453   CeedScalar r_t1[T_1D];
454   CeedScalar r_t2[T_1D];
455   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
456     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_G, r_t1);
457     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
458     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
459     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
460     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
461     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
462     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
463     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
464     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
465   }
466 }
467 
468 //------------------------------------------------------------------------------
469 // 3D derivatives transpose
470 //------------------------------------------------------------------------------
471 template <int NUM_COMP, int P_1D, int Q_1D>
472 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
473                                              CeedScalar *__restrict__ r_V) {
474   CeedScalar r_t1[T_1D];
475   CeedScalar r_t2[T_1D];
476   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
477     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1);
478     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
479     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * P_1D);
480     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1);
481     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
482     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
483     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1);
484     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
485     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
486   }
487 }
488 
489 //------------------------------------------------------------------------------
490 // 3D derivatives at quadrature points
491 //------------------------------------------------------------------------------
492 template <int NUM_COMP, int P_1D, int Q_1D>
493 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
494                                               CeedScalar *__restrict__ r_V) {
495   CeedScalar r_t1[T_1D];
496   CeedScalar r_t2[T_1D];
497   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
498     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
499     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
500     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
501     ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
502     ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
503     ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
504   }
505 }
506 
507 //------------------------------------------------------------------------------
508 // 3D derivatives transpose
509 //------------------------------------------------------------------------------
510 template <int NUM_COMP, int P_1D, int Q_1D>
511 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
512                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
513   CeedScalar r_t1[T_1D];
514   CeedScalar r_t2[T_1D];
515   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
516     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2);
517     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2);
518     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2);
519     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
520     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
521     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
522   }
523 }
524 
525 //------------------------------------------------------------------------------
526 // 3D quadrature weights
527 //------------------------------------------------------------------------------
528 template <int Q_1D>
529 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
530   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
531   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
532   for (CeedInt q = 0; q < Q_1D; q++) {
533     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
534   }
535 }
536 
537 //------------------------------------------------------------------------------
538 
539 #endif  // CEED_HIP_SHARED_BASIS_TENSOR_TEMPLATES_H
540