xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h (revision 343e3094792a64f9c2da70ef2256f98e7dc173cf)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D>
20 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21   data.slice[data.t_id_x] = *U;
22   __syncthreads();
23   *V = 0.0;
24   if (data.t_id_x < Q_1D) {
25     for (CeedInt i = 0; i < P_1D; i++) {
26       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
27     }
28   }
29   __syncthreads();
30 }
31 
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
35 template <int NUM_COMP, int P_1D, int Q_1D>
36 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37   data.slice[data.t_id_x] = *U;
38   __syncthreads();
39   *V = 0.0;
40   if (data.t_id_x < P_1D) {
41     for (CeedInt i = 0; i < Q_1D; i++) {
42       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
43     }
44   }
45   __syncthreads();
46 }
47 
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
52 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
53   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
55   }
56 }
57 
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
62 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
63                                          CeedScalar *__restrict__ r_V) {
64   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
66   }
67 }
68 
69 //------------------------------------------------------------------------------
70 // 1D derivatives at quadrature points
71 //------------------------------------------------------------------------------
72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
73 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
74                               CeedScalar *__restrict__ r_V) {
75   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
77   }
78 }
79 
80 //------------------------------------------------------------------------------
81 // 1D derivatives transpose
82 //------------------------------------------------------------------------------
83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
84 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
85                                        CeedScalar *__restrict__ r_V) {
86   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // 1D quadrature weights
93 //------------------------------------------------------------------------------
94 template <int Q_1D>
95 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
96   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
97 }
98 
99 //------------------------------------------------------------------------------
100 // 2D
101 //------------------------------------------------------------------------------
102 
103 //------------------------------------------------------------------------------
104 // 2D tensor contraction x
105 //------------------------------------------------------------------------------
106 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
107 inline __device__ void ContractX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
108                                    CeedScalar *V) {
109   data.slice[t_id_x + t_id_y * T_1D] = *U;
110   __syncthreads();
111   *V = 0.0;
112   if (t_id_x < Q_1D && t_id_y < P_1D) {
113     for (CeedInt i = 0; i < P_1D; i++) {
114       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
115     }
116   }
117   __syncthreads();
118 }
119 
120 //------------------------------------------------------------------------------
121 // 2D tensor contract y
122 //------------------------------------------------------------------------------
123 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
124 inline __device__ void ContractY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
125                                    CeedScalar *V) {
126   data.slice[t_id_x + t_id_y * T_1D] = *U;
127 >>>>>>> b855402d (gpu - isolate core 2D tensor logic to allow flat version)
128   __syncthreads();
129   *V = 0.0;
130   if (t_id_x < Q_1D && t_id_y < Q_1D) {
131     for (CeedInt i = 0; i < P_1D; i++) {
132       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
133     }
134   }
135   __syncthreads();
136 }
137 
138 //------------------------------------------------------------------------------
139 // 2D transpose tensor contract y
140 //------------------------------------------------------------------------------
141 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
142 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
143                                             CeedScalar *V) {
144   data.slice[t_id_x + t_id_y * T_1D] = *U;
145   __syncthreads();
146   *V = 0.0;
147   if (t_id_x < Q_1D && t_id_y < P_1D) {
148     for (CeedInt i = 0; i < Q_1D; i++) {
149       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
150     }
151   }
152   __syncthreads();
153 }
154 
155 //------------------------------------------------------------------------------
156 // 2D transpose tensor contract x
157 //------------------------------------------------------------------------------
158 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
159 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
160                                             CeedScalar *V) {
161   data.slice[t_id_x + t_id_y * T_1D] = *U;
162   __syncthreads();
163   *V = 0.0;
164   if (t_id_x < P_1D && t_id_y < P_1D) {
165     for (CeedInt i = 0; i < Q_1D; i++) {
166       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
167     }
168   }
169   __syncthreads();
170 }
171 
172 //------------------------------------------------------------------------------
173 // 2D transpose tensor contract and add x
174 //------------------------------------------------------------------------------
175 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
176 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
177                                                CeedScalar *V) {
178   data.slice[t_id_x + t_id_y * T_1D] = *U;
179   __syncthreads();
180   if (t_id_x < P_1D && t_id_y < P_1D) {
181     for (CeedInt i = 0; i < Q_1D; i++) {
182       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
183     }
184   }
185   __syncthreads();
186 }
187 
188 //------------------------------------------------------------------------------
189 // 2D interpolate to quadrature points
190 //------------------------------------------------------------------------------
191 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
192 inline __device__ void InterpTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
193                                            const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
194   CeedScalar r_t[1];
195   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
196     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
197     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
198   }
199 }
200 
201 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
202 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
203   InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
204 }
205 
206 //------------------------------------------------------------------------------
207 // 2D interpolate transpose
208 //------------------------------------------------------------------------------
209 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
210 inline __device__ void InterpTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
211                                                     const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
212   CeedScalar r_t[1];
213   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
214     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
215     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
216   }
217 }
218 
219 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
220 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
221                                                CeedScalar *__restrict__ r_V) {
222   InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
223 }
224 
225 //------------------------------------------------------------------------------
226 // 2D derivatives at quadrature points
227 //------------------------------------------------------------------------------
228 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
229 inline __device__ void GradTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
230                                          const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
231   CeedScalar r_t[1];
232   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
233     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
234     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
235     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
236     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
237   }
238 }
239 
240 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
241 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
242                                     CeedScalar *__restrict__ r_V) {
243   GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
244 }
245 
246 //------------------------------------------------------------------------------
247 // 2D derivatives transpose
248 //------------------------------------------------------------------------------
249 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
250 inline __device__ void GradTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
251                                                   const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
252   CeedScalar r_t[1];
253   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
254     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
255     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
256     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
257     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
258   }
259 }
260 
261 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
262 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
263                                              CeedScalar *__restrict__ r_V) {
264   GradTansposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
265 }
266 
267 //------------------------------------------------------------------------------
268 // 2D quadrature weights
269 //------------------------------------------------------------------------------
270 template <int Q_1D>
271 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
272   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
273 }
274 
275 //------------------------------------------------------------------------------
276 // 3D
277 //------------------------------------------------------------------------------
278 
279 //------------------------------------------------------------------------------
280 // 3D tensor contract x
281 //------------------------------------------------------------------------------
282 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
283 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
284   CeedScalar r_B[P_1D];
285   for (CeedInt i = 0; i < P_1D; i++) {
286     r_B[i] = B[i + data.t_id_x * P_1D];
287   }
288 
289   for (CeedInt k = 0; k < P_1D; k++) {
290     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
291     __syncthreads();
292     V[k] = 0.0;
293     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
294       for (CeedInt i = 0; i < P_1D; i++) {
295         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
296       }
297     }
298     __syncthreads();
299   }
300 }
301 
302 //------------------------------------------------------------------------------
303 // 3D tensor contract y
304 //------------------------------------------------------------------------------
305 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
306 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
307   CeedScalar r_B[P_1D];
308   for (CeedInt i = 0; i < P_1D; i++) {
309     r_B[i] = B[i + data.t_id_y * P_1D];
310   }
311 
312   for (CeedInt k = 0; k < P_1D; k++) {
313     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
314     __syncthreads();
315     V[k] = 0.0;
316     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
317       for (CeedInt i = 0; i < P_1D; i++) {
318         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
319       }
320     }
321     __syncthreads();
322   }
323 }
324 
325 //------------------------------------------------------------------------------
326 // 3D tensor contract z
327 //------------------------------------------------------------------------------
328 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
329 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
330   for (CeedInt k = 0; k < Q_1D; k++) {
331     V[k] = 0.0;
332     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
333       for (CeedInt i = 0; i < P_1D; i++) {
334         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
335       }
336     }
337   }
338 }
339 
340 //------------------------------------------------------------------------------
341 // 3D transpose tensor contract z
342 //------------------------------------------------------------------------------
343 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
344 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
345   for (CeedInt k = 0; k < P_1D; k++) {
346     V[k] = 0.0;
347     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
348       for (CeedInt i = 0; i < Q_1D; i++) {
349         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
350       }
351     }
352   }
353 }
354 
355 //------------------------------------------------------------------------------
356 // 3D transpose tensor contract y
357 //------------------------------------------------------------------------------
358 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
359 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
360   CeedScalar r_B[Q_1D];
361   for (CeedInt i = 0; i < Q_1D; i++) {
362     r_B[i] = B[data.t_id_y + i * P_1D];
363   }
364 
365   for (CeedInt k = 0; k < P_1D; k++) {
366     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
367     __syncthreads();
368     V[k] = 0.0;
369     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
370       for (CeedInt i = 0; i < Q_1D; i++) {
371         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
372       }
373     }
374     __syncthreads();
375   }
376 }
377 
378 //------------------------------------------------------------------------------
379 // 3D transpose tensor contract y
380 //------------------------------------------------------------------------------
381 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
382 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
383   CeedScalar r_B[Q_1D];
384   for (CeedInt i = 0; i < Q_1D; i++) {
385     r_B[i] = B[data.t_id_y + i * P_1D];
386   }
387 
388   for (CeedInt k = 0; k < P_1D; k++) {
389     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
390     __syncthreads();
391     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
392       for (CeedInt i = 0; i < Q_1D; i++) {
393         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
394       }
395     }
396     __syncthreads();
397   }
398 }
399 
400 //------------------------------------------------------------------------------
401 // 3D transpose tensor contract x
402 //------------------------------------------------------------------------------
403 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
404 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
405   CeedScalar r_B[Q_1D];
406   for (CeedInt i = 0; i < Q_1D; i++) {
407     r_B[i] = B[data.t_id_x + i * P_1D];
408   }
409 
410   for (CeedInt k = 0; k < P_1D; k++) {
411     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
412     __syncthreads();
413     V[k] = 0.0;
414     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
415       for (CeedInt i = 0; i < Q_1D; i++) {
416         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
417       }
418     }
419     __syncthreads();
420   }
421 }
422 
423 //------------------------------------------------------------------------------
424 // 3D transpose tensor contract add x
425 //------------------------------------------------------------------------------
426 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
427 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
428   CeedScalar r_B[Q_1D];
429   for (CeedInt i = 0; i < Q_1D; i++) {
430     r_B[i] = B[data.t_id_x + i * P_1D];
431   }
432 
433   for (CeedInt k = 0; k < P_1D; k++) {
434     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
435     __syncthreads();
436     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
437       for (CeedInt i = 0; i < Q_1D; i++) {
438         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
439       }
440     }
441     __syncthreads();
442   }
443 }
444 
445 //------------------------------------------------------------------------------
446 // 3D interpolate to quadrature points
447 //------------------------------------------------------------------------------
448 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
449 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
450   CeedScalar r_t1[T_1D];
451   CeedScalar r_t2[T_1D];
452   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
453     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
454     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
455     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
456   }
457 }
458 
459 //------------------------------------------------------------------------------
460 // 3D interpolate transpose
461 //------------------------------------------------------------------------------
462 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
463 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
464                                                CeedScalar *__restrict__ r_V) {
465   CeedScalar r_t1[T_1D];
466   CeedScalar r_t2[T_1D];
467   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
468     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
469     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
470     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
471   }
472 }
473 
474 //------------------------------------------------------------------------------
475 // 3D derivatives at quadrature points
476 //------------------------------------------------------------------------------
477 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
478 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
479                                     CeedScalar *__restrict__ r_V) {
480   CeedScalar r_t1[T_1D];
481   CeedScalar r_t2[T_1D];
482   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
483     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
484     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
485     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
486     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
487     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
488     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
489     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
490     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
491     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
492   }
493 }
494 
495 //------------------------------------------------------------------------------
496 // 3D derivatives transpose
497 //------------------------------------------------------------------------------
498 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
499 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
500                                              CeedScalar *__restrict__ r_V) {
501   CeedScalar r_t1[T_1D];
502   CeedScalar r_t2[T_1D];
503   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
504     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
505     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
506     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
507     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
508     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
509     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
510     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
511     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
512     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
513   }
514 }
515 
516 //------------------------------------------------------------------------------
517 // 3D derivatives at quadrature points
518 //------------------------------------------------------------------------------
519 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
520 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
521                                               CeedScalar *__restrict__ r_V) {
522   CeedScalar r_t1[T_1D];
523   CeedScalar r_t2[T_1D];
524   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
525     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
526     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
527     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
528     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
529     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
530     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
531   }
532 }
533 
534 //------------------------------------------------------------------------------
535 // 3D derivatives transpose
536 //------------------------------------------------------------------------------
537 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
538 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
539                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
540   CeedScalar r_t1[T_1D];
541   CeedScalar r_t2[T_1D];
542   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
543     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
544     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
545     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
546     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
547     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
548     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
549   }
550 }
551 
552 //------------------------------------------------------------------------------
553 // 3D quadrature weights
554 //------------------------------------------------------------------------------
555 template <int Q_1D>
556 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
557   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
558   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
559   for (CeedInt q = 0; q < Q_1D; q++) {
560     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
561   }
562 }
563