xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h (revision 346c77e6436e93de99b1714e06a264fc70d47960)
1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D>
20 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21   __syncthreads();
22   data.slice[data.t_id_x] = *U;
23   __syncthreads();
24   *V = 0.0;
25   if (data.t_id_x < Q_1D) {
26     for (CeedInt i = 0; i < P_1D; i++) {
27       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
28     }
29   }
30 }
31 
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
35 template <int NUM_COMP, int P_1D, int Q_1D>
36 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37   __syncthreads();
38   data.slice[data.t_id_x] = *U;
39   __syncthreads();
40   *V = 0.0;
41   if (data.t_id_x < P_1D) {
42     for (CeedInt i = 0; i < Q_1D; i++) {
43       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
44     }
45   }
46 }
47 
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
52 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
53   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
55   }
56 }
57 
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
62 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
63                                          CeedScalar *__restrict__ r_V) {
64   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
66   }
67 }
68 
69 //------------------------------------------------------------------------------
70 // 1D interpolate to quadrature points, nodes and quadrature points collocated
71 //------------------------------------------------------------------------------
72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
73 inline __device__ void InterpCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
74                                                CeedScalar *__restrict__ r_V) {
75   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76     r_V[comp] = r_U[comp];
77   }
78 }
79 
80 //------------------------------------------------------------------------------
81 // 1D interpolate transpose, nodes and quadrature points collocated
82 //------------------------------------------------------------------------------
83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
84 inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
85                                                         CeedScalar *__restrict__ r_V) {
86   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87     r_V[comp] = r_U[comp];
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // 1D derivatives at quadrature points
93 //------------------------------------------------------------------------------
94 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
95 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
96                               CeedScalar *__restrict__ r_V) {
97   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
98     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
99   }
100 }
101 
102 //------------------------------------------------------------------------------
103 // 1D derivatives transpose
104 //------------------------------------------------------------------------------
105 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
106 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
107                                        CeedScalar *__restrict__ r_V) {
108   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
109     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
110   }
111 }
112 
113 //------------------------------------------------------------------------------
114 // 1D quadrature weights
115 //------------------------------------------------------------------------------
116 template <int P_1D, int Q_1D>
117 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
118   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
119 }
120 
121 //------------------------------------------------------------------------------
122 // 2D
123 //------------------------------------------------------------------------------
124 
125 //------------------------------------------------------------------------------
126 // 2D tensor contraction x
127 //------------------------------------------------------------------------------
128 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
129 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
130   __syncthreads();
131   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
132   __syncthreads();
133   *V = 0.0;
134   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
135     for (CeedInt i = 0; i < P_1D; i++) {
136       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
137     }
138   }
139 }
140 
141 //------------------------------------------------------------------------------
142 // 2D tensor contract y
143 //------------------------------------------------------------------------------
144 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
145 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
146   __syncthreads();
147   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
148   __syncthreads();
149   *V = 0.0;
150   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
151     for (CeedInt i = 0; i < P_1D; i++) {
152       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
153     }
154   }
155 }
156 
157 //------------------------------------------------------------------------------
158 // 2D transpose tensor contract y
159 //------------------------------------------------------------------------------
160 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
161 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
162   __syncthreads();
163   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
164   __syncthreads();
165   *V = 0.0;
166   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
167     for (CeedInt i = 0; i < Q_1D; i++) {
168       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
169     }
170   }
171 }
172 
173 //------------------------------------------------------------------------------
174 // 2D transpose tensor contract x
175 //------------------------------------------------------------------------------
176 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
177 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
178   __syncthreads();
179   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
180   __syncthreads();
181   *V = 0.0;
182   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
183     for (CeedInt i = 0; i < Q_1D; i++) {
184       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
185     }
186   }
187 }
188 
189 //------------------------------------------------------------------------------
190 // 2D transpose tensor contract and add x
191 //------------------------------------------------------------------------------
192 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
193 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
194   __syncthreads();
195   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
196   __syncthreads();
197   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
198     for (CeedInt i = 0; i < Q_1D; i++) {
199       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
200     }
201   }
202 }
203 
204 //------------------------------------------------------------------------------
205 // 2D interpolate to quadrature points
206 //------------------------------------------------------------------------------
207 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
208 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
209   CeedScalar r_t[1];
210   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
211     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
212     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
213   }
214 }
215 
216 //------------------------------------------------------------------------------
217 // 2D interpolate transpose
218 //------------------------------------------------------------------------------
219 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
220 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
221                                                CeedScalar *__restrict__ r_V) {
222   CeedScalar r_t[1];
223   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
224     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
225     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
226   }
227 }
228 
229 //------------------------------------------------------------------------------
230 // 2D interpolate to quadrature points, nodes and quadrature points collocated
231 //------------------------------------------------------------------------------
232 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
233 inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
234                                                      CeedScalar *__restrict__ r_V) {
235   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
236     r_V[comp] = r_U[comp];
237   }
238 }
239 
240 //------------------------------------------------------------------------------
241 // 2D interpolate transpose, nodes and quadrature points collocated
242 //------------------------------------------------------------------------------
243 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
244 inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
245                                                               CeedScalar *__restrict__ r_V) {
246   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
247     r_V[comp] = r_U[comp];
248   }
249 }
250 
251 //------------------------------------------------------------------------------
252 // 2D derivatives at quadrature points
253 //------------------------------------------------------------------------------
254 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
255 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
256                                     CeedScalar *__restrict__ r_V) {
257   CeedScalar r_t[1];
258   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
259     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
260     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
261     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
262     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
263   }
264 }
265 
266 //------------------------------------------------------------------------------
267 // 2D derivatives transpose
268 //------------------------------------------------------------------------------
269 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
270 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
271                                              CeedScalar *__restrict__ r_V) {
272   CeedScalar r_t[1];
273   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
274     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
275     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
276     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
277     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
278   }
279 }
280 
281 //------------------------------------------------------------------------------
282 // 2D derivatives at quadrature points, nodes and quadrature points collocated
283 //------------------------------------------------------------------------------
284 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
285 inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
286                                                    const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
287   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
288     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
289     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
290   }
291 }
292 
293 //------------------------------------------------------------------------------
294 // 2D derivatives transpose, nodes and quadrature points collocated
295 //------------------------------------------------------------------------------
296 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
297 inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
298                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
299   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
300     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
301     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
302   }
303 }
304 
305 //------------------------------------------------------------------------------
306 // 2D quadrature weights
307 //------------------------------------------------------------------------------
308 template <int P_1D, int Q_1D>
309 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
310   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
311 }
312 
313 //------------------------------------------------------------------------------
314 // 3D
315 //------------------------------------------------------------------------------
316 
317 //------------------------------------------------------------------------------
318 // 3D tensor contract x
319 //------------------------------------------------------------------------------
320 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
321 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
322   CeedScalar r_B[P_1D];
323   for (CeedInt i = 0; i < P_1D; i++) {
324     r_B[i] = B[i + data.t_id_x * P_1D];
325   }
326 
327   for (CeedInt k = 0; k < P_1D; k++) {
328     __syncthreads();
329     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
330     __syncthreads();
331     V[k] = 0.0;
332     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
333       for (CeedInt i = 0; i < P_1D; i++) {
334         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
335       }
336     }
337   }
338 }
339 
340 //------------------------------------------------------------------------------
341 // 3D tensor contract y
342 //------------------------------------------------------------------------------
343 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
344 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
345   CeedScalar r_B[P_1D];
346   for (CeedInt i = 0; i < P_1D; i++) {
347     r_B[i] = B[i + data.t_id_y * P_1D];
348   }
349 
350   for (CeedInt k = 0; k < P_1D; k++) {
351     __syncthreads();
352     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
353     __syncthreads();
354     V[k] = 0.0;
355     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
356       for (CeedInt i = 0; i < P_1D; i++) {
357         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
358       }
359     }
360   }
361 }
362 
363 //------------------------------------------------------------------------------
364 // 3D tensor contract z
365 //------------------------------------------------------------------------------
366 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
367 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
368   for (CeedInt k = 0; k < Q_1D; k++) {
369     V[k] = 0.0;
370     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
371       for (CeedInt i = 0; i < P_1D; i++) {
372         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
373       }
374     }
375   }
376 }
377 
378 //------------------------------------------------------------------------------
379 // 3D transpose tensor contract z
380 //------------------------------------------------------------------------------
381 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
382 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
383   for (CeedInt k = 0; k < P_1D; k++) {
384     V[k] = 0.0;
385     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
386       for (CeedInt i = 0; i < Q_1D; i++) {
387         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
388       }
389     }
390   }
391 }
392 
393 //------------------------------------------------------------------------------
394 // 3D transpose tensor contract y
395 //------------------------------------------------------------------------------
396 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
397 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
398   CeedScalar r_B[Q_1D];
399   for (CeedInt i = 0; i < Q_1D; i++) {
400     r_B[i] = B[data.t_id_y + i * P_1D];
401   }
402 
403   for (CeedInt k = 0; k < P_1D; k++) {
404     __syncthreads();
405     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
406     __syncthreads();
407     V[k] = 0.0;
408     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
409       for (CeedInt i = 0; i < Q_1D; i++) {
410         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
411       }
412     }
413   }
414 }
415 
416 //------------------------------------------------------------------------------
417 // 3D transpose tensor contract y
418 //------------------------------------------------------------------------------
419 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
420 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
421   CeedScalar r_B[Q_1D];
422   for (CeedInt i = 0; i < Q_1D; i++) {
423     r_B[i] = B[data.t_id_y + i * P_1D];
424   }
425 
426   for (CeedInt k = 0; k < P_1D; k++) {
427     __syncthreads();
428     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
429     __syncthreads();
430     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
431       for (CeedInt i = 0; i < Q_1D; i++) {
432         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
433       }
434     }
435   }
436 }
437 
438 //------------------------------------------------------------------------------
439 // 3D transpose tensor contract x
440 //------------------------------------------------------------------------------
441 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
442 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
443   CeedScalar r_B[Q_1D];
444   for (CeedInt i = 0; i < Q_1D; i++) {
445     r_B[i] = B[data.t_id_x + i * P_1D];
446   }
447 
448   for (CeedInt k = 0; k < P_1D; k++) {
449     __syncthreads();
450     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
451     __syncthreads();
452     V[k] = 0.0;
453     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
454       for (CeedInt i = 0; i < Q_1D; i++) {
455         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
456       }
457     }
458   }
459 }
460 
461 //------------------------------------------------------------------------------
462 // 3D transpose tensor contract add x
463 //------------------------------------------------------------------------------
464 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
465 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
466   CeedScalar r_B[Q_1D];
467   for (CeedInt i = 0; i < Q_1D; i++) {
468     r_B[i] = B[data.t_id_x + i * P_1D];
469   }
470 
471   for (CeedInt k = 0; k < P_1D; k++) {
472     __syncthreads();
473     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
474     __syncthreads();
475     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
476       for (CeedInt i = 0; i < Q_1D; i++) {
477         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
478       }
479     }
480   }
481 }
482 
483 //------------------------------------------------------------------------------
484 // 3D interpolate to quadrature points
485 //------------------------------------------------------------------------------
486 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
487 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
488   CeedScalar r_t1[T_1D];
489   CeedScalar r_t2[T_1D];
490   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
491     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
492     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
493     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
494   }
495 }
496 
497 //------------------------------------------------------------------------------
498 // 3D interpolate transpose
499 //------------------------------------------------------------------------------
500 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
501 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
502                                                CeedScalar *__restrict__ r_V) {
503   CeedScalar r_t1[T_1D];
504   CeedScalar r_t2[T_1D];
505   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
506     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
507     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
508     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
509   }
510 }
511 
512 //------------------------------------------------------------------------------
513 // 3D interpolate to quadrature points, nodes and quadrature points collocated
514 //------------------------------------------------------------------------------
515 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
516 inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
517                                                      CeedScalar *__restrict__ r_V) {
518   for (CeedInt i = 0; i < Q_1D; i++) {
519     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
520       r_V[i + comp * Q_1D] = r_U[i + comp * P_1D];
521     }
522   }
523 }
524 
525 //------------------------------------------------------------------------------
526 // 3D interpolate transpose, nodes and quadrature points collocated
527 //------------------------------------------------------------------------------
528 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
529 inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
530                                                               CeedScalar *__restrict__ r_V) {
531   for (CeedInt i = 0; i < Q_1D; i++) {
532     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
533       r_V[i + comp * P_1D] = r_U[i + comp * Q_1D];
534     }
535   }
536 }
537 
538 //------------------------------------------------------------------------------
539 // 3D derivatives at quadrature points
540 //------------------------------------------------------------------------------
541 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
542 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
543                                     CeedScalar *__restrict__ r_V) {
544   CeedScalar r_t1[T_1D];
545   CeedScalar r_t2[T_1D];
546   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
547     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
548     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
549     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
550     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
551     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
552     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
553     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
554     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
555     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
556   }
557 }
558 
559 //------------------------------------------------------------------------------
560 // 3D derivatives transpose
561 //------------------------------------------------------------------------------
562 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
563 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
564                                              CeedScalar *__restrict__ r_V) {
565   CeedScalar r_t1[T_1D];
566   CeedScalar r_t2[T_1D];
567   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
568     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
569     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
570     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
571     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
572     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
573     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
574     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
575     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
576     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
577   }
578 }
579 
580 //------------------------------------------------------------------------------
581 // 3D derivatives at quadrature points
582 //------------------------------------------------------------------------------
583 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
584 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
585                                               CeedScalar *__restrict__ r_V) {
586   CeedScalar r_t1[T_1D];
587   CeedScalar r_t2[T_1D];
588   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
589     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
590     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
591     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
592     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
593     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
594     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
595   }
596 }
597 
598 //------------------------------------------------------------------------------
599 // 3D derivatives transpose
600 //------------------------------------------------------------------------------
601 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
602 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
603                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
604   CeedScalar r_t1[T_1D];
605   CeedScalar r_t2[T_1D];
606   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
607     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
608     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
609     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
610     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
611     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
612     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
613   }
614 }
615 
616 //------------------------------------------------------------------------------
617 // 3D derivatives at quadrature points, nodes and quadrature points collocated
618 //------------------------------------------------------------------------------
619 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
620 inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
621                                                    const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
622   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
623     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
624     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
625     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
626   }
627 }
628 
629 //------------------------------------------------------------------------------
630 // 3D derivatives transpose, nodes and quadrature points collocated
631 //------------------------------------------------------------------------------
632 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
633 inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
634                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
635   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
636     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
637     ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
638     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
639   }
640 }
641 
642 //------------------------------------------------------------------------------
643 // 3D quadrature weights
644 //------------------------------------------------------------------------------
645 template <int P_1D, int Q_1D>
646 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
647   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
648   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
649   for (CeedInt q = 0; q < Q_1D; q++) {
650     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
651   }
652 }
653