xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h (revision 947f93aa7135eb1759bf2866bd2fbd481436b113)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis templates
10 #ifndef _ceed_hip_shared_basis_tensor_templates_h
11 #define _ceed_hip_shared_basis_tensor_templates_h
12 
13 #include <ceed.h>
14 
15 //------------------------------------------------------------------------------
16 // 1D
17 //------------------------------------------------------------------------------
18 
19 //------------------------------------------------------------------------------
20 // 1D tensor contraction x
21 //------------------------------------------------------------------------------
22 template <int NUM_COMP, int P_1D, int Q_1D>
23 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
24   data.slice[data.t_id_x] = *U;
25   __syncthreads();
26   *V = 0.0;
27   if (data.t_id_x < Q_1D) {
28     for (CeedInt i = 0; i < P_1D; i++) {
29       *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction
30     }
31   }
32   __syncthreads();
33 }
34 
35 //------------------------------------------------------------------------------
36 // 1D transpose tensor contraction x
37 //------------------------------------------------------------------------------
38 template <int NUM_COMP, int P_1D, int Q_1D>
39 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
40   data.slice[data.t_id_x] = *U;
41   __syncthreads();
42   *V = 0.0;
43   if (data.t_id_x < P_1D) {
44     for (CeedInt i = 0; i < Q_1D; i++) {
45       *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction
46     }
47   }
48   __syncthreads();
49 }
50 
51 //------------------------------------------------------------------------------
52 // 1D interpolate to quadrature points
53 //------------------------------------------------------------------------------
54 template <int NUM_COMP, int P_1D, int Q_1D>
55 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
56   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
57     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
58   }
59 }
60 
61 //------------------------------------------------------------------------------
62 // 1D interpolate transpose
63 //------------------------------------------------------------------------------
64 template <int NUM_COMP, int P_1D, int Q_1D>
65 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
66   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
67     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
68   }
69 }
70 
71 //------------------------------------------------------------------------------
72 // 1D derivatives at quadrature points
73 //------------------------------------------------------------------------------
74 template <int NUM_COMP, int P_1D, int Q_1D>
75 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
76   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
77     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
78   }
79 }
80 
81 //------------------------------------------------------------------------------
82 // 1D derivatives transpose
83 //------------------------------------------------------------------------------
84 template <int NUM_COMP, int P_1D, int Q_1D>
85 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
86   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // 1D quadrature weights
93 //------------------------------------------------------------------------------
94 template <int Q_1D>
95 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
96   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
97 }
98 
99 //------------------------------------------------------------------------------
100 // 2D
101 //------------------------------------------------------------------------------
102 
103 //------------------------------------------------------------------------------
104 // 2D tensor contraction x
105 //------------------------------------------------------------------------------
106 template <int NUM_COMP, int P_1D, int Q_1D>
107 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
108   data.slice[data.t_id_x+data.t_id_y*T_1D] = *U;
109   __syncthreads();
110   *V = 0.0;
111   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
112     for (CeedInt i = 0; i < P_1D; i++) {
113       *V += B[i + data.t_id_x*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
114     }
115   }
116   __syncthreads();
117 }
118 
119 //------------------------------------------------------------------------------
120 // 2D tensor contract y
121 //------------------------------------------------------------------------------
122 template <int NUM_COMP, int P_1D, int Q_1D>
123 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
124   data.slice[data.t_id_x+data.t_id_y*T_1D] = *U;
125   __syncthreads();
126   *V = 0.0;
127   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
128     for (CeedInt i = 0; i < P_1D; i++) {
129       *V += B[i + data.t_id_y*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction
130     }
131   }
132   __syncthreads();
133 }
134 
135 //------------------------------------------------------------------------------
136 // 2D transpose tensor contract y
137 //------------------------------------------------------------------------------
138 template <int NUM_COMP, int P_1D, int Q_1D>
139 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
140   data.slice[data.t_id_x+data.t_id_y*T_1D] = *U;
141   __syncthreads();
142   *V = 0.0;
143   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
144     for (CeedInt i = 0; i < Q_1D; i++) {
145       *V += B[data.t_id_y + i*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction
146     }
147   }
148   __syncthreads();
149 }
150 
151 //------------------------------------------------------------------------------
152 // 2D transpose tensor contract x
153 //------------------------------------------------------------------------------
154 template <int NUM_COMP, int P_1D, int Q_1D>
155 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
156   data.slice[data.t_id_x+data.t_id_y*T_1D] = *U;
157   __syncthreads();
158   *V = 0.0;
159   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
160     for (CeedInt i = 0; i < Q_1D; i++) {
161       *V += B[data.t_id_x + i*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
162     }
163   }
164   __syncthreads();
165 }
166 
167 //------------------------------------------------------------------------------
168 // 2D transpose tensor contract and add x
169 //------------------------------------------------------------------------------
170 template <int NUM_COMP, int P_1D, int Q_1D>
171 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
172   data.slice[data.t_id_x+data.t_id_y*T_1D] = *U;
173   __syncthreads();
174   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
175     for (CeedInt i = 0; i < Q_1D; i++) {
176       *V += B[data.t_id_x + i*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
177     }
178   }
179   __syncthreads();
180 }
181 
182 //------------------------------------------------------------------------------
183 // 2D interpolate to quadrature points
184 //------------------------------------------------------------------------------
185 template <int NUM_COMP, int P_1D, int Q_1D>
186 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
187   CeedScalar r_t[1];
188   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
189     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
190     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
191   }
192 }
193 
194 //------------------------------------------------------------------------------
195 // 2D interpolate transpose
196 //------------------------------------------------------------------------------
197 template <int NUM_COMP, int P_1D, int Q_1D>
198 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
199   CeedScalar r_t[1];
200   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
201     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
202     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
203   }
204 }
205 
206 //------------------------------------------------------------------------------
207 // 2D derivatives at quadrature points
208 //------------------------------------------------------------------------------
209 template <int NUM_COMP, int P_1D, int Q_1D>
210 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
211   CeedScalar r_t[1];
212   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
213     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t);
214     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0*NUM_COMP);
215     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
216     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1*NUM_COMP);
217   }
218 }
219 
220 //------------------------------------------------------------------------------
221 // 2D derivatives transpose
222 //------------------------------------------------------------------------------
223 template <int NUM_COMP, int P_1D, int Q_1D>
224 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
225   CeedScalar r_t[1];
226   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
227     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0*NUM_COMP, c_B, r_t);
228     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp);
229     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1*NUM_COMP, c_G, r_t);
230     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
231   }
232 }
233 
234 //------------------------------------------------------------------------------
235 // 2D quadrature weights
236 //------------------------------------------------------------------------------
237 template <int Q_1D>
238 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
239   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ?
240         q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0;
241 }
242 
243 //------------------------------------------------------------------------------
244 // 3D
245 //------------------------------------------------------------------------------
246 
247 //------------------------------------------------------------------------------
248 // 3D tensor contract x
249 //------------------------------------------------------------------------------
250 template <int NUM_COMP, int P_1D, int Q_1D>
251 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
252   CeedScalar r_B[P_1D];
253   for (CeedInt i = 0; i < P_1D; i++) {
254     r_B[i] = B[i + data.t_id_x*P_1D];
255   }
256 
257   for (CeedInt k = 0; k < P_1D; k++) {
258     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
259     __syncthreads();
260     V[k] = 0.0;
261     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
262       for (CeedInt i = 0; i < P_1D; i++) {
263         V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
264       }
265     }
266     __syncthreads();
267   }
268 }
269 
270 //------------------------------------------------------------------------------
271 // 3D tensor contract y
272 //------------------------------------------------------------------------------
273 template <int NUM_COMP, int P_1D, int Q_1D>
274 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
275   CeedScalar r_B[P_1D];
276   for (CeedInt i = 0; i < P_1D; i++) {
277     r_B[i] = B[i + data.t_id_y*P_1D];
278   }
279 
280   for (CeedInt k = 0; k < P_1D; k++) {
281     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
282     __syncthreads();
283     V[k] = 0.0;
284     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
285       for (CeedInt i = 0; i < P_1D; i++) {
286         V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction
287       }
288     }
289     __syncthreads();
290   }
291 }
292 
293 //------------------------------------------------------------------------------
294 // 3D tensor contract z
295 //------------------------------------------------------------------------------
296 template <int NUM_COMP, int P_1D, int Q_1D>
297 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
298   for (CeedInt k = 0; k < Q_1D; k++) {
299     V[k] = 0.0;
300     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
301       for (CeedInt i = 0; i < P_1D; i++) {
302         V[k] += B[i + k*P_1D] * U[i]; // Contract z direction
303       }
304     }
305   }
306 }
307 
308 //------------------------------------------------------------------------------
309 // 3D transpose tensor contract z
310 //------------------------------------------------------------------------------
311 template <int NUM_COMP, int P_1D, int Q_1D>
312 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
313   for (CeedInt k = 0; k < P_1D; k++) {
314     V[k] = 0.0;
315     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
316       for (CeedInt i = 0; i < Q_1D; i++) {
317         V[k] += B[k + i*P_1D] * U[i]; // Contract z direction
318       }
319     }
320   }
321 }
322 
323 //------------------------------------------------------------------------------
324 // 3D transpose tensor contract y
325 //------------------------------------------------------------------------------
326 template <int NUM_COMP, int P_1D, int Q_1D>
327 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
328   CeedScalar r_B[Q_1D];
329   for (CeedInt i = 0; i < Q_1D; i++) {
330     r_B[i] = B[data.t_id_y + i*P_1D];
331   }
332 
333   for (CeedInt k = 0; k < P_1D; k++) {
334     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
335     __syncthreads();
336     V[k] = 0.0;
337     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
338       for (CeedInt i = 0; i < Q_1D; i++) {
339         V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction
340       }
341     }
342     __syncthreads();
343   }
344 }
345 
346 //------------------------------------------------------------------------------
347 // 3D transpose tensor contract y
348 //------------------------------------------------------------------------------
349 template <int NUM_COMP, int P_1D, int Q_1D>
350 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
351   CeedScalar r_B[Q_1D];
352   for (CeedInt i = 0; i < Q_1D; i++) {
353     r_B[i] = B[data.t_id_y + i*P_1D];
354   }
355 
356   for (CeedInt k = 0; k < P_1D; k++) {
357     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
358     __syncthreads();
359     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
360       for (CeedInt i = 0; i < Q_1D; i++) {
361         V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction
362       }
363     }
364     __syncthreads();
365   }
366 }
367 
368 //------------------------------------------------------------------------------
369 // 3D transpose tensor contract x
370 //------------------------------------------------------------------------------
371 template <int NUM_COMP, int P_1D, int Q_1D>
372 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
373   CeedScalar r_B[Q_1D];
374   for (CeedInt i = 0; i < Q_1D; i++) {
375     r_B[i] = B[data.t_id_x + i*P_1D];
376   }
377 
378   for (CeedInt k = 0; k < P_1D; k++) {
379     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
380     __syncthreads();
381     V[k] = 0.0;
382     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
383       for (CeedInt i = 0; i < Q_1D; i++) {
384         V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
385       }
386     }
387     __syncthreads();
388   }
389 }
390 
391 //------------------------------------------------------------------------------
392 // 3D transpose tensor contract add x
393 //------------------------------------------------------------------------------
394 template <int NUM_COMP, int P_1D, int Q_1D>
395 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
396   CeedScalar r_B[Q_1D];
397   for (CeedInt i = 0; i < Q_1D; i++) {
398     r_B[i] = B[data.t_id_x + i*P_1D];
399   }
400 
401   for (CeedInt k = 0; k < P_1D; k++) {
402     data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k];
403     __syncthreads();
404     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
405       for (CeedInt i = 0; i < Q_1D; i++) {
406         V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction
407       }
408     }
409     __syncthreads();
410   }
411 }
412 
413 //------------------------------------------------------------------------------
414 // 3D interpolate to quadrature points
415 //------------------------------------------------------------------------------
416 template <int NUM_COMP, int P_1D, int Q_1D>
417 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
418   CeedScalar r_t1[T_1D];
419   CeedScalar r_t2[T_1D];
420   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
421     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1);
422     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
423     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D);
424   }
425 }
426 
427 //------------------------------------------------------------------------------
428 // 3D interpolate transpose
429 //------------------------------------------------------------------------------
430 template <int NUM_COMP, int P_1D, int Q_1D>
431 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
432   CeedScalar r_t1[T_1D];
433   CeedScalar r_t2[T_1D];
434   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
435     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D, c_B, r_t1);
436     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
437     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D);
438   }
439 }
440 
441 //------------------------------------------------------------------------------
442 // 3D derivatives at quadrature points
443 //------------------------------------------------------------------------------
444 template <int NUM_COMP, int P_1D, int Q_1D>
445 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
446   CeedScalar r_t1[T_1D];
447   CeedScalar r_t2[T_1D];
448   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
449     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_G, r_t1);
450     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
451     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D);
452     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1);
453     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
454     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D);
455     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1);
456     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
457     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D);
458   }
459 }
460 
461 //------------------------------------------------------------------------------
462 // 3D derivatives transpose
463 //------------------------------------------------------------------------------
464 template <int NUM_COMP, int P_1D, int Q_1D>
465 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
466   CeedScalar r_t1[T_1D];
467   CeedScalar r_t2[T_1D];
468   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
469     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_B, r_t1);
470     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
471     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp*P_1D);
472     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_B, r_t1);
473     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
474     ContractTransposeAddX3d<NUM_COMP,P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D);
475     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t1);
476     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
477     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D);
478   }
479 }
480 
481 //------------------------------------------------------------------------------
482 // 3D derivatives at quadrature points
483 //------------------------------------------------------------------------------
484 template <int NUM_COMP, int P_1D, int Q_1D>
485 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
486   CeedScalar r_t1[T_1D];
487   CeedScalar r_t2[T_1D];
488   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
489     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1);
490     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
491     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
492     ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D);
493     ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D);
494     ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D);
495   }
496 }
497 
498 //------------------------------------------------------------------------------
499 // 3D derivatives transpose
500 //------------------------------------------------------------------------------
501 template <int NUM_COMP, int P_1D, int Q_1D>
502 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
503   CeedScalar r_t1[T_1D];
504   CeedScalar r_t2[T_1D];
505   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
506     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t2);
507     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_G, r_t2);
508     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_G, r_t2);
509     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
510     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
511     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D);
512   }
513 }
514 
515 //------------------------------------------------------------------------------
516 // 3D quadrature weights
517 //------------------------------------------------------------------------------
518 template <int Q_1D>
519 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
520   const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
521   const CeedScalar pw = quad ? q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0;
522   for (CeedInt q = 0; q < Q_1D; q++) {
523     w[q] = quad ? pw*q_weight_1d[q] : 0.0;
524   }
525 }
526 
527 //------------------------------------------------------------------------------
528 
529 #endif
530