xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision edb2538e3dd6743c029967fc4e89c6fcafedb8c2)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 #ifndef CEED_CUDA_SHARED_BASIS_TENSOR_TEMPLATES_H
11 #define CEED_CUDA_SHARED_BASIS_TENSOR_TEMPLATES_H
12 
13 #include <ceed.h>
14 
15 //------------------------------------------------------------------------------
16 // 1D
17 //------------------------------------------------------------------------------
18 
19 //------------------------------------------------------------------------------
20 // 1D tensor contraction x
21 //------------------------------------------------------------------------------
22 template <int NUM_COMP, int P_1D, int Q_1D>
23 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
24   data.slice[data.t_id_x] = *U;
25   __syncthreads();
26   *V = 0.0;
27   if (data.t_id_x < Q_1D) {
28     for (CeedInt i = 0; i < P_1D; i++) {
29       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
30     }
31   }
32   __syncthreads();
33 }
34 
35 //------------------------------------------------------------------------------
36 // 1D transpose tensor contraction x
37 //------------------------------------------------------------------------------
38 template <int NUM_COMP, int P_1D, int Q_1D>
39 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
40   data.slice[data.t_id_x] = *U;
41   __syncthreads();
42   *V = 0.0;
43   if (data.t_id_x < P_1D) {
44     for (CeedInt i = 0; i < Q_1D; i++) {
45       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
46     }
47   }
48   __syncthreads();
49 }
50 
51 //------------------------------------------------------------------------------
52 // 1D interpolate to quadrature points
53 //------------------------------------------------------------------------------
54 template <int NUM_COMP, int P_1D, int Q_1D>
55 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
56   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
57     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
58   }
59 }
60 
61 //------------------------------------------------------------------------------
62 // 1D interpolate transpose
63 //------------------------------------------------------------------------------
64 template <int NUM_COMP, int P_1D, int Q_1D>
65 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
66                                          CeedScalar *__restrict__ r_V) {
67   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
68     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
69   }
70 }
71 
72 //------------------------------------------------------------------------------
73 // 1D derivatives at quadrature points
74 //------------------------------------------------------------------------------
75 template <int NUM_COMP, int P_1D, int Q_1D>
76 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
77                               CeedScalar *__restrict__ r_V) {
78   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
79     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
80   }
81 }
82 
83 //------------------------------------------------------------------------------
84 // 1D derivatives transpose
85 //------------------------------------------------------------------------------
86 template <int NUM_COMP, int P_1D, int Q_1D>
87 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
88                                        CeedScalar *__restrict__ r_V) {
89   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
90     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
91   }
92 }
93 
94 //------------------------------------------------------------------------------
95 // 1D quadrature weights
96 //------------------------------------------------------------------------------
97 template <int Q_1D>
98 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
99   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
100 }
101 
102 //------------------------------------------------------------------------------
103 // 2D
104 //------------------------------------------------------------------------------
105 
106 //------------------------------------------------------------------------------
107 // 2D tensor contraction x
108 //------------------------------------------------------------------------------
109 template <int NUM_COMP, int P_1D, int Q_1D>
110 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
111   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
112   __syncthreads();
113   *V = 0.0;
114   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
115     for (CeedInt i = 0; i < P_1D; i++) {
116       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
117     }
118   }
119   __syncthreads();
120 }
121 
122 //------------------------------------------------------------------------------
123 // 2D tensor contract y
124 //------------------------------------------------------------------------------
125 template <int NUM_COMP, int P_1D, int Q_1D>
126 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
127   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
128   __syncthreads();
129   *V = 0.0;
130   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
131     for (CeedInt i = 0; i < P_1D; i++) {
132       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
133     }
134   }
135   __syncthreads();
136 }
137 
138 //------------------------------------------------------------------------------
139 // 2D transpose tensor contract y
140 //------------------------------------------------------------------------------
141 template <int NUM_COMP, int P_1D, int Q_1D>
142 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
143   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
144   __syncthreads();
145   *V = 0.0;
146   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
147     for (CeedInt i = 0; i < Q_1D; i++) {
148       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
149     }
150   }
151   __syncthreads();
152 }
153 
154 //------------------------------------------------------------------------------
155 // 2D transpose tensor contract x
156 //------------------------------------------------------------------------------
157 template <int NUM_COMP, int P_1D, int Q_1D>
158 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
159   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
160   __syncthreads();
161   *V = 0.0;
162   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
163     for (CeedInt i = 0; i < Q_1D; i++) {
164       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
165     }
166   }
167   __syncthreads();
168 }
169 
170 //------------------------------------------------------------------------------
171 // 2D transpose tensor contract and add x
172 //------------------------------------------------------------------------------
173 template <int NUM_COMP, int P_1D, int Q_1D>
174 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
175   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
176   __syncthreads();
177   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
178     for (CeedInt i = 0; i < Q_1D; i++) {
179       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
180     }
181   }
182   __syncthreads();
183 }
184 
185 //------------------------------------------------------------------------------
186 // 2D interpolate to quadrature points
187 //------------------------------------------------------------------------------
188 template <int NUM_COMP, int P_1D, int Q_1D>
189 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
190                                       CeedScalar *__restrict__ r_V) {
191   CeedScalar r_t[1];
192   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
193     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
194     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
195   }
196 }
197 
198 //------------------------------------------------------------------------------
199 // 2D interpolate transpose
200 //------------------------------------------------------------------------------
201 template <int NUM_COMP, int P_1D, int Q_1D>
202 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
203                                                CeedScalar *__restrict__ r_V) {
204   CeedScalar r_t[1];
205   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
206     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
207     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
208   }
209 }
210 
211 //------------------------------------------------------------------------------
212 // 2D derivatives at quadrature points
213 //------------------------------------------------------------------------------
214 template <int NUM_COMP, int P_1D, int Q_1D>
215 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
216                                     CeedScalar *__restrict__ r_V) {
217   CeedScalar r_t[1];
218   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
219     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t);
220     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0 * NUM_COMP);
221     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
222     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1 * NUM_COMP);
223   }
224 }
225 
226 //------------------------------------------------------------------------------
227 // 2D derivatives transpose
228 //------------------------------------------------------------------------------
229 template <int NUM_COMP, int P_1D, int Q_1D>
230 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
231                                              CeedScalar *__restrict__ r_V) {
232   CeedScalar r_t[1];
233   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
234     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0 * NUM_COMP, c_B, r_t);
235     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp);
236     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1 * NUM_COMP, c_G, r_t);
237     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
238   }
239 }
240 
241 //------------------------------------------------------------------------------
242 // 2D quadrature weights
243 //------------------------------------------------------------------------------
244 template <int Q_1D>
245 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
246   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
247 }
248 
249 //------------------------------------------------------------------------------
250 // 3D
251 //------------------------------------------------------------------------------
252 
253 //------------------------------------------------------------------------------
254 // 3D tensor contract x
255 //------------------------------------------------------------------------------
256 template <int NUM_COMP, int P_1D, int Q_1D>
257 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
258   CeedScalar r_B[P_1D];
259   for (CeedInt i = 0; i < P_1D; i++) {
260     r_B[i] = B[i + data.t_id_x * P_1D];
261   }
262 
263   for (CeedInt k = 0; k < P_1D; k++) {
264     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
265     __syncthreads();
266     V[k] = 0.0;
267     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
268       for (CeedInt i = 0; i < P_1D; i++) {
269         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
270       }
271     }
272     __syncthreads();
273   }
274 }
275 
276 //------------------------------------------------------------------------------
277 // 3D tensor contract y
278 //------------------------------------------------------------------------------
279 template <int NUM_COMP, int P_1D, int Q_1D>
280 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
281   CeedScalar r_B[P_1D];
282   for (CeedInt i = 0; i < P_1D; i++) {
283     r_B[i] = B[i + data.t_id_y * P_1D];
284   }
285 
286   for (CeedInt k = 0; k < P_1D; k++) {
287     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
288     __syncthreads();
289     V[k] = 0.0;
290     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
291       for (CeedInt i = 0; i < P_1D; i++) {
292         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
293       }
294     }
295     __syncthreads();
296   }
297 }
298 
299 //------------------------------------------------------------------------------
300 // 3D tensor contract z
301 //------------------------------------------------------------------------------
302 template <int NUM_COMP, int P_1D, int Q_1D>
303 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
304   for (CeedInt k = 0; k < Q_1D; k++) {
305     V[k] = 0.0;
306     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
307       for (CeedInt i = 0; i < P_1D; i++) {
308         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
309       }
310     }
311   }
312 }
313 
314 //------------------------------------------------------------------------------
315 // 3D transpose tensor contract z
316 //------------------------------------------------------------------------------
317 template <int NUM_COMP, int P_1D, int Q_1D>
318 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
319   for (CeedInt k = 0; k < P_1D; k++) {
320     V[k] = 0.0;
321     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
322       for (CeedInt i = 0; i < Q_1D; i++) {
323         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
324       }
325     }
326   }
327 }
328 
329 //------------------------------------------------------------------------------
330 // 3D transpose tensor contract y
331 //------------------------------------------------------------------------------
332 template <int NUM_COMP, int P_1D, int Q_1D>
333 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
334   CeedScalar r_B[Q_1D];
335   for (CeedInt i = 0; i < Q_1D; i++) {
336     r_B[i] = B[data.t_id_y + i * P_1D];
337   }
338 
339   for (CeedInt k = 0; k < P_1D; k++) {
340     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
341     __syncthreads();
342     V[k] = 0.0;
343     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
344       for (CeedInt i = 0; i < Q_1D; i++) {
345         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
346       }
347     }
348     __syncthreads();
349   }
350 }
351 
352 //------------------------------------------------------------------------------
353 // 3D transpose tensor contract y
354 //------------------------------------------------------------------------------
355 template <int NUM_COMP, int P_1D, int Q_1D>
356 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
357   CeedScalar r_B[Q_1D];
358   for (CeedInt i = 0; i < Q_1D; i++) {
359     r_B[i] = B[data.t_id_y + i * P_1D];
360   }
361 
362   for (CeedInt k = 0; k < P_1D; k++) {
363     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
364     __syncthreads();
365     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
366       for (CeedInt i = 0; i < Q_1D; i++) {
367         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
368       }
369     }
370     __syncthreads();
371   }
372 }
373 
374 //------------------------------------------------------------------------------
375 // 3D transpose tensor contract x
376 //------------------------------------------------------------------------------
377 template <int NUM_COMP, int P_1D, int Q_1D>
378 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
379   CeedScalar r_B[Q_1D];
380   for (CeedInt i = 0; i < Q_1D; i++) {
381     r_B[i] = B[data.t_id_x + i * P_1D];
382   }
383 
384   for (CeedInt k = 0; k < P_1D; k++) {
385     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
386     __syncthreads();
387     V[k] = 0.0;
388     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
389       for (CeedInt i = 0; i < Q_1D; i++) {
390         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
391       }
392     }
393     __syncthreads();
394   }
395 }
396 
397 //------------------------------------------------------------------------------
398 // 3D transpose tensor contract add x
399 //------------------------------------------------------------------------------
400 template <int NUM_COMP, int P_1D, int Q_1D>
401 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
402   CeedScalar r_B[Q_1D];
403   for (CeedInt i = 0; i < Q_1D; i++) {
404     r_B[i] = B[data.t_id_x + i * P_1D];
405   }
406 
407   for (CeedInt k = 0; k < P_1D; k++) {
408     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
409     __syncthreads();
410     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
411       for (CeedInt i = 0; i < Q_1D; i++) {
412         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
413       }
414     }
415     __syncthreads();
416   }
417 }
418 
419 //------------------------------------------------------------------------------
420 // 3D interpolate to quadrature points
421 //------------------------------------------------------------------------------
422 template <int NUM_COMP, int P_1D, int Q_1D>
423 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
424                                       CeedScalar *__restrict__ r_V) {
425   CeedScalar r_t1[T_1D];
426   CeedScalar r_t2[T_1D];
427   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
428     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
429     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
430     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D);
431   }
432 }
433 
434 //------------------------------------------------------------------------------
435 // 3D interpolate transpose
436 //------------------------------------------------------------------------------
437 template <int NUM_COMP, int P_1D, int Q_1D>
438 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
439                                                CeedScalar *__restrict__ r_V) {
440   CeedScalar r_t1[T_1D];
441   CeedScalar r_t2[T_1D];
442   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
443     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D, c_B, r_t1);
444     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
445     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
446   }
447 }
448 
449 //------------------------------------------------------------------------------
450 // 3D derivatives at quadrature points
451 //------------------------------------------------------------------------------
452 template <int NUM_COMP, int P_1D, int Q_1D>
453 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
454                                     CeedScalar *__restrict__ r_V) {
455   CeedScalar r_t1[T_1D];
456   CeedScalar r_t2[T_1D];
457   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
458     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_G, r_t1);
459     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
460     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
461     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
462     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
463     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
464     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
465     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
466     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
467   }
468 }
469 
470 //------------------------------------------------------------------------------
471 // 3D derivatives transpose
472 //------------------------------------------------------------------------------
473 template <int NUM_COMP, int P_1D, int Q_1D>
474 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
475                                              CeedScalar *__restrict__ r_V) {
476   CeedScalar r_t1[T_1D];
477   CeedScalar r_t2[T_1D];
478   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
479     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1);
480     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
481     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * P_1D);
482     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1);
483     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
484     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
485     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1);
486     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
487     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
488   }
489 }
490 
491 //------------------------------------------------------------------------------
492 // 3D derivatives at quadrature points
493 //------------------------------------------------------------------------------
494 template <int NUM_COMP, int P_1D, int Q_1D>
495 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
496                                               CeedScalar *__restrict__ r_V) {
497   CeedScalar r_t1[T_1D];
498   CeedScalar r_t2[T_1D];
499   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
500     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
501     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
502     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
503     ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
504     ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
505     ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
506   }
507 }
508 
509 //------------------------------------------------------------------------------
510 // 3D derivatives transpose
511 //------------------------------------------------------------------------------
512 template <int NUM_COMP, int P_1D, int Q_1D>
513 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
514                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
515   CeedScalar r_t1[T_1D];
516   CeedScalar r_t2[T_1D];
517   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
518     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2);
519     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2);
520     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2);
521     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
522     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
523     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
524   }
525 }
526 
527 //------------------------------------------------------------------------------
528 // 3D quadrature weights
529 //------------------------------------------------------------------------------
530 template <int Q_1D>
531 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
532   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
533   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
534   for (CeedInt q = 0; q < Q_1D; q++) {
535     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
536   }
537 }
538 
539 //------------------------------------------------------------------------------
540 
541 #endif  // CEED_CUDA_SHARED_BASIS_TENSOR_TEMPLATES_H
542