xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision 5a5594ffd3205667210320a8dc336d609d13e75f)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 
11 #include <ceed.h>
12 
13 //------------------------------------------------------------------------------
14 // 1D
15 //------------------------------------------------------------------------------
16 
17 //------------------------------------------------------------------------------
18 // 1D tensor contraction x
19 //------------------------------------------------------------------------------
20 template <int NUM_COMP, int P_1D, int Q_1D>
21 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
22   data.slice[data.t_id_x] = *U;
23   __syncthreads();
24   *V = 0.0;
25   if (data.t_id_x < Q_1D) {
26     for (CeedInt i = 0; i < P_1D; i++) {
27       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
28     }
29   }
30   __syncthreads();
31 }
32 
33 //------------------------------------------------------------------------------
34 // 1D transpose tensor contraction x
35 //------------------------------------------------------------------------------
36 template <int NUM_COMP, int P_1D, int Q_1D>
37 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
38   data.slice[data.t_id_x] = *U;
39   __syncthreads();
40   *V = 0.0;
41   if (data.t_id_x < P_1D) {
42     for (CeedInt i = 0; i < Q_1D; i++) {
43       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
44     }
45   }
46   __syncthreads();
47 }
48 
49 //------------------------------------------------------------------------------
50 // 1D interpolate to quadrature points
51 //------------------------------------------------------------------------------
52 template <int NUM_COMP, int P_1D, int Q_1D>
53 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
54   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
55     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
56   }
57 }
58 
59 //------------------------------------------------------------------------------
60 // 1D interpolate transpose
61 //------------------------------------------------------------------------------
62 template <int NUM_COMP, int P_1D, int Q_1D>
63 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
64                                          CeedScalar *__restrict__ r_V) {
65   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
66     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
67   }
68 }
69 
70 //------------------------------------------------------------------------------
71 // 1D derivatives at quadrature points
72 //------------------------------------------------------------------------------
73 template <int NUM_COMP, int P_1D, int Q_1D>
74 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
75                               CeedScalar *__restrict__ r_V) {
76   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
77     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
78   }
79 }
80 
81 //------------------------------------------------------------------------------
82 // 1D derivatives transpose
83 //------------------------------------------------------------------------------
84 template <int NUM_COMP, int P_1D, int Q_1D>
85 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
86                                        CeedScalar *__restrict__ r_V) {
87   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
88     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
89   }
90 }
91 
92 //------------------------------------------------------------------------------
93 // 1D quadrature weights
94 //------------------------------------------------------------------------------
95 template <int Q_1D>
96 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
97   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
98 }
99 
100 //------------------------------------------------------------------------------
101 // 2D
102 //------------------------------------------------------------------------------
103 
104 //------------------------------------------------------------------------------
105 // 2D tensor contraction x
106 //------------------------------------------------------------------------------
107 template <int NUM_COMP, int P_1D, int Q_1D>
108 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
109   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
110   __syncthreads();
111   *V = 0.0;
112   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
113     for (CeedInt i = 0; i < P_1D; i++) {
114       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
115     }
116   }
117   __syncthreads();
118 }
119 
120 //------------------------------------------------------------------------------
121 // 2D tensor contract y
122 //------------------------------------------------------------------------------
123 template <int NUM_COMP, int P_1D, int Q_1D>
124 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
125   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
126   __syncthreads();
127   *V = 0.0;
128   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
129     for (CeedInt i = 0; i < P_1D; i++) {
130       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
131     }
132   }
133   __syncthreads();
134 }
135 
136 //------------------------------------------------------------------------------
137 // 2D transpose tensor contract y
138 //------------------------------------------------------------------------------
139 template <int NUM_COMP, int P_1D, int Q_1D>
140 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
141   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
142   __syncthreads();
143   *V = 0.0;
144   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
145     for (CeedInt i = 0; i < Q_1D; i++) {
146       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
147     }
148   }
149   __syncthreads();
150 }
151 
152 //------------------------------------------------------------------------------
153 // 2D transpose tensor contract x
154 //------------------------------------------------------------------------------
155 template <int NUM_COMP, int P_1D, int Q_1D>
156 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
157   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
158   __syncthreads();
159   *V = 0.0;
160   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
161     for (CeedInt i = 0; i < Q_1D; i++) {
162       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
163     }
164   }
165   __syncthreads();
166 }
167 
168 //------------------------------------------------------------------------------
169 // 2D transpose tensor contract and add x
170 //------------------------------------------------------------------------------
171 template <int NUM_COMP, int P_1D, int Q_1D>
172 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
173   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
174   __syncthreads();
175   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
176     for (CeedInt i = 0; i < Q_1D; i++) {
177       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
178     }
179   }
180   __syncthreads();
181 }
182 
183 //------------------------------------------------------------------------------
184 // 2D interpolate to quadrature points
185 //------------------------------------------------------------------------------
186 template <int NUM_COMP, int P_1D, int Q_1D>
187 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
188                                       CeedScalar *__restrict__ r_V) {
189   CeedScalar r_t[1];
190   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
191     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
192     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
193   }
194 }
195 
196 //------------------------------------------------------------------------------
197 // 2D interpolate transpose
198 //------------------------------------------------------------------------------
199 template <int NUM_COMP, int P_1D, int Q_1D>
200 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
201                                                CeedScalar *__restrict__ r_V) {
202   CeedScalar r_t[1];
203   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
204     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
205     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
206   }
207 }
208 
209 //------------------------------------------------------------------------------
210 // 2D derivatives at quadrature points
211 //------------------------------------------------------------------------------
212 template <int NUM_COMP, int P_1D, int Q_1D>
213 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
214                                     CeedScalar *__restrict__ r_V) {
215   CeedScalar r_t[1];
216   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
217     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, r_t);
218     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
219     ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
220     ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
221   }
222 }
223 
224 //------------------------------------------------------------------------------
225 // 2D derivatives transpose
226 //------------------------------------------------------------------------------
227 template <int NUM_COMP, int P_1D, int Q_1D>
228 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
229                                              CeedScalar *__restrict__ r_V) {
230   CeedScalar r_t[1];
231   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
232     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
233     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp]);
234     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
235     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
236   }
237 }
238 
239 //------------------------------------------------------------------------------
240 // 2D quadrature weights
241 //------------------------------------------------------------------------------
242 template <int Q_1D>
243 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
244   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
245 }
246 
247 //------------------------------------------------------------------------------
248 // 3D
249 //------------------------------------------------------------------------------
250 
251 //------------------------------------------------------------------------------
252 // 3D tensor contract x
253 //------------------------------------------------------------------------------
254 template <int NUM_COMP, int P_1D, int Q_1D>
255 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
256   CeedScalar r_B[P_1D];
257   for (CeedInt i = 0; i < P_1D; i++) {
258     r_B[i] = B[i + data.t_id_x * P_1D];
259   }
260 
261   for (CeedInt k = 0; k < P_1D; k++) {
262     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
263     __syncthreads();
264     V[k] = 0.0;
265     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
266       for (CeedInt i = 0; i < P_1D; i++) {
267         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
268       }
269     }
270     __syncthreads();
271   }
272 }
273 
274 //------------------------------------------------------------------------------
275 // 3D tensor contract y
276 //------------------------------------------------------------------------------
277 template <int NUM_COMP, int P_1D, int Q_1D>
278 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
279   CeedScalar r_B[P_1D];
280   for (CeedInt i = 0; i < P_1D; i++) {
281     r_B[i] = B[i + data.t_id_y * P_1D];
282   }
283 
284   for (CeedInt k = 0; k < P_1D; k++) {
285     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
286     __syncthreads();
287     V[k] = 0.0;
288     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
289       for (CeedInt i = 0; i < P_1D; i++) {
290         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
291       }
292     }
293     __syncthreads();
294   }
295 }
296 
297 //------------------------------------------------------------------------------
298 // 3D tensor contract z
299 //------------------------------------------------------------------------------
300 template <int NUM_COMP, int P_1D, int Q_1D>
301 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
302   for (CeedInt k = 0; k < Q_1D; k++) {
303     V[k] = 0.0;
304     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
305       for (CeedInt i = 0; i < P_1D; i++) {
306         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
307       }
308     }
309   }
310 }
311 
312 //------------------------------------------------------------------------------
313 // 3D transpose tensor contract z
314 //------------------------------------------------------------------------------
315 template <int NUM_COMP, int P_1D, int Q_1D>
316 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
317   for (CeedInt k = 0; k < P_1D; k++) {
318     V[k] = 0.0;
319     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
320       for (CeedInt i = 0; i < Q_1D; i++) {
321         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
322       }
323     }
324   }
325 }
326 
327 //------------------------------------------------------------------------------
328 // 3D transpose tensor contract y
329 //------------------------------------------------------------------------------
330 template <int NUM_COMP, int P_1D, int Q_1D>
331 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
332   CeedScalar r_B[Q_1D];
333   for (CeedInt i = 0; i < Q_1D; i++) {
334     r_B[i] = B[data.t_id_y + i * P_1D];
335   }
336 
337   for (CeedInt k = 0; k < P_1D; k++) {
338     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
339     __syncthreads();
340     V[k] = 0.0;
341     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
342       for (CeedInt i = 0; i < Q_1D; i++) {
343         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
344       }
345     }
346     __syncthreads();
347   }
348 }
349 
350 //------------------------------------------------------------------------------
351 // 3D transpose tensor contract y
352 //------------------------------------------------------------------------------
353 template <int NUM_COMP, int P_1D, int Q_1D>
354 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
355   CeedScalar r_B[Q_1D];
356   for (CeedInt i = 0; i < Q_1D; i++) {
357     r_B[i] = B[data.t_id_y + i * P_1D];
358   }
359 
360   for (CeedInt k = 0; k < P_1D; k++) {
361     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
362     __syncthreads();
363     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
364       for (CeedInt i = 0; i < Q_1D; i++) {
365         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
366       }
367     }
368     __syncthreads();
369   }
370 }
371 
372 //------------------------------------------------------------------------------
373 // 3D transpose tensor contract x
374 //------------------------------------------------------------------------------
375 template <int NUM_COMP, int P_1D, int Q_1D>
376 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
377   CeedScalar r_B[Q_1D];
378   for (CeedInt i = 0; i < Q_1D; i++) {
379     r_B[i] = B[data.t_id_x + i * P_1D];
380   }
381 
382   for (CeedInt k = 0; k < P_1D; k++) {
383     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
384     __syncthreads();
385     V[k] = 0.0;
386     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
387       for (CeedInt i = 0; i < Q_1D; i++) {
388         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
389       }
390     }
391     __syncthreads();
392   }
393 }
394 
395 //------------------------------------------------------------------------------
396 // 3D transpose tensor contract add x
397 //------------------------------------------------------------------------------
398 template <int NUM_COMP, int P_1D, int Q_1D>
399 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
400   CeedScalar r_B[Q_1D];
401   for (CeedInt i = 0; i < Q_1D; i++) {
402     r_B[i] = B[data.t_id_x + i * P_1D];
403   }
404 
405   for (CeedInt k = 0; k < P_1D; k++) {
406     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
407     __syncthreads();
408     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
409       for (CeedInt i = 0; i < Q_1D; i++) {
410         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
411       }
412     }
413     __syncthreads();
414   }
415 }
416 
417 //------------------------------------------------------------------------------
418 // 3D interpolate to quadrature points
419 //------------------------------------------------------------------------------
420 template <int NUM_COMP, int P_1D, int Q_1D>
421 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
422                                       CeedScalar *__restrict__ r_V) {
423   CeedScalar r_t1[T_1D];
424   CeedScalar r_t2[T_1D];
425   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
426     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
427     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
428     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
429   }
430 }
431 
432 //------------------------------------------------------------------------------
433 // 3D interpolate transpose
434 //------------------------------------------------------------------------------
435 template <int NUM_COMP, int P_1D, int Q_1D>
436 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
437                                                CeedScalar *__restrict__ r_V) {
438   CeedScalar r_t1[T_1D];
439   CeedScalar r_t2[T_1D];
440   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
441     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
442     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
443     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
444   }
445 }
446 
447 //------------------------------------------------------------------------------
448 // 3D derivatives at quadrature points
449 //------------------------------------------------------------------------------
450 template <int NUM_COMP, int P_1D, int Q_1D>
451 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
452                                     CeedScalar *__restrict__ r_V) {
453   CeedScalar r_t1[T_1D];
454   CeedScalar r_t2[T_1D];
455   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
456     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
457     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
458     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
459     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
460     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
461     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
462     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
463     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
464     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
465   }
466 }
467 
468 //------------------------------------------------------------------------------
469 // 3D derivatives transpose
470 //------------------------------------------------------------------------------
471 template <int NUM_COMP, int P_1D, int Q_1D>
472 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
473                                              CeedScalar *__restrict__ r_V) {
474   CeedScalar r_t1[T_1D];
475   CeedScalar r_t2[T_1D];
476   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
477     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
478     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
479     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
480     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
481     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
482     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
483     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
484     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
485     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
486   }
487 }
488 
489 //------------------------------------------------------------------------------
490 // 3D derivatives at quadrature points
491 //------------------------------------------------------------------------------
492 template <int NUM_COMP, int P_1D, int Q_1D>
493 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
494                                               CeedScalar *__restrict__ r_V) {
495   CeedScalar r_t1[T_1D];
496   CeedScalar r_t2[T_1D];
497   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
498     ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
499     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
500     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
501     ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
502     ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
503     ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
504   }
505 }
506 
507 //------------------------------------------------------------------------------
508 // 3D derivatives transpose
509 //------------------------------------------------------------------------------
510 template <int NUM_COMP, int P_1D, int Q_1D>
511 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
512                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
513   CeedScalar r_t1[T_1D];
514   CeedScalar r_t2[T_1D];
515   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
516     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
517     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
518     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
519     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
520     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
521     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
522   }
523 }
524 
525 //------------------------------------------------------------------------------
526 // 3D quadrature weights
527 //------------------------------------------------------------------------------
528 template <int Q_1D>
529 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
530   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
531   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
532   for (CeedInt q = 0; q < Q_1D; q++) {
533     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
534   }
535 }
536